Coverage for src/local_deep_research/database/models/library.py: 95%
306 statements (coverage.py v7.12.0, created at 2026-01-11 00:51 +0000)
"""
Library and document models - Unified architecture.
All documents (research downloads and user uploads) are stored in one table.
Collections organize documents, with "Library" as the default collection.
"""

import enum

from sqlalchemy import (
    JSON,
    Boolean,
    Column,
    Date,
    Enum,
    ForeignKey,
    Index,
    Integer,
    LargeBinary,
    String,
    Text,
    UniqueConstraint,
)
from sqlalchemy.orm import backref, relationship
from sqlalchemy_utc import UtcDateTime, utcnow

from .base import Base


class RAGIndexStatus(enum.Enum):
    """Status values for RAG indices."""

    ACTIVE = "active"
    REBUILDING = "rebuilding"
    DEPRECATED = "deprecated"


class DocumentStatus(enum.Enum):
    """Status values for document processing and downloads."""

    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"


class EmbeddingProvider(enum.Enum):
    """Embedding model provider types."""

    SENTENCE_TRANSFORMERS = "sentence_transformers"
    OLLAMA = "ollama"


class ExtractionMethod(str, enum.Enum):
    """Methods used to extract text from documents."""

    PDF_EXTRACTION = "pdf_extraction"
    NATIVE_API = "native_api"
    UNKNOWN = "unknown"


class ExtractionSource(str, enum.Enum):
    """Sources used for text extraction."""

    ARXIV_API = "arxiv_api"
    PUBMED_API = "pubmed_api"
    PDFPLUMBER = "pdfplumber"
    PDFPLUMBER_FALLBACK = "pdfplumber_fallback"
    LOCAL_PDF = "local_pdf"
    LEGACY_FILE = "legacy_file"


class ExtractionQuality(str, enum.Enum):
    """Quality levels for extracted text."""

    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"


class DistanceMetric(str, enum.Enum):
    """Distance metrics for vector similarity search."""

    COSINE = "cosine"
    L2 = "l2"
    DOT_PRODUCT = "dot_product"


class IndexType(str, enum.Enum):
    """FAISS index types for RAG."""

    FLAT = "flat"
    HNSW = "hnsw"
    IVF = "ivf"


class SplitterType(str, enum.Enum):
    """Text splitter types for chunking."""

    RECURSIVE = "recursive"
    SEMANTIC = "semantic"
    TOKEN = "token"
    SENTENCE = "sentence"


class PDFStorageMode(str, enum.Enum):
    """Storage modes for PDF files."""

    NONE = "none"  # Don't store PDFs, text-only
    FILESYSTEM = "filesystem"  # Store PDFs unencrypted on filesystem
    DATABASE = "database"  # Store PDFs encrypted in database


class SourceType(Base):
    """
    Document source types (research_download, user_upload, manual_entry, etc.).
    Normalized table for consistent categorization.
    """

    __tablename__ = "source_types"

    id = Column(String(36), primary_key=True)  # UUID
    name = Column(String(50), nullable=False, unique=True, index=True)
    display_name = Column(String(100), nullable=False)
    description = Column(Text)
    icon = Column(String(50))  # Icon name for UI

    # Timestamps
    created_at = Column(UtcDateTime, default=utcnow(), nullable=False)

    def __repr__(self):
        return (
            f"<SourceType(name='{self.name}', display='{self.display_name}')>"
        )


class UploadBatch(Base):
    """
    Tracks batches of user-uploaded files.
    Groups uploads for traceability and batch operations.
    """

    __tablename__ = "upload_batches"

    id = Column(String(36), primary_key=True)  # UUID
    collection_id = Column(
        String(36),
        ForeignKey("collections.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )
    uploaded_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    file_count = Column(Integer, default=0)
    total_size = Column(Integer, default=0)  # Total bytes

    # Relationships
    collection = relationship("Collection", backref="upload_batches")
    documents = relationship("Document", backref="upload_batch")

    def __repr__(self):
        return f"<UploadBatch(id='{self.id}', files={self.file_count}, size={self.total_size})>"


class Document(Base):
    """
    Unified document table for all documents (research downloads + user uploads).
    """

    __tablename__ = "documents"

    id = Column(String(36), primary_key=True)  # UUID as string

    # Source type (research_download, user_upload, etc.)
    source_type_id = Column(
        String(36),
        ForeignKey("source_types.id"),
        nullable=False,
        index=True,
    )

    # Link to original research resource (for research downloads) - nullable for uploads
    resource_id = Column(
        Integer,
        ForeignKey("research_resources.id", ondelete="CASCADE"),
        nullable=True,
        index=True,
    )

    # Link to research (for research downloads) - nullable for uploads
    research_id = Column(
        String(36),
        ForeignKey("research_history.id", ondelete="CASCADE"),
        nullable=True,
        index=True,
    )

    # Link to upload batch (for user uploads) - nullable for research downloads
    upload_batch_id = Column(
        String(36),
        ForeignKey("upload_batches.id", ondelete="SET NULL"),
        nullable=True,
        index=True,
    )

    # Document identification
    document_hash = Column(
        String(64), nullable=False, unique=True, index=True
    )  # SHA256 for deduplication
    original_url = Column(Text, nullable=True)  # Source URL (for downloads)
    filename = Column(String(500), nullable=True)  # Display name (for uploads)
    original_filename = Column(
        String(500), nullable=True
    )  # Original upload name

    # File information
    file_path = Column(
        Text, nullable=True
    )  # Path relative to library/uploads root
    file_size = Column(Integer, nullable=False)  # Size in bytes
    file_type = Column(String(50), nullable=False)  # pdf, txt, md, html, etc.
    mime_type = Column(String(100), nullable=True)  # MIME type

    # Content storage - text always stored in DB
    text_content = Column(
        Text, nullable=True
    )  # Extracted/uploaded text content

    # PDF storage mode (none, filesystem, database)
    storage_mode = Column(
        String(20), nullable=True, default="database"
    )  # PDFStorageMode value

    # Metadata
    title = Column(Text)  # Document title
    description = Column(Text)  # User description
    authors = Column(JSON)  # List of authors (for research papers)
    published_date = Column(Date, nullable=True)  # Publication date

    # Academic identifiers (for research papers)
    doi = Column(String(255), nullable=True, index=True)
    arxiv_id = Column(String(100), nullable=True, index=True)
    pmid = Column(String(50), nullable=True, index=True)
    pmcid = Column(String(50), nullable=True, index=True)
    isbn = Column(String(20), nullable=True)

    # Download/Upload information
    status = Column(
        Enum(
            DocumentStatus, values_callable=lambda obj: [e.value for e in obj]
        ),
        nullable=False,
        default=DocumentStatus.COMPLETED,
    )
    attempts = Column(Integer, default=1)
    error_message = Column(Text, nullable=True)
    processed_at = Column(UtcDateTime, nullable=False, default=utcnow())
    last_accessed = Column(UtcDateTime, nullable=True)

    # Text extraction metadata (for research downloads from PDFs)
    extraction_method = Column(
        String(50), nullable=True
    )  # pdf_extraction, native_api, etc.
    extraction_source = Column(
        String(50), nullable=True
    )  # arxiv_api, pdfplumber, etc.
    extraction_quality = Column(String(20), nullable=True)  # high, medium, low
    has_formatting_issues = Column(Boolean, default=False)
    has_encoding_issues = Column(Boolean, default=False)
    character_count = Column(Integer, nullable=True)
    word_count = Column(Integer, nullable=True)

    # Organization
    tags = Column(JSON)  # User-defined tags
    notes = Column(Text)  # User notes
    favorite = Column(Boolean, default=False)

    # Timestamps
    created_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    updated_at = Column(
        UtcDateTime, default=utcnow(), onupdate=utcnow(), nullable=False
    )

    # Relationships
    source_type = relationship("SourceType", backref="documents")
    resource = relationship("ResearchResource", backref="documents")
    research = relationship("ResearchHistory", backref="documents")
    collections = relationship(
        "DocumentCollection",
        back_populates="document",
        cascade="all, delete-orphan",
    )

    # Indexes for efficient queries
    __table_args__ = (
        Index("idx_source_type", "source_type_id", "status"),
        Index("idx_research_documents", "research_id", "status"),
        Index("idx_document_type", "file_type", "status"),
        Index("idx_document_hash", "document_hash"),
    )

    def __repr__(self):
        title_str = (
            self.title[:50]
            if self.title
            else (self.filename[:50] if self.filename else "Untitled")
        )
        return f"<Document(title='{title_str}', type={self.file_type}, size={self.file_size})>"
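

# Usage sketch (illustrative only, not part of this module): how a new document
# row might be created with the SHA256-based deduplication that the unique
# `document_hash` column is designed for. The `session` and `source_type_id`
# arguments are assumed to come from the caller; they are hypothetical here.
def _example_add_document(session, source_type_id: str, text: str, filename: str):
    import hashlib
    import uuid

    doc_hash = hashlib.sha256(text.encode("utf-8")).hexdigest()

    # Deduplicate on the unique document_hash before inserting.
    existing = (
        session.query(Document).filter_by(document_hash=doc_hash).one_or_none()
    )
    if existing is not None:
        return existing

    doc = Document(
        id=str(uuid.uuid4()),
        source_type_id=source_type_id,
        document_hash=doc_hash,
        filename=filename,
        file_size=len(text.encode("utf-8")),
        file_type="txt",
        text_content=text,
        status=DocumentStatus.COMPLETED,
    )
    session.add(doc)
    session.commit()
    return doc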


class DocumentBlob(Base):
    """
    Separate table for storing PDF binary content.
    SQLite best practice: keep BLOBs in a separate table for better query performance.
    Stored in the encrypted SQLCipher database for security.
    """

    __tablename__ = "document_blobs"

    # Primary key references Document.id
    document_id = Column(
        String(36),
        ForeignKey("documents.id", ondelete="CASCADE"),
        primary_key=True,
        nullable=False,
    )

    # Binary PDF content
    pdf_binary = Column(LargeBinary, nullable=False)

    # Hash for integrity verification
    blob_hash = Column(String(64), nullable=True, index=True)  # SHA256

    # Timestamps
    stored_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    last_accessed = Column(UtcDateTime, nullable=True)

    # Relationship
    document = relationship(
        "Document",
        backref=backref("blob", passive_deletes=True),
        passive_deletes=True,
    )

    def __repr__(self):
        size = len(self.pdf_binary) if self.pdf_binary else 0
        return f"<DocumentBlob(document_id='{self.document_id[:8]}...', size={size})>"
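

# Usage sketch (illustrative only): storing a PDF's bytes in the separate blob
# table with an integrity hash, as the DocumentBlob docstring describes. The
# `session` and `document_id` arguments are assumed to exist; this is not the
# project's actual storage code.
def _example_store_pdf_blob(session, document_id: str, pdf_bytes: bytes):
    import hashlib

    blob = DocumentBlob(
        document_id=document_id,
        pdf_binary=pdf_bytes,
        blob_hash=hashlib.sha256(pdf_bytes).hexdigest(),
    )
    session.merge(blob)  # primary key is document_id, so merge acts as upsert
    session.commit()
    return blob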


class Collection(Base):
    """
    Collections for organizing documents.
    'Library' is the default collection for research downloads.
    Users can create custom collections for organization.
    """

    __tablename__ = "collections"

    id = Column(String(36), primary_key=True)  # UUID as string
    name = Column(String(255), nullable=False)
    description = Column(Text)

    # Collection type (default_library, user_collection, linked_folder)
    collection_type = Column(String(50), default="user_collection")

    # Is this the default library collection?
    is_default = Column(Boolean, default=False)

    # Embedding model used for this collection (stored when first indexed)
    embedding_model = Column(
        String(100), nullable=True
    )  # e.g., 'all-MiniLM-L6-v2', 'nomic-embed-text:latest'
    embedding_model_type = Column(
        Enum(
            EmbeddingProvider,
            values_callable=lambda obj: [e.value for e in obj],
        ),
        nullable=True,
    )
    embedding_dimension = Column(Integer, nullable=True)  # Vector dimension
    chunk_size = Column(Integer, nullable=True)  # Chunk size used
    chunk_overlap = Column(Integer, nullable=True)  # Chunk overlap used

    # Advanced embedding configuration options (Issue #1054)
    splitter_type = Column(
        String(50), nullable=True
    )  # Splitter type: 'recursive', 'semantic', 'token', 'sentence'
    text_separators = Column(
        JSON, nullable=True
    )  # Text separators for chunking, e.g., ["\n\n", "\n", ". ", " ", ""]
    distance_metric = Column(
        String(50), nullable=True
    )  # Distance metric: 'cosine', 'l2', 'dot_product'
    normalize_vectors = Column(
        Boolean, nullable=True
    )  # Whether to normalize embeddings with L2
    index_type = Column(
        String(50), nullable=True
    )  # FAISS index type: 'flat', 'hnsw', 'ivf'

    # Timestamps
    created_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    updated_at = Column(
        UtcDateTime, default=utcnow(), onupdate=utcnow(), nullable=False
    )

    # Relationships
    document_links = relationship(
        "DocumentCollection",
        back_populates="collection",
        cascade="all, delete-orphan",
    )
    linked_folders = relationship(
        "CollectionFolder",
        back_populates="collection",
        cascade="all, delete-orphan",
    )

    def __repr__(self):
        return f"<Collection(id='{self.id}', name='{self.name}', type='{self.collection_type}')>"
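

# Usage sketch (illustrative only): recording the embedding configuration on a
# collection the first time it is indexed, so later re-indexing can reuse the
# same model and chunking parameters. All concrete values are hypothetical.
def _example_record_embedding_config(session, collection_id: str):
    collection = session.get(Collection, collection_id)
    if collection.embedding_model is None:  # only set on first indexing
        collection.embedding_model = "all-MiniLM-L6-v2"
        collection.embedding_model_type = EmbeddingProvider.SENTENCE_TRANSFORMERS
        collection.embedding_dimension = 384
        collection.chunk_size = 1000
        collection.chunk_overlap = 200
        collection.splitter_type = SplitterType.RECURSIVE.value
        collection.distance_metric = DistanceMetric.COSINE.value
        session.commit()
    return collection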


class DocumentCollection(Base):
    """
    Many-to-many relationship between documents and collections.
    Tracks indexing status per collection (documents can be in multiple collections).
    """

    __tablename__ = "document_collections"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # Foreign keys
    document_id = Column(
        String(36),
        ForeignKey("documents.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )
    collection_id = Column(
        String(36),
        ForeignKey("collections.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )

    # Indexing status (per collection!)
    indexed = Column(
        Boolean, default=False
    )  # Whether indexed for this collection
    chunk_count = Column(
        Integer, default=0
    )  # Number of chunks in this collection
    last_indexed_at = Column(UtcDateTime, nullable=True)

    # Timestamps
    added_at = Column(UtcDateTime, default=utcnow(), nullable=False)

    # Relationships
    document = relationship("Document", back_populates="collections")
    collection = relationship("Collection", back_populates="document_links")

    # Ensure one entry per document-collection pair
    __table_args__ = (
        UniqueConstraint(
            "document_id", "collection_id", name="uix_document_collection"
        ),
        Index("idx_collection_indexed", "collection_id", "indexed"),
    )

    def __repr__(self):
        return f"<DocumentCollection(doc_id={self.document_id}, coll_id={self.collection_id}, indexed={self.indexed})>"
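

# Usage sketch (illustrative only): linking a document into a collection and
# listing documents that still need indexing for that collection, using the
# per-collection `indexed` flag. `session` is an assumed SQLAlchemy session.
def _example_link_and_find_unindexed(session, document_id: str, collection_id: str):
    link = DocumentCollection(
        document_id=document_id,
        collection_id=collection_id,
        indexed=False,
    )
    session.add(link)
    session.commit()

    # Documents in this collection that have not been indexed yet.
    return (
        session.query(Document)
        .join(DocumentCollection, DocumentCollection.document_id == Document.id)
        .filter(
            DocumentCollection.collection_id == collection_id,
            DocumentCollection.indexed.is_(False),
        )
        .all()
    )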


class DocumentChunk(Base):
    """
    Universal chunk storage for RAG across all sources.
    Stores text chunks in encrypted database for semantic search.
    """

    __tablename__ = "document_chunks"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # Chunk identification
    chunk_hash = Column(
        String(64), nullable=False, index=True
    )  # SHA256 for deduplication

    # Source tracking - now points to unified Document table
    source_type = Column(
        String(20), nullable=False, index=True
    )  # 'document', 'folder_file'
    source_id = Column(
        String(36), nullable=True, index=True
    )  # Document.id (UUID as string)
    source_path = Column(
        Text, nullable=True
    )  # File path if local collection source
    collection_name = Column(
        String(100), nullable=False, index=True
    )  # collection_<uuid>

    # Chunk content (encrypted in SQLCipher DB)
    chunk_text = Column(Text, nullable=False)  # The actual chunk text
    chunk_index = Column(Integer, nullable=False)  # Position in source document
    start_char = Column(Integer, nullable=False)  # Start character position
    end_char = Column(Integer, nullable=False)  # End character position
    word_count = Column(Integer, nullable=False)  # Number of words in chunk

    # Embedding metadata
    embedding_id = Column(
        String(36), nullable=False, unique=True, index=True
    )  # UUID for FAISS vector mapping
    embedding_model = Column(
        String(100), nullable=False
    )  # e.g., 'all-MiniLM-L6-v2'
    embedding_model_type = Column(
        Enum(
            EmbeddingProvider,
            values_callable=lambda obj: [e.value for e in obj],
        ),
        nullable=False,
    )
    embedding_dimension = Column(Integer, nullable=True)  # Vector dimension

    # Document metadata (for context)
    document_title = Column(Text, nullable=True)  # Title of source document
    document_metadata = Column(
        JSON, nullable=True
    )  # Additional metadata from source

    # Timestamps
    created_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    last_accessed = Column(UtcDateTime, nullable=True)

    # Indexes for efficient queries
    __table_args__ = (
        UniqueConstraint(
            "chunk_hash", "collection_name", name="uix_chunk_collection"
        ),
        Index("idx_chunk_source", "source_type", "source_id"),
        Index("idx_chunk_collection", "collection_name", "created_at"),
        Index("idx_chunk_embedding", "embedding_id"),
    )

    def __repr__(self):
        return f"<DocumentChunk(collection='{self.collection_name}', source_type='{self.source_type}', index={self.chunk_index}, words={self.word_count})>"
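

# Usage sketch (illustrative only): mapping vector-search results back to chunk
# text. It is assumed here that a FAISS hit yields the `embedding_id` stored
# alongside each vector; the unique index on that column makes the lookup cheap.
def _example_chunks_for_hits(session, embedding_ids: list):
    return (
        session.query(DocumentChunk)
        .filter(DocumentChunk.embedding_id.in_(embedding_ids))
        .all()
    )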


class DownloadQueue(Base):
    """
    Queue for pending document downloads.
    Renamed from LibraryDownloadQueue for consistency.
    """

    __tablename__ = "download_queue"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # What to download
    resource_id = Column(
        Integer,
        ForeignKey("research_resources.id", ondelete="CASCADE"),
        nullable=False,
        unique=True,  # One queue entry per resource
    )
    research_id = Column(String(36), nullable=False, index=True)

    # Target collection (defaults to Library collection)
    collection_id = Column(
        String(36),
        ForeignKey("collections.id", ondelete="SET NULL"),
        nullable=True,
        index=True,
    )

    # Queue management
    priority = Column(Integer, default=0)  # Higher = more important
    status = Column(
        Enum(
            DocumentStatus, values_callable=lambda obj: [e.value for e in obj]
        ),
        nullable=False,
        default=DocumentStatus.PENDING,
    )
    attempts = Column(Integer, default=0)
    max_attempts = Column(Integer, default=3)

    # Error tracking
    last_error = Column(Text, nullable=True)
    last_attempt_at = Column(UtcDateTime, nullable=True)

    # Timestamps
    queued_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    completed_at = Column(UtcDateTime, nullable=True)

    # Relationships
    resource = relationship("ResearchResource", backref="download_queue")
    collection = relationship("Collection", backref="download_queue_items")

    def __repr__(self):
        return f"<DownloadQueue(resource_id={self.resource_id}, status={self.status}, attempts={self.attempts})>"
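

# Usage sketch (illustrative only): pulling the next pending download, highest
# priority first and oldest first within a priority, while respecting the
# attempts/max_attempts retry limit. Not the project's actual worker code.
def _example_next_download(session):
    return (
        session.query(DownloadQueue)
        .filter(
            DownloadQueue.status == DocumentStatus.PENDING,
            DownloadQueue.attempts < DownloadQueue.max_attempts,
        )
        .order_by(DownloadQueue.priority.desc(), DownloadQueue.queued_at.asc())
        .first()
    )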


class LibraryStatistics(Base):
    """
    Aggregate statistics for the library.
    Updated periodically for dashboard display.
    """

    __tablename__ = "library_statistics"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # Document counts
    total_documents = Column(Integer, default=0)
    total_pdfs = Column(Integer, default=0)
    total_html = Column(Integer, default=0)
    total_other = Column(Integer, default=0)

    # Storage metrics
    total_size_bytes = Column(Integer, default=0)
    average_document_size = Column(Integer, default=0)

    # Research metrics
    total_researches_with_downloads = Column(Integer, default=0)
    average_documents_per_research = Column(Integer, default=0)

    # Download metrics
    total_download_attempts = Column(Integer, default=0)
    successful_downloads = Column(Integer, default=0)
    failed_downloads = Column(Integer, default=0)
    pending_downloads = Column(Integer, default=0)

    # Academic sources breakdown
    arxiv_count = Column(Integer, default=0)
    pubmed_count = Column(Integer, default=0)
    doi_count = Column(Integer, default=0)
    other_count = Column(Integer, default=0)

    # Timestamps
    calculated_at = Column(UtcDateTime, default=utcnow(), nullable=False)

    def __repr__(self):
        return f"<LibraryStatistics(documents={self.total_documents}, size={self.total_size_bytes})>"


class RAGIndex(Base):
    """
    Tracks FAISS indices for RAG collections.
    Each collection+embedding_model combination has its own FAISS index.
    """

    __tablename__ = "rag_indices"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # Collection and model identification
    collection_name = Column(
        String(100), nullable=False, index=True
    )  # 'collection_<uuid>'
    embedding_model = Column(
        String(100), nullable=False
    )  # e.g., 'all-MiniLM-L6-v2'
    embedding_model_type = Column(
        Enum(
            EmbeddingProvider,
            values_callable=lambda obj: [e.value for e in obj],
        ),
        nullable=False,
    )
    embedding_dimension = Column(Integer, nullable=False)  # Vector dimension

    # Index file location
    index_path = Column(Text, nullable=False)  # Path to .faiss file
    index_hash = Column(
        String(64), nullable=False, unique=True, index=True
    )  # SHA256 of collection+model for uniqueness

    # Chunking parameters used
    chunk_size = Column(Integer, nullable=False)
    chunk_overlap = Column(Integer, nullable=False)

    # Advanced embedding configuration options (Issue #1054)
    splitter_type = Column(
        String(50), nullable=True
    )  # Splitter type: 'recursive', 'semantic', 'token', 'sentence'
    text_separators = Column(
        JSON, nullable=True
    )  # Text separators for chunking, e.g., ["\n\n", "\n", ". ", " ", ""]
    distance_metric = Column(
        String(50), nullable=True
    )  # Distance metric: 'cosine', 'l2', 'dot_product'
    normalize_vectors = Column(
        Boolean, nullable=True
    )  # Whether to normalize embeddings with L2
    index_type = Column(
        String(50), nullable=True
    )  # FAISS index type: 'flat', 'hnsw', 'ivf'

    # Index statistics
    chunk_count = Column(Integer, default=0)  # Number of chunks in this index
    total_documents = Column(Integer, default=0)  # Number of source documents

    # Status
    status = Column(
        Enum(
            RAGIndexStatus, values_callable=lambda obj: [e.value for e in obj]
        ),
        nullable=False,
        default=RAGIndexStatus.ACTIVE,
    )
    is_current = Column(
        Boolean, default=True
    )  # Whether this is the current index for this collection

    # Timestamps
    created_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    last_updated_at = Column(
        UtcDateTime, default=utcnow(), onupdate=utcnow(), nullable=False
    )
    last_used_at = Column(
        UtcDateTime, nullable=True
    )  # Last time index was searched

    # Ensure one active index per collection+model
    __table_args__ = (
        UniqueConstraint(
            "collection_name",
            "embedding_model",
            "embedding_model_type",
            name="uix_collection_model",
        ),
        Index("idx_collection_current", "collection_name", "is_current"),
    )

    def __repr__(self):
        return f"<RAGIndex(collection='{self.collection_name}', model='{self.embedding_model}', chunks={self.chunk_count})>"
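

# Usage sketch (illustrative only): locating the current FAISS index for a
# collection/model pair, relying on the `is_current` flag and the unique
# (collection_name, embedding_model, embedding_model_type) constraint.
def _example_current_index(session, collection_name: str, model: str):
    return (
        session.query(RAGIndex)
        .filter(
            RAGIndex.collection_name == collection_name,
            RAGIndex.embedding_model == model,
            RAGIndex.is_current.is_(True),
            RAGIndex.status == RAGIndexStatus.ACTIVE,
        )
        .one_or_none()
    )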


class RagDocumentStatus(Base):
    """
    Tracks which documents have been indexed for RAG.
    Row existence = document is indexed. No row = not indexed.
    Simple and avoids ORM caching issues.
    """

    __tablename__ = "rag_document_status"

    # Composite primary key
    document_id = Column(
        String(36),
        ForeignKey("documents.id", ondelete="CASCADE"),
        primary_key=True,
        nullable=False,
    )
    collection_id = Column(
        String(36),
        ForeignKey("collections.id", ondelete="CASCADE"),
        primary_key=True,
        nullable=False,
    )

    # Which RAG index was used (tracks embedding model indirectly)
    rag_index_id = Column(
        Integer,
        ForeignKey("rag_indices.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )

    # Metadata
    chunk_count = Column(Integer, nullable=False)
    indexed_at = Column(UtcDateTime, nullable=False, default=utcnow())

    # Indexes for fast lookups
    __table_args__ = (
        Index("idx_rag_status_collection", "collection_id"),
        Index("idx_rag_status_index", "rag_index_id"),
    )

    def __repr__(self):
        return f"<RagDocumentStatus(doc='{self.document_id[:8]}...', coll='{self.collection_id[:8]}...', chunks={self.chunk_count})>"
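

# Usage sketch (illustrative only): the "row existence = indexed" check the
# RagDocumentStatus docstring describes, using the composite primary key
# (document_id, collection_id) in declaration order.
def _example_is_indexed(session, document_id: str, collection_id: str) -> bool:
    return (
        session.get(RagDocumentStatus, (document_id, collection_id)) is not None
    )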


class CollectionFolder(Base):
    """
    Local folders linked to a collection for indexing.
    """

    __tablename__ = "collection_folders"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # Collection association
    collection_id = Column(
        String(36),
        ForeignKey("collections.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )

    # Folder configuration
    folder_path = Column(Text, nullable=False)  # Absolute path to folder
    include_patterns = Column(
        JSON, default=["*.pdf", "*.txt", "*.md", "*.html"]
    )  # File patterns to include
    exclude_patterns = Column(
        JSON
    )  # Patterns to exclude (e.g., ["**/node_modules/**"])
    recursive = Column(Boolean, default=True)  # Search subfolders

    # Monitoring
    watch_enabled = Column(
        Boolean, default=False
    )  # Auto-reindex on changes (future)
    last_scanned_at = Column(UtcDateTime, nullable=True)
    file_count = Column(Integer, default=0)  # Total files found
    indexed_file_count = Column(Integer, default=0)  # Files indexed

    # Timestamps
    created_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    updated_at = Column(
        UtcDateTime, default=utcnow(), onupdate=utcnow(), nullable=False
    )

    # Relationships
    collection = relationship("Collection", back_populates="linked_folders")
    files = relationship(
        "CollectionFolderFile",
        back_populates="folder",
        cascade="all, delete-orphan",
    )

    def __repr__(self):
        return f"<CollectionFolder(path='{self.folder_path}', files={self.file_count})>"


class CollectionFolderFile(Base):
    """
    Files found in linked folders.
    Lightweight tracking for deduplication and indexing status.
    """

    __tablename__ = "collection_folder_files"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # Folder association
    folder_id = Column(
        Integer,
        ForeignKey("collection_folders.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )

    # File identification
    relative_path = Column(Text, nullable=False)  # Path relative to folder_path
    file_hash = Column(String(64), index=True)  # SHA256 for deduplication
    file_size = Column(Integer)  # Size in bytes
    file_type = Column(String(50))  # Extension

    # File metadata
    last_modified = Column(UtcDateTime)  # File modification time

    # Indexing status
    indexed = Column(Boolean, default=False)
    chunk_count = Column(Integer, default=0)
    last_indexed_at = Column(UtcDateTime, nullable=True)
    index_error = Column(Text, nullable=True)  # Error if indexing failed

    # Timestamps
    discovered_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    updated_at = Column(
        UtcDateTime, default=utcnow(), onupdate=utcnow(), nullable=False
    )

    # Relationships
    folder = relationship("CollectionFolder", back_populates="files")

    # Ensure one entry per file in folder
    __table_args__ = (
        UniqueConstraint("folder_id", "relative_path", name="uix_folder_file"),
        Index("idx_folder_indexed", "folder_id", "indexed"),
    )

    def __repr__(self):
        return f"<CollectionFolderFile(path='{self.relative_path}', indexed={self.indexed})>"
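

# Usage sketch (illustrative only): recording a file discovered during a folder
# scan, deduplicating on (folder_id, relative_path) as enforced by the
# uix_folder_file constraint. `session` and the arguments are hypothetical.
def _example_track_folder_file(session, folder_id: int, relative_path: str, content: bytes):
    import hashlib

    file_row = (
        session.query(CollectionFolderFile)
        .filter_by(folder_id=folder_id, relative_path=relative_path)
        .one_or_none()
    )
    if file_row is None:
        file_row = CollectionFolderFile(
            folder_id=folder_id, relative_path=relative_path
        )
        session.add(file_row)

    file_row.file_hash = hashlib.sha256(content).hexdigest()
    file_row.file_size = len(content)
    session.commit()
    return file_row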