Coverage for src / local_deep_research / database / models / library.py: 99%
306 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
1"""
2Library and document models - Unified architecture.
3All documents (research downloads and user uploads) are stored in one table.
4Collections organize documents, with "Library" as the default collection.
5"""
7import enum
9from sqlalchemy import (
10 JSON,
11 Boolean,
12 Column,
13 Date,
14 Enum,
15 ForeignKey,
16 Index,
17 Integer,
18 LargeBinary,
19 String,
20 Text,
21 UniqueConstraint,
22)
23from sqlalchemy.orm import backref, relationship
24from sqlalchemy_utc import UtcDateTime, utcnow
26from .base import Base
class RAGIndexStatus(enum.Enum):
    """Status values for RAG indices.

    Persisted as the string value via the ``Enum(..., values_callable=...)``
    column on ``RAGIndex.status`` below.
    """

    ACTIVE = "active"  # default for new indices (see RAGIndex.status)
    REBUILDING = "rebuilding"  # index is being regenerated
    DEPRECATED = "deprecated"  # superseded; kept for reference
class DocumentStatus(enum.Enum):
    """Status values for document processing and downloads.

    Shared by ``Document.status`` (default COMPLETED) and
    ``DownloadQueue.status`` (default PENDING); stored as the string value.
    """

    PENDING = "pending"  # queued, not yet attempted
    PROCESSING = "processing"  # download/extraction in progress
    COMPLETED = "completed"  # finished successfully
    FAILED = "failed"  # gave up; see error_message / last_error
class EmbeddingProvider(enum.Enum):
    """Embedding model provider types.

    Used by the ``embedding_model_type`` Enum columns on Collection,
    DocumentChunk and RAGIndex; stored as the string value.
    """

    SENTENCE_TRANSFORMERS = "sentence_transformers"
    OLLAMA = "ollama"
class ExtractionMethod(str, enum.Enum):
    """Methods used to extract text from documents.

    str mixin so members compare equal to their raw values; these strings
    are the values kept in the plain ``Document.extraction_method`` column.
    """

    PDF_EXTRACTION = "pdf_extraction"  # text pulled out of a PDF file
    NATIVE_API = "native_api"  # text provided directly by a source API
    UNKNOWN = "unknown"
class ExtractionSource(str, enum.Enum):
    """Sources used for text extraction.

    str mixin; values correspond to the plain ``Document.extraction_source``
    String column (e.g. 'arxiv_api', 'pdfplumber').
    """

    ARXIV_API = "arxiv_api"
    PUBMED_API = "pubmed_api"
    PDFPLUMBER = "pdfplumber"
    PDFPLUMBER_FALLBACK = "pdfplumber_fallback"  # used after a primary method failed
    LOCAL_PDF = "local_pdf"
    LEGACY_FILE = "legacy_file"
class ExtractionQuality(str, enum.Enum):
    """Quality levels for extracted text.

    str mixin; values correspond to the plain ``Document.extraction_quality``
    String column ('high', 'medium', 'low').
    """

    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"
class DistanceMetric(str, enum.Enum):
    """Distance metrics for vector similarity search.

    str mixin; values correspond to the plain ``distance_metric`` String
    columns on Collection and RAGIndex.
    """

    COSINE = "cosine"
    L2 = "l2"
    DOT_PRODUCT = "dot_product"
class IndexType(str, enum.Enum):
    """FAISS index types for RAG.

    str mixin; values correspond to the plain ``index_type`` String columns
    on Collection and RAGIndex ('flat', 'hnsw', 'ivf').
    """

    FLAT = "flat"
    HNSW = "hnsw"
    IVF = "ivf"
class SplitterType(str, enum.Enum):
    """Text splitter types for chunking.

    str mixin; values correspond to the plain ``splitter_type`` String
    columns on Collection and RAGIndex.
    """

    RECURSIVE = "recursive"
    SEMANTIC = "semantic"
    TOKEN = "token"
    SENTENCE = "sentence"
class PDFStorageMode(str, enum.Enum):
    """Storage modes for PDF files.

    str mixin; values correspond to the plain ``Document.storage_mode``
    String column (default 'database').
    """

    NONE = "none"  # Don't store PDFs, text-only
    FILESYSTEM = "filesystem"  # Store PDFs unencrypted on filesystem
    DATABASE = "database"  # Store PDFs encrypted in database
class SourceType(Base):
    """
    Document source types (research_download, user_upload, manual_entry, etc.).
    Normalized table for consistent categorization.

    Referenced by ``Document.source_type_id`` (non-nullable FK), so every
    document carries exactly one source type.
    """

    __tablename__ = "source_types"

    id = Column(String(36), primary_key=True)  # UUID
    # Machine-readable name, e.g. 'research_download'; unique lookup key
    name = Column(String(50), nullable=False, unique=True, index=True)
    # Human-readable label shown in the UI
    display_name = Column(String(100), nullable=False)
    description = Column(Text)
    icon = Column(String(50))  # Icon name for UI

    # Timestamps
    created_at = Column(UtcDateTime, default=utcnow(), nullable=False)

    def __repr__(self):
        return (
            f"<SourceType(name='{self.name}', display='{self.display_name}')>"
        )
class UploadBatch(Base):
    """
    Tracks batches of user-uploaded files.
    Groups uploads for traceability and batch operations.

    Documents reference their batch via ``Document.upload_batch_id``
    (SET NULL on delete, so documents survive batch deletion).
    """

    __tablename__ = "upload_batches"

    id = Column(String(36), primary_key=True)  # UUID
    # Target collection; batch rows are removed when the collection is deleted
    collection_id = Column(
        String(36),
        ForeignKey("collections.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )
    uploaded_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    file_count = Column(Integer, default=0)  # Number of files in the batch
    total_size = Column(Integer, default=0)  # Total bytes

    # Relationships
    collection = relationship("Collection", backref="upload_batches")
    documents = relationship("Document", backref="upload_batch")

    def __repr__(self):
        return f"<UploadBatch(id='{self.id}', files={self.file_count}, size={self.total_size})>"
class Document(Base):
    """
    Unified document table for all documents (research downloads + user uploads).

    Origin is recorded via ``source_type_id``; the three linkage columns
    (``resource_id``/``research_id`` for research downloads,
    ``upload_batch_id`` for user uploads) are all nullable, so only the
    ones relevant to a document's origin are populated.
    Deduplicated by ``document_hash`` (unique SHA256).
    """

    __tablename__ = "documents"

    id = Column(String(36), primary_key=True)  # UUID as string

    # Source type (research_download, user_upload, etc.)
    source_type_id = Column(
        String(36),
        ForeignKey("source_types.id"),
        nullable=False,
        index=True,
    )

    # Link to original research resource (for research downloads) - nullable for uploads
    resource_id = Column(
        Integer,
        ForeignKey("research_resources.id", ondelete="SET NULL"),
        nullable=True,
        index=True,
    )

    # Link to research (for research downloads) - nullable for uploads
    research_id = Column(
        String(36),
        ForeignKey("research_history.id", ondelete="CASCADE"),
        nullable=True,
        index=True,
    )

    # Link to upload batch (for user uploads) - nullable for research downloads
    upload_batch_id = Column(
        String(36),
        ForeignKey("upload_batches.id", ondelete="SET NULL"),
        nullable=True,
        index=True,
    )

    # Document identification
    document_hash = Column(
        String(64), nullable=False, unique=True, index=True
    )  # SHA256 for deduplication
    original_url = Column(Text, nullable=True)  # Source URL (for downloads)
    filename = Column(String(500), nullable=True)  # Display name (for uploads)
    original_filename = Column(
        String(500), nullable=True
    )  # Original upload name

    # File information
    file_path = Column(
        Text, nullable=True
    )  # Path relative to library/uploads root
    file_size = Column(Integer, nullable=False)  # Size in bytes
    file_type = Column(String(50), nullable=False)  # pdf, txt, md, html, etc.
    mime_type = Column(String(100), nullable=True)  # MIME type

    # Content storage - text always stored in DB
    text_content = Column(
        Text, nullable=True
    )  # Extracted/uploaded text content

    # PDF storage mode (none, filesystem, database)
    storage_mode = Column(
        String(20), nullable=True, default="database"
    )  # PDFStorageMode value

    # Metadata
    title = Column(Text)  # Document title
    description = Column(Text)  # User description
    authors = Column(JSON)  # List of authors (for research papers)
    published_date = Column(Date, nullable=True)  # Publication date

    # Academic identifiers (for research papers)
    doi = Column(String(255), nullable=True, index=True)
    arxiv_id = Column(String(100), nullable=True, index=True)
    pmid = Column(String(50), nullable=True, index=True)
    pmcid = Column(String(50), nullable=True, index=True)
    isbn = Column(String(20), nullable=True)

    # Download/Upload information
    # Stored as the enum's string value; defaults to COMPLETED since most
    # rows are created after successful processing.
    status = Column(
        Enum(
            DocumentStatus, values_callable=lambda obj: [e.value for e in obj]
        ),
        nullable=False,
        default=DocumentStatus.COMPLETED,
    )
    attempts = Column(Integer, default=1)  # Download/processing attempts made
    error_message = Column(Text, nullable=True)  # Last failure reason, if any
    processed_at = Column(UtcDateTime, nullable=False, default=utcnow())
    last_accessed = Column(UtcDateTime, nullable=True)

    # Text extraction metadata (for research downloads from PDFs)
    extraction_method = Column(
        String(50), nullable=True
    )  # pdf_extraction, native_api, etc. (see ExtractionMethod)
    extraction_source = Column(
        String(50), nullable=True
    )  # arxiv_api, pdfplumber, etc. (see ExtractionSource)
    extraction_quality = Column(String(20), nullable=True)  # high, medium, low
    has_formatting_issues = Column(Boolean, default=False)
    has_encoding_issues = Column(Boolean, default=False)
    character_count = Column(Integer, nullable=True)
    word_count = Column(Integer, nullable=True)

    # Organization
    tags = Column(JSON)  # User-defined tags
    notes = Column(Text)  # User notes
    favorite = Column(Boolean, default=False)

    # Timestamps
    created_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    updated_at = Column(
        UtcDateTime, default=utcnow(), onupdate=utcnow(), nullable=False
    )

    # Relationships
    source_type = relationship("SourceType", backref="documents")
    resource = relationship(
        "ResearchResource",
        foreign_keys="[Document.resource_id]",
        backref="documents",
    )
    research = relationship("ResearchHistory", backref="documents")
    # Association rows (DocumentCollection); deleted with the document
    collections = relationship(
        "DocumentCollection",
        back_populates="document",
        cascade="all, delete-orphan",
    )

    # Indexes for efficient queries
    __table_args__ = (
        Index("idx_source_type", "source_type_id", "status"),
        Index("idx_research_documents", "research_id", "status"),
        Index("idx_document_type", "file_type", "status"),
        Index("idx_document_hash", "document_hash"),
    )

    def __repr__(self):
        # Prefer title, then filename, both truncated to 50 chars
        title_str = (
            self.title[:50]
            if self.title
            else (self.filename[:50] if self.filename else "Untitled")
        )
        return f"<Document(title='{title_str}', type={self.file_type}, size={self.file_size})>"
class DocumentBlob(Base):
    """
    Separate table for storing PDF binary content.
    SQLite best practices: keep BLOBs in separate table for better query performance.
    Stored in encrypted SQLCipher database for security.

    One-to-one with Document: the primary key IS the document FK, and rows
    are removed when the owning document is deleted (CASCADE).
    """

    __tablename__ = "document_blobs"

    # Primary key references Document.id
    document_id = Column(
        String(36),
        ForeignKey("documents.id", ondelete="CASCADE"),
        primary_key=True,
        nullable=False,
    )

    # Binary PDF content
    pdf_binary = Column(LargeBinary, nullable=False)

    # Hash for integrity verification
    blob_hash = Column(String(64), nullable=True, index=True)  # SHA256

    # Timestamps
    stored_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    last_accessed = Column(UtcDateTime, nullable=True)

    # Relationship
    # passive_deletes lets the DB-level CASCADE handle deletion instead of
    # the ORM loading the blob just to delete it
    document = relationship(
        "Document",
        backref=backref("blob", passive_deletes=True),
        passive_deletes=True,
    )

    def __repr__(self):
        size = len(self.pdf_binary) if self.pdf_binary else 0
        return f"<DocumentBlob(document_id='{self.document_id[:8]}...', size={size})>"
class Collection(Base):
    """
    Collections for organizing documents.
    'Library' is the default collection for research downloads.
    Users can create custom collections for organization.

    Embedding/chunking settings are nullable: they are recorded when the
    collection is first indexed so later re-indexing can reuse them.
    """

    __tablename__ = "collections"

    id = Column(String(36), primary_key=True)  # UUID as string
    name = Column(String(255), nullable=False)
    description = Column(Text)

    # Collection type (default_library, user_collection, linked_folder)
    collection_type = Column(String(50), default="user_collection")

    # Is this the default library collection?
    is_default = Column(Boolean, default=False)

    # Embedding model used for this collection (stored when first indexed)
    embedding_model = Column(
        String(100), nullable=True
    )  # e.g., 'all-MiniLM-L6-v2', 'nomic-embed-text:latest'
    embedding_model_type = Column(
        Enum(
            EmbeddingProvider,
            values_callable=lambda obj: [e.value for e in obj],
        ),
        nullable=True,
    )
    embedding_dimension = Column(Integer, nullable=True)  # Vector dimension
    chunk_size = Column(Integer, nullable=True)  # Chunk size used
    chunk_overlap = Column(Integer, nullable=True)  # Chunk overlap used

    # Advanced embedding configuration options (Issue #1054)
    splitter_type = Column(
        String(50), nullable=True
    )  # Splitter type: 'recursive', 'semantic', 'token', 'sentence'
    text_separators = Column(
        JSON, nullable=True
    )  # Text separators for chunking, e.g., ["\n\n", "\n", ". ", " ", ""]
    distance_metric = Column(
        String(50), nullable=True
    )  # Distance metric: 'cosine', 'l2', 'dot_product'
    normalize_vectors = Column(
        Boolean, nullable=True
    )  # Whether to normalize embeddings with L2
    index_type = Column(
        String(50), nullable=True
    )  # FAISS index type: 'flat', 'hnsw', 'ivf'

    # Timestamps
    created_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    updated_at = Column(
        UtcDateTime, default=utcnow(), onupdate=utcnow(), nullable=False
    )

    # Relationships
    # Membership rows (DocumentCollection); deleted with the collection
    document_links = relationship(
        "DocumentCollection",
        back_populates="collection",
        cascade="all, delete-orphan",
    )
    # Local folders linked for indexing; deleted with the collection
    linked_folders = relationship(
        "CollectionFolder",
        back_populates="collection",
        cascade="all, delete-orphan",
    )

    def __repr__(self):
        return f"<Collection(id='{self.id}', name='{self.name}', type='{self.collection_type}')>"
class DocumentCollection(Base):
    """
    Many-to-many relationship between documents and collections.
    Tracks indexing status per collection (documents can be in multiple collections).

    A unique constraint on (document_id, collection_id) guarantees at most
    one membership row per pair.
    """

    __tablename__ = "document_collections"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # Foreign keys
    document_id = Column(
        String(36),
        ForeignKey("documents.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )
    collection_id = Column(
        String(36),
        ForeignKey("collections.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )

    # Indexing status (per collection!)
    indexed = Column(
        Boolean, default=False
    )  # Whether indexed for this collection
    chunk_count = Column(
        Integer, default=0
    )  # Number of chunks in this collection
    last_indexed_at = Column(UtcDateTime, nullable=True)

    # Timestamps
    added_at = Column(UtcDateTime, default=utcnow(), nullable=False)

    # Relationships
    document = relationship("Document", back_populates="collections")
    collection = relationship("Collection", back_populates="document_links")

    # Ensure one entry per document-collection pair
    __table_args__ = (
        UniqueConstraint(
            "document_id", "collection_id", name="uix_document_collection"
        ),
        Index("idx_collection_indexed", "collection_id", "indexed"),
    )

    def __repr__(self):
        return f"<DocumentCollection(doc_id={self.document_id}, coll_id={self.collection_id}, indexed={self.indexed})>"
class DocumentChunk(Base):
    """
    Universal chunk storage for RAG across all sources.
    Stores text chunks in encrypted database for semantic search.

    Each chunk maps to one FAISS vector through the unique ``embedding_id``;
    chunks are deduplicated per collection via (chunk_hash, collection_name).
    Note: ``source_id`` is an untyped String column (no FK) so chunks can
    reference either documents or folder files, per ``source_type``.
    """

    __tablename__ = "document_chunks"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # Chunk identification
    chunk_hash = Column(
        String(64), nullable=False, index=True
    )  # SHA256 for deduplication

    # Source tracking - now points to unified Document table
    source_type = Column(
        String(20), nullable=False, index=True
    )  # 'document', 'folder_file'
    source_id = Column(
        String(36), nullable=True, index=True
    )  # Document.id (UUID as string)
    source_path = Column(
        Text, nullable=True
    )  # File path if local collection source
    collection_name = Column(
        String(100), nullable=False, index=True
    )  # collection_<uuid>

    # Chunk content (encrypted in SQLCipher DB)
    chunk_text = Column(Text, nullable=False)  # The actual chunk text
    chunk_index = Column(Integer, nullable=False)  # Position in source document
    start_char = Column(Integer, nullable=False)  # Start character position
    end_char = Column(Integer, nullable=False)  # End character position
    word_count = Column(Integer, nullable=False)  # Number of words in chunk

    # Embedding metadata
    embedding_id = Column(
        String(36), nullable=False, unique=True, index=True
    )  # UUID for FAISS vector mapping
    embedding_model = Column(
        String(100), nullable=False
    )  # e.g., 'all-MiniLM-L6-v2'
    embedding_model_type = Column(
        Enum(
            EmbeddingProvider,
            values_callable=lambda obj: [e.value for e in obj],
        ),
        nullable=False,
    )
    embedding_dimension = Column(Integer, nullable=True)  # Vector dimension

    # Document metadata (for context)
    document_title = Column(Text, nullable=True)  # Title of source document
    document_metadata = Column(
        JSON, nullable=True
    )  # Additional metadata from source

    # Timestamps
    created_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    last_accessed = Column(UtcDateTime, nullable=True)

    # Indexes for efficient queries
    __table_args__ = (
        UniqueConstraint(
            "chunk_hash", "collection_name", name="uix_chunk_collection"
        ),
        Index("idx_chunk_source", "source_type", "source_id"),
        Index("idx_chunk_collection", "collection_name", "created_at"),
        Index("idx_chunk_embedding", "embedding_id"),
    )

    def __repr__(self):
        return f"<DocumentChunk(collection='{self.collection_name}', source_type='{self.source_type}', index={self.chunk_index}, words={self.word_count})>"
class DownloadQueue(Base):
    """
    Queue for pending document downloads.
    Renamed from LibraryDownloadQueue for consistency.

    ``resource_id`` is unique, so each research resource has at most one
    queue entry; retries are bounded by ``max_attempts``.
    """

    __tablename__ = "download_queue"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # What to download
    resource_id = Column(
        Integer,
        ForeignKey("research_resources.id", ondelete="CASCADE"),
        nullable=False,
        unique=True,  # One queue entry per resource
    )
    # NOTE(review): plain String, no FK to research_history — presumably
    # intentional to keep queue rows independent of research deletion; verify
    research_id = Column(String(36), nullable=False, index=True)

    # Target collection (defaults to Library collection)
    collection_id = Column(
        String(36),
        ForeignKey("collections.id", ondelete="SET NULL"),
        nullable=True,
        index=True,
    )

    # Queue management
    priority = Column(Integer, default=0)  # Higher = more important
    status = Column(
        Enum(
            DocumentStatus, values_callable=lambda obj: [e.value for e in obj]
        ),
        nullable=False,
        default=DocumentStatus.PENDING,
    )
    attempts = Column(Integer, default=0)  # Attempts made so far
    max_attempts = Column(Integer, default=3)  # Retry ceiling

    # Error tracking
    last_error = Column(Text, nullable=True)
    last_attempt_at = Column(UtcDateTime, nullable=True)

    # Timestamps
    queued_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    completed_at = Column(UtcDateTime, nullable=True)

    # Relationships
    resource = relationship("ResearchResource", backref="download_queue")
    collection = relationship("Collection", backref="download_queue_items")

    def __repr__(self):
        return f"<DownloadQueue(resource_id={self.resource_id}, status={self.status}, attempts={self.attempts})>"
class LibraryStatistics(Base):
    """
    Aggregate statistics for the library.
    Updated periodically for dashboard display.

    Pure snapshot table: no foreign keys or relationships; each row is a
    point-in-time aggregate stamped by ``calculated_at``.
    """

    __tablename__ = "library_statistics"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # Document counts
    total_documents = Column(Integer, default=0)
    total_pdfs = Column(Integer, default=0)
    total_html = Column(Integer, default=0)
    total_other = Column(Integer, default=0)

    # Storage metrics
    total_size_bytes = Column(Integer, default=0)
    average_document_size = Column(Integer, default=0)  # Bytes

    # Research metrics
    total_researches_with_downloads = Column(Integer, default=0)
    average_documents_per_research = Column(Integer, default=0)

    # Download metrics
    total_download_attempts = Column(Integer, default=0)
    successful_downloads = Column(Integer, default=0)
    failed_downloads = Column(Integer, default=0)
    pending_downloads = Column(Integer, default=0)

    # Academic sources breakdown
    arxiv_count = Column(Integer, default=0)
    pubmed_count = Column(Integer, default=0)
    doi_count = Column(Integer, default=0)
    other_count = Column(Integer, default=0)

    # Timestamps
    calculated_at = Column(UtcDateTime, default=utcnow(), nullable=False)

    def __repr__(self):
        return f"<LibraryStatistics(documents={self.total_documents}, size={self.total_size_bytes})>"
class RAGIndex(Base):
    """
    Tracks FAISS indices for RAG collections.
    Each collection+embedding_model combination has its own FAISS index.

    Uniqueness is enforced two ways: a composite constraint on
    (collection_name, embedding_model, embedding_model_type) and a unique
    ``index_hash`` derived from collection+model. ``is_current`` marks the
    index actually used for a collection when several exist.
    """

    __tablename__ = "rag_indices"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # Collection and model identification
    collection_name = Column(
        String(100), nullable=False, index=True
    )  # 'collection_<uuid>'
    embedding_model = Column(
        String(100), nullable=False
    )  # e.g., 'all-MiniLM-L6-v2'
    embedding_model_type = Column(
        Enum(
            EmbeddingProvider,
            values_callable=lambda obj: [e.value for e in obj],
        ),
        nullable=False,
    )
    embedding_dimension = Column(Integer, nullable=False)  # Vector dimension

    # Index file location
    index_path = Column(Text, nullable=False)  # Path to .faiss file
    index_hash = Column(
        String(64), nullable=False, unique=True, index=True
    )  # SHA256 of collection+model for uniqueness

    # Chunking parameters used
    chunk_size = Column(Integer, nullable=False)
    chunk_overlap = Column(Integer, nullable=False)

    # Advanced embedding configuration options (Issue #1054)
    splitter_type = Column(
        String(50), nullable=True
    )  # Splitter type: 'recursive', 'semantic', 'token', 'sentence'
    text_separators = Column(
        JSON, nullable=True
    )  # Text separators for chunking, e.g., ["\n\n", "\n", ". ", " ", ""]
    distance_metric = Column(
        String(50), nullable=True
    )  # Distance metric: 'cosine', 'l2', 'dot_product'
    normalize_vectors = Column(
        Boolean, nullable=True
    )  # Whether to normalize embeddings with L2
    index_type = Column(
        String(50), nullable=True
    )  # FAISS index type: 'flat', 'hnsw', 'ivf'

    # Index statistics
    chunk_count = Column(Integer, default=0)  # Number of chunks in this index
    total_documents = Column(Integer, default=0)  # Number of source documents

    # Status
    status = Column(
        Enum(
            RAGIndexStatus, values_callable=lambda obj: [e.value for e in obj]
        ),
        nullable=False,
        default=RAGIndexStatus.ACTIVE,
    )
    is_current = Column(
        Boolean, default=True
    )  # Whether this is the current index for this collection

    # Timestamps
    created_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    last_updated_at = Column(
        UtcDateTime, default=utcnow(), onupdate=utcnow(), nullable=False
    )
    last_used_at = Column(
        UtcDateTime, nullable=True
    )  # Last time index was searched

    # Ensure one active index per collection+model
    __table_args__ = (
        UniqueConstraint(
            "collection_name",
            "embedding_model",
            "embedding_model_type",
            name="uix_collection_model",
        ),
        Index("idx_collection_current", "collection_name", "is_current"),
    )

    def __repr__(self):
        return f"<RAGIndex(collection='{self.collection_name}', model='{self.embedding_model}', chunks={self.chunk_count})>"
class RagDocumentStatus(Base):
    """
    Tracks which documents have been indexed for RAG.
    Row existence = document is indexed. No row = not indexed.
    Simple and avoids ORM caching issues.

    Composite primary key (document_id, collection_id); all three FKs
    CASCADE, so rows vanish when the document, collection, or index goes.
    """

    __tablename__ = "rag_document_status"

    # Composite primary key
    document_id = Column(
        String(36),
        ForeignKey("documents.id", ondelete="CASCADE"),
        primary_key=True,
        nullable=False,
    )
    collection_id = Column(
        String(36),
        ForeignKey("collections.id", ondelete="CASCADE"),
        primary_key=True,
        nullable=False,
    )

    # Which RAG index was used (tracks embedding model indirectly)
    rag_index_id = Column(
        Integer,
        ForeignKey("rag_indices.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )

    # Metadata
    chunk_count = Column(Integer, nullable=False)  # Chunks produced for this doc
    indexed_at = Column(UtcDateTime, nullable=False, default=utcnow())

    # Indexes for fast lookups
    __table_args__ = (
        Index("idx_rag_status_collection", "collection_id"),
        Index("idx_rag_status_index", "rag_index_id"),
    )

    def __repr__(self):
        return f"<RagDocumentStatus(doc='{self.document_id[:8]}...', coll='{self.collection_id[:8]}...', chunks={self.chunk_count})>"
class CollectionFolder(Base):
    """
    Local folders linked to a collection for indexing.

    Files discovered inside the folder are tracked as CollectionFolderFile
    rows (one-to-many, deleted with the folder).
    """

    __tablename__ = "collection_folders"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # Collection association
    collection_id = Column(
        String(36),
        ForeignKey("collections.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )

    # Folder configuration
    folder_path = Column(Text, nullable=False)  # Absolute path to folder
    include_patterns = Column(
        JSON, default=["*.pdf", "*.txt", "*.md", "*.html"]
    )  # File patterns to include
    exclude_patterns = Column(
        JSON
    )  # Patterns to exclude (e.g., ["**/node_modules/**"])
    recursive = Column(Boolean, default=True)  # Search subfolders

    # Monitoring
    watch_enabled = Column(
        Boolean, default=False
    )  # Auto-reindex on changes (future)
    last_scanned_at = Column(UtcDateTime, nullable=True)
    file_count = Column(Integer, default=0)  # Total files found
    indexed_file_count = Column(Integer, default=0)  # Files indexed

    # Timestamps
    created_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    updated_at = Column(
        UtcDateTime, default=utcnow(), onupdate=utcnow(), nullable=False
    )

    # Relationships
    collection = relationship("Collection", back_populates="linked_folders")
    files = relationship(
        "CollectionFolderFile",
        back_populates="folder",
        cascade="all, delete-orphan",
    )

    def __repr__(self):
        return f"<CollectionFolder(path='{self.folder_path}', files={self.file_count})>"
class CollectionFolderFile(Base):
    """
    Files found in linked folders.
    Lightweight tracking for deduplication and indexing status.

    Uniqueness is per (folder_id, relative_path), so the same relative path
    may appear under different folders.
    """

    __tablename__ = "collection_folder_files"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # Folder association
    folder_id = Column(
        Integer,
        ForeignKey("collection_folders.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )

    # File identification
    relative_path = Column(Text, nullable=False)  # Path relative to folder_path
    file_hash = Column(String(64), index=True)  # SHA256 for deduplication
    file_size = Column(Integer)  # Size in bytes
    file_type = Column(String(50))  # Extension

    # File metadata
    last_modified = Column(UtcDateTime)  # File modification time

    # Indexing status
    indexed = Column(Boolean, default=False)
    chunk_count = Column(Integer, default=0)
    last_indexed_at = Column(UtcDateTime, nullable=True)
    index_error = Column(Text, nullable=True)  # Error if indexing failed

    # Timestamps
    discovered_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    updated_at = Column(
        UtcDateTime, default=utcnow(), onupdate=utcnow(), nullable=False
    )

    # Relationships
    folder = relationship("CollectionFolder", back_populates="files")

    # Ensure one entry per file in folder
    __table_args__ = (
        UniqueConstraint("folder_id", "relative_path", name="uix_folder_file"),
        Index("idx_folder_indexed", "folder_id", "indexed"),
    )

    def __repr__(self):
        return f"<CollectionFolderFile(path='{self.relative_path}', indexed={self.indexed})>"