Coverage for src/local_deep_research/database/models/library.py: 99%
307 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
1"""
2Library and document models - Unified architecture.
3All documents (research downloads and user uploads) are stored in one table.
4Collections organize documents, with "Library" as the default collection.
5"""
7import enum
9from sqlalchemy import (
10 JSON,
11 Boolean,
12 Column,
13 Date,
14 Enum,
15 ForeignKey,
16 Index,
17 Integer,
18 LargeBinary,
19 String,
20 Text,
21 UniqueConstraint,
22)
23from sqlalchemy.orm import backref, relationship
24from sqlalchemy_utc import UtcDateTime, utcnow
26from .base import Base
class RAGIndexStatus(enum.Enum):
    """Lifecycle states recorded for a RAG index."""

    ACTIVE = "active"
    REBUILDING = "rebuilding"
    DEPRECATED = "deprecated"
class DocumentStatus(enum.Enum):
    """States used for document processing and for download queue entries."""

    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"
class EmbeddingProvider(enum.Enum):
    """Kinds of embedding model provider.

    ``OPENAI`` stands for the OpenAI cloud API as well as any
    OpenAI-compatible endpoint (LM Studio, vLLM, llama.cpp server, etc.).
    The provider class reads ``embeddings.openai.base_url``: when it is set
    a local server is targeted, otherwise the OpenAI cloud is used.
    """

    SENTENCE_TRANSFORMERS = "sentence_transformers"
    OLLAMA = "ollama"
    OPENAI = "openai"
class ExtractionMethod(str, enum.Enum):
    """How the text of a document was extracted (string-valued enum)."""

    PDF_EXTRACTION = "pdf_extraction"
    NATIVE_API = "native_api"
    UNKNOWN = "unknown"
class ExtractionSource(str, enum.Enum):
    """Where extracted text came from (string-valued enum)."""

    ARXIV_API = "arxiv_api"
    PUBMED_API = "pubmed_api"
    PDFPLUMBER = "pdfplumber"
    PDFPLUMBER_FALLBACK = "pdfplumber_fallback"
    LOCAL_PDF = "local_pdf"
    LEGACY_FILE = "legacy_file"
class ExtractionQuality(str, enum.Enum):
    """Quality grade assigned to extracted text (string-valued enum)."""

    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"
class DistanceMetric(str, enum.Enum):
    """Vector similarity search distance metrics (string-valued enum)."""

    COSINE = "cosine"
    L2 = "l2"
    DOT_PRODUCT = "dot_product"
class IndexType(str, enum.Enum):
    """FAISS index types available for RAG (string-valued enum)."""

    FLAT = "flat"
    HNSW = "hnsw"
    IVF = "ivf"
class SplitterType(str, enum.Enum):
    """Text splitter types used when chunking (string-valued enum)."""

    RECURSIVE = "recursive"
    SEMANTIC = "semantic"
    TOKEN = "token"
    SENTENCE = "sentence"
class PDFStorageMode(str, enum.Enum):
    """Where PDF files are kept (string-valued enum)."""

    # Don't store PDFs, text-only
    NONE = "none"
    # Store PDFs unencrypted on filesystem
    FILESYSTEM = "filesystem"
    # Store PDFs encrypted in database
    DATABASE = "database"
class SourceType(Base):
    """
    Normalized lookup table of document source types.

    The original authors list research_download, user_upload and
    manual_entry as example type names.
    """

    __tablename__ = "source_types"

    # UUID stored as a 36-character string
    id = Column(String(36), primary_key=True)
    # Unique, indexed machine name of the source type
    name = Column(String(50), unique=True, index=True, nullable=False)
    display_name = Column(String(100), nullable=False)
    description = Column(Text)
    # Icon name for UI
    icon = Column(String(50))

    # Timestamps
    created_at = Column(UtcDateTime, nullable=False, default=utcnow())

    def __repr__(self):
        """Return a short debug string with the name and display name."""
        return f"<SourceType(name='{self.name}', display='{self.display_name}')>"
class UploadBatch(Base):
    """
    A batch of user-uploaded files, grouped for traceability and batch
    operations.

    TODO: As of 2026-05 this table is dormant. No code path creates
    UploadBatch rows or sets Document.upload_batch_id (the
    ``upload_batch_id`` column on Document). Wiring it up needs a product
    decision on what defines a "batch" (per upload submit, per UI session,
    etc.) and surfacing the grouping in the upload routes / UI. The schema
    stays because it is harmless and removing it would require a migration
    against every user's per-user encrypted DB.
    """

    __tablename__ = "upload_batches"

    # UUID stored as a 36-character string
    id = Column(String(36), primary_key=True)
    # Owning collection; batch rows are deleted with the collection
    collection_id = Column(
        String(36),
        ForeignKey("collections.id", ondelete="CASCADE"),
        index=True,
        nullable=False,
    )
    uploaded_at = Column(UtcDateTime, nullable=False, default=utcnow())
    file_count = Column(Integer, default=0)
    # Total bytes
    total_size = Column(Integer, default=0)

    # Relationships (each one also adds a backref on the target model)
    collection = relationship("Collection", backref="upload_batches")
    documents = relationship("Document", backref="upload_batch")

    def __repr__(self):
        """Return a short debug string with id, file count and total size."""
        return f"<UploadBatch(id='{self.id}', files={self.file_count}, size={self.total_size})>"
class Document(Base):
    """
    Single table holding every document, whether it came from a research
    download or from a user upload.
    """

    __tablename__ = "documents"

    # UUID as string
    id = Column(String(36), primary_key=True)

    # Source type (research_download, user_upload, etc.)
    source_type_id = Column(
        String(36),
        ForeignKey("source_types.id"),
        index=True,
        nullable=False,
    )

    # Original research resource (research downloads); NULL for uploads
    resource_id = Column(
        Integer,
        ForeignKey("research_resources.id", ondelete="SET NULL"),
        index=True,
        nullable=True,
    )

    # Owning research (research downloads); NULL for uploads
    research_id = Column(
        String(36),
        ForeignKey("research_history.id", ondelete="CASCADE"),
        index=True,
        nullable=True,
    )

    # Upload batch (user uploads); NULL for research downloads
    upload_batch_id = Column(
        String(36),
        ForeignKey("upload_batches.id", ondelete="SET NULL"),
        index=True,
        nullable=True,
    )

    # Document identification
    # SHA256 for deduplication
    document_hash = Column(String(64), unique=True, index=True, nullable=False)
    # Source URL (for downloads)
    original_url = Column(Text, nullable=True)
    # Display name (for uploads)
    filename = Column(String(500), nullable=True)
    # Original upload name
    original_filename = Column(String(500), nullable=True)

    # File information
    # Path relative to library/uploads root
    file_path = Column(Text, nullable=True)
    # Size in bytes
    file_size = Column(Integer, nullable=False)
    # pdf, txt, md, html, etc.
    file_type = Column(String(50), nullable=False)
    # MIME type
    mime_type = Column(String(100), nullable=True)

    # Content storage - text always stored in DB
    # Extracted/uploaded text content
    text_content = Column(Text, nullable=True)

    # PDF storage mode (none, filesystem, database); a PDFStorageMode value
    storage_mode = Column(String(20), default="database", nullable=True)

    # Metadata
    # Document title
    title = Column(Text)
    # User description
    description = Column(Text)
    # List of authors (for research papers)
    authors = Column(JSON)
    # Publication date
    published_date = Column(Date, nullable=True)

    # Academic identifiers (for research papers)
    doi = Column(String(255), index=True, nullable=True)
    arxiv_id = Column(String(100), index=True, nullable=True)
    pmid = Column(String(50), index=True, nullable=True)
    pmcid = Column(String(50), index=True, nullable=True)
    isbn = Column(String(20), nullable=True)

    # Download/Upload information
    # The enum is persisted by member value rather than member name
    status = Column(
        Enum(
            DocumentStatus,
            values_callable=lambda enum_cls: [
                member.value for member in enum_cls
            ],
        ),
        default=DocumentStatus.COMPLETED,
        nullable=False,
    )
    attempts = Column(Integer, default=1)
    error_message = Column(Text, nullable=True)
    processed_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    last_accessed = Column(UtcDateTime, nullable=True)

    # Text extraction metadata (for research downloads from PDFs)
    # pdf_extraction, native_api, etc.
    extraction_method = Column(String(50), nullable=True)
    # arxiv_api, pdfplumber, etc.
    extraction_source = Column(String(50), nullable=True)
    # high, medium, low
    extraction_quality = Column(String(20), nullable=True)
    has_formatting_issues = Column(Boolean, default=False)
    has_encoding_issues = Column(Boolean, default=False)
    character_count = Column(Integer, nullable=True)
    word_count = Column(Integer, nullable=True)

    # Organization
    # User-defined tags
    tags = Column(JSON)
    # User notes
    notes = Column(Text)
    favorite = Column(Boolean, default=False)

    # Timestamps
    created_at = Column(UtcDateTime, nullable=False, default=utcnow())
    updated_at = Column(
        UtcDateTime,
        nullable=False,
        default=utcnow(),
        onupdate=utcnow(),
    )

    # Relationships
    source_type = relationship("SourceType", backref="documents")
    resource = relationship(
        "ResearchResource",
        backref="documents",
        foreign_keys="[Document.resource_id]",
    )
    research = relationship("ResearchHistory", backref="documents")
    # Association rows linking this document to collections
    collections = relationship(
        "DocumentCollection",
        cascade="all, delete-orphan",
        back_populates="document",
    )

    # Indexes for efficient queries
    __table_args__ = (
        Index("idx_source_type", "source_type_id", "status"),
        Index("idx_research_documents", "research_id", "status"),
        Index("idx_document_type", "file_type", "status"),
        Index("idx_document_hash", "document_hash"),
    )

    def __repr__(self):
        """Return a short debug string.

        The label is the first 50 characters of the title, falling back to
        the first 50 characters of the filename, then to "Untitled".
        """
        if self.title:
            title_str = self.title[:50]
        elif self.filename:
            title_str = self.filename[:50]
        else:
            title_str = "Untitled"
        return f"<Document(title='{title_str}', type={self.file_type}, size={self.file_size})>"
class DocumentBlob(Base):
    """
    Separate table for storing PDF binary content.
    SQLite best practices: keep BLOBs in separate table for better query performance.
    Stored in encrypted SQLCipher database for security.
    """

    __tablename__ = "document_blobs"

    # Primary key references Document.id; rows are removed with the document
    document_id = Column(
        String(36),
        ForeignKey("documents.id", ondelete="CASCADE"),
        primary_key=True,
        nullable=False,
    )

    # Binary PDF content
    pdf_binary = Column(LargeBinary, nullable=False)

    # Hash for integrity verification (SHA256)
    blob_hash = Column(String(64), nullable=True, index=True)

    # Timestamps
    stored_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    last_accessed = Column(UtcDateTime, nullable=True)

    # Relationship; also exposes ``Document.blob`` through the backref.
    # passive_deletes is set on both sides, leaving row removal to the
    # foreign key's ON DELETE CASCADE.
    document = relationship(
        "Document",
        backref=backref("blob", passive_deletes=True),
        passive_deletes=True,
    )

    def __repr__(self):
        """Return a short debug string with a truncated id and blob size.

        ``document_id`` can still be None on an instance whose key has not
        been assigned yet. Slicing None would raise TypeError, so that case
        is rendered as ``document_id=None``.
        """
        size = len(self.pdf_binary) if self.pdf_binary else 0
        if self.document_id is None:
            id_repr = "None"
        else:
            id_repr = f"'{self.document_id[:8]}...'"
        return f"<DocumentBlob(document_id={id_repr}, size={size})>"
class Collection(Base):
    """
    A named grouping of documents.

    'Library' is the default collection for research downloads; users can
    create additional collections of their own.
    """

    __tablename__ = "collections"

    # UUID as string
    id = Column(String(36), primary_key=True)
    name = Column(String(255), nullable=False)
    description = Column(Text)

    # Collection type (default_library, user_collection, linked_folder)
    collection_type = Column(String(50), default="user_collection")

    # Is this the default library collection?
    is_default = Column(Boolean, default=False)

    # Embedding model used for this collection (stored when first indexed),
    # e.g., 'all-MiniLM-L6-v2', 'nomic-embed-text:latest'
    embedding_model = Column(String(100), nullable=True)
    # The enum is persisted by member value rather than member name
    embedding_model_type = Column(
        Enum(
            EmbeddingProvider,
            values_callable=lambda enum_cls: [
                member.value for member in enum_cls
            ],
        ),
        nullable=True,
    )
    # Vector dimension
    embedding_dimension = Column(Integer, nullable=True)
    # Chunk size used
    chunk_size = Column(Integer, nullable=True)
    # Chunk overlap used
    chunk_overlap = Column(Integer, nullable=True)

    # Advanced embedding configuration options (Issue #1054)
    # Splitter type: 'recursive', 'semantic', 'token', 'sentence'
    splitter_type = Column(String(50), nullable=True)
    # Text separators for chunking, e.g., ["\n\n", "\n", ". ", " ", ""]
    text_separators = Column(JSON, nullable=True)
    # Distance metric: 'cosine', 'l2', 'dot_product'
    distance_metric = Column(String(50), nullable=True)
    # Whether to normalize embeddings with L2
    normalize_vectors = Column(Boolean, nullable=True)
    # FAISS index type: 'flat', 'hnsw', 'ivf'
    index_type = Column(String(50), nullable=True)

    # Timestamps
    created_at = Column(UtcDateTime, nullable=False, default=utcnow())
    updated_at = Column(
        UtcDateTime,
        nullable=False,
        default=utcnow(),
        onupdate=utcnow(),
    )

    # Relationships
    document_links = relationship(
        "DocumentCollection",
        cascade="all, delete-orphan",
        back_populates="collection",
    )
    linked_folders = relationship(
        "CollectionFolder",
        cascade="all, delete-orphan",
        back_populates="collection",
    )

    def __repr__(self):
        """Return a short debug string with id, name and collection type."""
        return f"<Collection(id='{self.id}', name='{self.name}', type='{self.collection_type}')>"
class DocumentCollection(Base):
    """
    Association row between a document and a collection (many-to-many).

    Indexing status is kept per row, because one document can belong to
    several collections.
    """

    __tablename__ = "document_collections"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # Foreign keys; the row is deleted when either side is deleted
    document_id = Column(
        String(36),
        ForeignKey("documents.id", ondelete="CASCADE"),
        index=True,
        nullable=False,
    )
    collection_id = Column(
        String(36),
        ForeignKey("collections.id", ondelete="CASCADE"),
        index=True,
        nullable=False,
    )

    # Indexing status (per collection!)
    # Whether indexed for this collection
    indexed = Column(Boolean, default=False)
    # Number of chunks in this collection
    chunk_count = Column(Integer, default=0)
    last_indexed_at = Column(UtcDateTime, nullable=True)

    # Timestamps
    added_at = Column(UtcDateTime, nullable=False, default=utcnow())

    # Relationships
    document = relationship("Document", back_populates="collections")
    collection = relationship("Collection", back_populates="document_links")

    # Ensure one entry per document-collection pair
    __table_args__ = (
        UniqueConstraint(
            "document_id",
            "collection_id",
            name="uix_document_collection",
        ),
        Index("idx_collection_indexed", "collection_id", "indexed"),
    )

    def __repr__(self):
        """Return a short debug string with both ids and the indexed flag."""
        return f"<DocumentCollection(doc_id={self.document_id}, coll_id={self.collection_id}, indexed={self.indexed})>"
class DocumentChunk(Base):
    """
    Text chunk storage for RAG, shared by all sources.

    Chunks are kept in the encrypted database and used for semantic search.
    """

    __tablename__ = "document_chunks"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # Chunk identification
    # SHA256 for deduplication
    chunk_hash = Column(String(64), index=True, nullable=False)

    # Source tracking - now points to unified Document table
    # 'document', 'folder_file'
    source_type = Column(String(20), index=True, nullable=False)
    # Document.id (UUID as string)
    source_id = Column(String(36), index=True, nullable=True)
    # File path if local collection source
    source_path = Column(Text, nullable=True)
    # collection_<uuid>
    collection_name = Column(String(100), index=True, nullable=False)

    # Chunk content (encrypted in SQLCipher DB)
    # The actual chunk text
    chunk_text = Column(Text, nullable=False)
    # Position in source document
    chunk_index = Column(Integer, nullable=False)
    # Start character position
    start_char = Column(Integer, nullable=False)
    # End character position
    end_char = Column(Integer, nullable=False)
    # Number of words in chunk
    word_count = Column(Integer, nullable=False)

    # Embedding metadata
    # UUID for FAISS vector mapping
    embedding_id = Column(String(36), unique=True, index=True, nullable=False)
    # e.g., 'all-MiniLM-L6-v2'
    embedding_model = Column(String(100), nullable=False)
    # The enum is persisted by member value rather than member name
    embedding_model_type = Column(
        Enum(
            EmbeddingProvider,
            values_callable=lambda enum_cls: [
                member.value for member in enum_cls
            ],
        ),
        nullable=False,
    )
    # Vector dimension
    embedding_dimension = Column(Integer, nullable=True)

    # Document metadata (for context)
    # Title of source document
    document_title = Column(Text, nullable=True)
    # Additional metadata from source
    document_metadata = Column(JSON, nullable=True)

    # Timestamps
    created_at = Column(UtcDateTime, nullable=False, default=utcnow())
    last_accessed = Column(UtcDateTime, nullable=True)

    # Indexes for efficient queries; a chunk hash is unique per collection
    __table_args__ = (
        UniqueConstraint(
            "chunk_hash",
            "collection_name",
            name="uix_chunk_collection",
        ),
        Index("idx_chunk_source", "source_type", "source_id"),
        Index("idx_chunk_collection", "collection_name", "created_at"),
        Index("idx_chunk_embedding", "embedding_id"),
    )

    def __repr__(self):
        """Return a short debug string with collection, source type, index and word count."""
        return f"<DocumentChunk(collection='{self.collection_name}', source_type='{self.source_type}', index={self.chunk_index}, words={self.word_count})>"
class DownloadQueue(Base):
    """
    Pending document downloads.

    Renamed from LibraryDownloadQueue for consistency.
    """

    __tablename__ = "download_queue"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # What to download; unique, so there is one queue entry per resource
    resource_id = Column(
        Integer,
        ForeignKey("research_resources.id", ondelete="CASCADE"),
        unique=True,
        nullable=False,
    )
    research_id = Column(String(36), index=True, nullable=False)

    # Target collection (defaults to Library collection)
    collection_id = Column(
        String(36),
        ForeignKey("collections.id", ondelete="SET NULL"),
        index=True,
        nullable=True,
    )

    # Queue management
    # Higher = more important
    priority = Column(Integer, default=0)
    # The enum is persisted by member value rather than member name
    status = Column(
        Enum(
            DocumentStatus,
            values_callable=lambda enum_cls: [
                member.value for member in enum_cls
            ],
        ),
        default=DocumentStatus.PENDING,
        nullable=False,
    )
    attempts = Column(Integer, default=0)
    max_attempts = Column(Integer, default=3)

    # Error tracking
    last_error = Column(Text, nullable=True)
    last_attempt_at = Column(UtcDateTime, nullable=True)

    # Timestamps
    queued_at = Column(UtcDateTime, nullable=False, default=utcnow())
    completed_at = Column(UtcDateTime, nullable=True)

    # Relationships (each one also adds a backref on the target model)
    resource = relationship("ResearchResource", backref="download_queue")
    collection = relationship("Collection", backref="download_queue_items")

    def __repr__(self):
        """Return a short debug string with resource id, status and attempts."""
        return f"<DownloadQueue(resource_id={self.resource_id}, status={self.status}, attempts={self.attempts})>"
class LibraryStatistics(Base):
    """
    Aggregated library statistics.

    Updated periodically for dashboard display. Every counter column is an
    Integer with a default of 0.
    """

    __tablename__ = "library_statistics"

    id = Column(Integer, autoincrement=True, primary_key=True)

    # Document counts
    total_documents = Column(Integer, default=0)
    total_pdfs = Column(Integer, default=0)
    total_html = Column(Integer, default=0)
    total_other = Column(Integer, default=0)

    # Storage metrics
    total_size_bytes = Column(Integer, default=0)
    average_document_size = Column(Integer, default=0)

    # Research metrics
    total_researches_with_downloads = Column(Integer, default=0)
    average_documents_per_research = Column(Integer, default=0)

    # Download metrics
    total_download_attempts = Column(Integer, default=0)
    successful_downloads = Column(Integer, default=0)
    failed_downloads = Column(Integer, default=0)
    pending_downloads = Column(Integer, default=0)

    # Academic sources breakdown
    arxiv_count = Column(Integer, default=0)
    pubmed_count = Column(Integer, default=0)
    doi_count = Column(Integer, default=0)
    other_count = Column(Integer, default=0)

    # Timestamps
    calculated_at = Column(UtcDateTime, nullable=False, default=utcnow())

    def __repr__(self):
        """Return a short debug string with document count and total size."""
        return f"<LibraryStatistics(documents={self.total_documents}, size={self.total_size_bytes})>"
class RAGIndex(Base):
    """
    Bookkeeping for the FAISS indices behind RAG collections.

    Each collection + embedding model combination gets its own FAISS index.
    """

    __tablename__ = "rag_indices"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # Collection and model identification
    # 'collection_<uuid>'
    collection_name = Column(String(100), index=True, nullable=False)
    # e.g., 'all-MiniLM-L6-v2'
    embedding_model = Column(String(100), nullable=False)
    # The enum is persisted by member value rather than member name
    embedding_model_type = Column(
        Enum(
            EmbeddingProvider,
            values_callable=lambda enum_cls: [
                member.value for member in enum_cls
            ],
        ),
        nullable=False,
    )
    # Vector dimension
    embedding_dimension = Column(Integer, nullable=False)

    # Index file location
    # Path to .faiss file
    index_path = Column(Text, nullable=False)
    # SHA256 of collection+model for uniqueness
    index_hash = Column(String(64), unique=True, index=True, nullable=False)

    # Chunking parameters used
    chunk_size = Column(Integer, nullable=False)
    chunk_overlap = Column(Integer, nullable=False)

    # Advanced embedding configuration options (Issue #1054)
    # Splitter type: 'recursive', 'semantic', 'token', 'sentence'
    splitter_type = Column(String(50), nullable=True)
    # Text separators for chunking, e.g., ["\n\n", "\n", ". ", " ", ""]
    text_separators = Column(JSON, nullable=True)
    # Distance metric: 'cosine', 'l2', 'dot_product'
    distance_metric = Column(String(50), nullable=True)
    # Whether to normalize embeddings with L2
    normalize_vectors = Column(Boolean, nullable=True)
    # FAISS index type: 'flat', 'hnsw', 'ivf'
    index_type = Column(String(50), nullable=True)

    # Index statistics
    # Number of chunks in this index
    chunk_count = Column(Integer, default=0)
    # Number of source documents
    total_documents = Column(Integer, default=0)

    # Status
    status = Column(
        Enum(
            RAGIndexStatus,
            values_callable=lambda enum_cls: [
                member.value for member in enum_cls
            ],
        ),
        default=RAGIndexStatus.ACTIVE,
        nullable=False,
    )
    # Whether this is the current index for this collection
    is_current = Column(Boolean, default=True)

    # Timestamps
    created_at = Column(UtcDateTime, nullable=False, default=utcnow())
    last_updated_at = Column(
        UtcDateTime,
        nullable=False,
        default=utcnow(),
        onupdate=utcnow(),
    )
    # Last time index was searched
    last_used_at = Column(UtcDateTime, nullable=True)

    # At most one row per (collection_name, embedding_model,
    # embedding_model_type) combination
    __table_args__ = (
        UniqueConstraint(
            "collection_name",
            "embedding_model",
            "embedding_model_type",
            name="uix_collection_model",
        ),
        Index("idx_collection_current", "collection_name", "is_current"),
    )

    def __repr__(self):
        """Return a short debug string with collection, model and chunk count."""
        return f"<RAGIndex(collection='{self.collection_name}', model='{self.embedding_model}', chunks={self.chunk_count})>"
class RagDocumentStatus(Base):
    """
    Tracks which documents have been indexed for RAG.
    Row existence = document is indexed. No row = not indexed.
    Simple and avoids ORM caching issues.
    """

    __tablename__ = "rag_document_status"

    # Composite primary key: (document_id, collection_id)
    document_id = Column(
        String(36),
        ForeignKey("documents.id", ondelete="CASCADE"),
        primary_key=True,
        nullable=False,
    )
    collection_id = Column(
        String(36),
        ForeignKey("collections.id", ondelete="CASCADE"),
        primary_key=True,
        nullable=False,
    )

    # Which RAG index was used (tracks embedding model indirectly)
    rag_index_id = Column(
        Integer,
        ForeignKey("rag_indices.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )

    # Metadata
    chunk_count = Column(Integer, nullable=False)
    indexed_at = Column(UtcDateTime, nullable=False, default=utcnow())

    # Indexes for fast lookups
    __table_args__ = (
        Index("idx_rag_status_collection", "collection_id"),
        Index("idx_rag_status_index", "rag_index_id"),
    )

    @staticmethod
    def _short_id(value):
        """Render an id for ``__repr__``: quoted 8-char prefix, or ``None``."""
        if value is None:
            return "None"
        return f"'{value[:8]}...'"

    def __repr__(self):
        """Return a short debug string with truncated ids and chunk count.

        Either key can still be None on an instance whose keys have not
        been assigned yet. Slicing None would raise TypeError, so unset
        keys are rendered as ``None``.
        """
        doc = self._short_id(self.document_id)
        coll = self._short_id(self.collection_id)
        return f"<RagDocumentStatus(doc={doc}, coll={coll}, chunks={self.chunk_count})>"
class CollectionFolder(Base):
    """
    Local folders linked to a collection for indexing.
    """

    __tablename__ = "collection_folders"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # Collection association; folder rows are deleted with the collection
    collection_id = Column(
        String(36),
        ForeignKey("collections.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )

    # Folder configuration
    # Absolute path to folder
    folder_path = Column(Text, nullable=False)
    # File patterns to include. The default is a callable so each insert
    # gets its own list; a list literal here would be one shared mutable
    # object reused for every row that relies on the default.
    include_patterns = Column(
        JSON, default=lambda: ["*.pdf", "*.txt", "*.md", "*.html"]
    )
    # Patterns to exclude (e.g., ["**/node_modules/**"])
    exclude_patterns = Column(JSON)
    # Search subfolders
    recursive = Column(Boolean, default=True)

    # Monitoring
    # Auto-reindex on changes (future)
    watch_enabled = Column(Boolean, default=False)
    last_scanned_at = Column(UtcDateTime, nullable=True)
    # Total files found
    file_count = Column(Integer, default=0)
    # Files indexed
    indexed_file_count = Column(Integer, default=0)

    # Timestamps
    created_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    updated_at = Column(
        UtcDateTime, default=utcnow(), onupdate=utcnow(), nullable=False
    )

    # Relationships
    collection = relationship("Collection", back_populates="linked_folders")
    files = relationship(
        "CollectionFolderFile",
        back_populates="folder",
        cascade="all, delete-orphan",
    )

    def __repr__(self):
        """Return a short debug string with the folder path and file count."""
        return f"<CollectionFolder(path='{self.folder_path}', files={self.file_count})>"
class CollectionFolderFile(Base):
    """
    A file discovered inside a linked folder.

    Lightweight record used for deduplication and indexing status.
    """

    __tablename__ = "collection_folder_files"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # Folder association; file rows are deleted with the folder
    folder_id = Column(
        Integer,
        ForeignKey("collection_folders.id", ondelete="CASCADE"),
        index=True,
        nullable=False,
    )

    # File identification
    # Path relative to folder_path
    relative_path = Column(Text, nullable=False)
    # SHA256 for deduplication
    file_hash = Column(String(64), index=True)
    # Size in bytes
    file_size = Column(Integer)
    # Extension
    file_type = Column(String(50))

    # File metadata
    # File modification time
    last_modified = Column(UtcDateTime)

    # Indexing status
    indexed = Column(Boolean, default=False)
    chunk_count = Column(Integer, default=0)
    last_indexed_at = Column(UtcDateTime, nullable=True)
    # Error if indexing failed
    index_error = Column(Text, nullable=True)

    # Timestamps
    discovered_at = Column(UtcDateTime, nullable=False, default=utcnow())
    updated_at = Column(
        UtcDateTime,
        nullable=False,
        default=utcnow(),
        onupdate=utcnow(),
    )

    # Relationships
    folder = relationship("CollectionFolder", back_populates="files")

    # Ensure one entry per file in folder
    __table_args__ = (
        UniqueConstraint("folder_id", "relative_path", name="uix_folder_file"),
        Index("idx_folder_indexed", "folder_id", "indexed"),
    )

    def __repr__(self):
        """Return a short debug string with the relative path and indexed flag."""
        return f"<CollectionFolderFile(path='{self.relative_path}', indexed={self.indexed})>"