Coverage for src / local_deep_research / database / models / library.py: 99%

306 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1""" 

2Library and document models - Unified architecture. 

3All documents (research downloads and user uploads) are stored in one table. 

4Collections organize documents, with "Library" as the default collection. 

5""" 

6 

7import enum 

8 

9from sqlalchemy import ( 

10 JSON, 

11 Boolean, 

12 Column, 

13 Date, 

14 Enum, 

15 ForeignKey, 

16 Index, 

17 Integer, 

18 LargeBinary, 

19 String, 

20 Text, 

21 UniqueConstraint, 

22) 

23from sqlalchemy.orm import backref, relationship 

24from sqlalchemy_utc import UtcDateTime, utcnow 

25 

26from .base import Base 

27 

28 

class RAGIndexStatus(enum.Enum):
    """
    Enumerates the status values a RAG index row can carry.

    Plain ``enum.Enum`` (no ``str`` mix-in): members do not compare equal
    to their string values.
    """

    ACTIVE = "active"
    REBUILDING = "rebuilding"
    DEPRECATED = "deprecated"

35 

36 

class DocumentStatus(enum.Enum):
    """
    Enumerates the processing/download states of a document.

    Plain ``enum.Enum`` (no ``str`` mix-in): members do not compare equal
    to their string values.
    """

    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"

44 

45 

class EmbeddingProvider(enum.Enum):
    """
    Enumerates the supported embedding model providers.

    Plain ``enum.Enum`` (no ``str`` mix-in): members do not compare equal
    to their string values.
    """

    SENTENCE_TRANSFORMERS = "sentence_transformers"
    OLLAMA = "ollama"

51 

52 

class ExtractionMethod(str, enum.Enum):
    """
    Enumerates how a document's text was extracted.

    Mixes in ``str``, so each member is itself a string equal to its value.
    """

    PDF_EXTRACTION = "pdf_extraction"
    NATIVE_API = "native_api"
    UNKNOWN = "unknown"

59 

60 

class ExtractionSource(str, enum.Enum):
    """
    Enumerates the sources text extraction can draw from.

    Mixes in ``str``, so each member is itself a string equal to its value.
    """

    ARXIV_API = "arxiv_api"
    PUBMED_API = "pubmed_api"
    PDFPLUMBER = "pdfplumber"
    PDFPLUMBER_FALLBACK = "pdfplumber_fallback"
    LOCAL_PDF = "local_pdf"
    LEGACY_FILE = "legacy_file"

70 

71 

class ExtractionQuality(str, enum.Enum):
    """
    Enumerates the quality levels assigned to extracted text.

    Mixes in ``str``, so each member is itself a string equal to its value.
    """

    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"

78 

79 

class DistanceMetric(str, enum.Enum):
    """
    Enumerates the distance metrics available for vector similarity search.

    Mixes in ``str``, so each member is itself a string equal to its value.
    """

    COSINE = "cosine"
    L2 = "l2"
    DOT_PRODUCT = "dot_product"

86 

87 

class IndexType(str, enum.Enum):
    """
    Enumerates the FAISS index types usable for RAG.

    Mixes in ``str``, so each member is itself a string equal to its value.
    """

    FLAT = "flat"
    HNSW = "hnsw"
    IVF = "ivf"

94 

95 

class SplitterType(str, enum.Enum):
    """
    Enumerates the text splitter types used for chunking.

    Mixes in ``str``, so each member is itself a string equal to its value.
    """

    RECURSIVE = "recursive"
    SEMANTIC = "semantic"
    TOKEN = "token"
    SENTENCE = "sentence"

103 

104 

class PDFStorageMode(str, enum.Enum):
    """
    Enumerates where (if anywhere) PDF files are kept.

    Mixes in ``str``, so each member is itself a string equal to its value.
    """

    # Don't store PDFs at all; text-only.
    NONE = "none"
    # Store PDFs unencrypted on the filesystem.
    FILESYSTEM = "filesystem"
    # Store PDFs encrypted in the database.
    DATABASE = "database"

111 

112 

class SourceType(Base):
    """
    Normalized lookup table of document source types.

    Each row names one source category (research_download, user_upload,
    manual_entry, etc.) so that categorization stays consistent.
    """

    __tablename__ = "source_types"

    # UUID stored as a string.
    id = Column(String(36), primary_key=True)
    name = Column(String(50), nullable=False, unique=True, index=True)
    display_name = Column(String(100), nullable=False)
    description = Column(Text)
    # Icon name for the UI.
    icon = Column(String(50))

    # Timestamps
    created_at = Column(UtcDateTime, default=utcnow(), nullable=False)

    def __repr__(self):
        """Return a short debug representation showing name and display name."""
        return f"<SourceType(name='{self.name}', display='{self.display_name}')>"

134 

135 

class UploadBatch(Base):
    """
    One batch of user-uploaded files.

    Grouping uploads into batches gives traceability and allows batch
    operations over the files that arrived together.
    """

    __tablename__ = "upload_batches"

    # UUID stored as a string.
    id = Column(String(36), primary_key=True)
    collection_id = Column(
        String(36),
        ForeignKey("collections.id", ondelete="CASCADE"),
        index=True,
        nullable=False,
    )
    uploaded_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    file_count = Column(Integer, default=0)
    # Total size of the batch in bytes.
    total_size = Column(Integer, default=0)

    # Relationships (each also installs a backref on the target model).
    collection = relationship("Collection", backref="upload_batches")
    documents = relationship("Document", backref="upload_batch")

    def __repr__(self):
        """Return a short debug representation with id, file count and size."""
        return f"<UploadBatch(id='{self.id}', files={self.file_count}, size={self.total_size})>"

161 

162 

class Document(Base):
    """
    Single table holding every document, whether it came from a research
    download or from a user upload.
    """

    __tablename__ = "documents"

    # UUID stored as a string.
    id = Column(String(36), primary_key=True)

    # Source type (research_download, user_upload, etc.).
    source_type_id = Column(
        String(36), ForeignKey("source_types.id"), nullable=False, index=True
    )

    # Original research resource (research downloads only; NULL for uploads).
    resource_id = Column(
        Integer,
        ForeignKey("research_resources.id", ondelete="SET NULL"),
        nullable=True,
        index=True,
    )

    # Owning research (research downloads only; NULL for uploads).
    research_id = Column(
        String(36),
        ForeignKey("research_history.id", ondelete="CASCADE"),
        nullable=True,
        index=True,
    )

    # Upload batch (user uploads only; NULL for research downloads).
    upload_batch_id = Column(
        String(36),
        ForeignKey("upload_batches.id", ondelete="SET NULL"),
        nullable=True,
        index=True,
    )

    # --- Document identification ---
    # SHA256 used for deduplication.
    document_hash = Column(String(64), nullable=False, unique=True, index=True)
    # Source URL (for downloads).
    original_url = Column(Text, nullable=True)
    # Display name (for uploads).
    filename = Column(String(500), nullable=True)
    # Original upload name.
    original_filename = Column(String(500), nullable=True)

    # --- File information ---
    # Path relative to the library/uploads root.
    file_path = Column(Text, nullable=True)
    # Size in bytes.
    file_size = Column(Integer, nullable=False)
    # pdf, txt, md, html, etc.
    file_type = Column(String(50), nullable=False)
    mime_type = Column(String(100), nullable=True)

    # --- Content storage: text is always stored in the DB ---
    # Extracted/uploaded text content.
    text_content = Column(Text, nullable=True)

    # PDF storage mode, holding a PDFStorageMode value
    # (none, filesystem, database).
    storage_mode = Column(String(20), nullable=True, default="database")

    # --- Metadata ---
    title = Column(Text)
    # User-provided description.
    description = Column(Text)
    # List of authors (for research papers).
    authors = Column(JSON)
    # Publication date.
    published_date = Column(Date, nullable=True)

    # --- Academic identifiers (for research papers) ---
    doi = Column(String(255), nullable=True, index=True)
    arxiv_id = Column(String(100), nullable=True, index=True)
    pmid = Column(String(50), nullable=True, index=True)
    pmcid = Column(String(50), nullable=True, index=True)
    isbn = Column(String(20), nullable=True)

    # --- Download/upload information ---
    # The enum is persisted using each member's value.
    status = Column(
        Enum(
            DocumentStatus,
            values_callable=lambda enum_cls: [
                member.value for member in enum_cls
            ],
        ),
        nullable=False,
        default=DocumentStatus.COMPLETED,
    )
    attempts = Column(Integer, default=1)
    error_message = Column(Text, nullable=True)
    processed_at = Column(UtcDateTime, nullable=False, default=utcnow())
    last_accessed = Column(UtcDateTime, nullable=True)

    # --- Text extraction metadata (research downloads from PDFs) ---
    # pdf_extraction, native_api, etc.
    extraction_method = Column(String(50), nullable=True)
    # arxiv_api, pdfplumber, etc.
    extraction_source = Column(String(50), nullable=True)
    # high, medium, low
    extraction_quality = Column(String(20), nullable=True)
    has_formatting_issues = Column(Boolean, default=False)
    has_encoding_issues = Column(Boolean, default=False)
    character_count = Column(Integer, nullable=True)
    word_count = Column(Integer, nullable=True)

    # --- Organization ---
    # User-defined tags.
    tags = Column(JSON)
    # User notes.
    notes = Column(Text)
    favorite = Column(Boolean, default=False)

    # --- Timestamps ---
    created_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    updated_at = Column(
        UtcDateTime, default=utcnow(), onupdate=utcnow(), nullable=False
    )

    # --- Relationships ---
    source_type = relationship("SourceType", backref="documents")
    resource = relationship(
        "ResearchResource",
        foreign_keys="[Document.resource_id]",
        backref="documents",
    )
    research = relationship("ResearchHistory", backref="documents")
    # Association rows linking this document to collections; they are
    # deleted together with the document.
    collections = relationship(
        "DocumentCollection",
        back_populates="document",
        cascade="all, delete-orphan",
    )

    # Composite indexes for status-filtered queries.
    # NOTE(review): idx_document_hash repeats the index already requested by
    # index=True on document_hash — looks redundant, confirm before removing.
    __table_args__ = (
        Index("idx_source_type", "source_type_id", "status"),
        Index("idx_research_documents", "research_id", "status"),
        Index("idx_document_type", "file_type", "status"),
        Index("idx_document_hash", "document_hash"),
    )

    def __repr__(self):
        """
        Return a short debug representation.

        The label is the title, else the filename, else "Untitled",
        truncated to 50 characters.
        """
        title_str = (self.title or self.filename or "Untitled")[:50]
        return f"<Document(title='{title_str}', type={self.file_type}, size={self.file_size})>"

311 

312 

class DocumentBlob(Base):
    """
    Separate table for storing PDF binary content.

    SQLite best practice: keep BLOBs in their own table for better query
    performance. Stored in the encrypted SQLCipher database for security.
    """

    __tablename__ = "document_blobs"

    # Primary key references Document.id; the row is removed by the
    # database when the parent document is deleted (ON DELETE CASCADE).
    document_id = Column(
        String(36),
        ForeignKey("documents.id", ondelete="CASCADE"),
        primary_key=True,
        nullable=False,
    )

    # Binary PDF content
    pdf_binary = Column(LargeBinary, nullable=False)

    # SHA256 hash for integrity verification
    blob_hash = Column(String(64), nullable=True, index=True)

    # Timestamps
    stored_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    last_accessed = Column(UtcDateTime, nullable=True)

    # Relationship. passive_deletes=True on both sides leaves deletion of
    # dependent rows to the database-level cascade.
    # NOTE(review): the backref sets no uselist=False, so Document.blob is
    # presumably list-valued — confirm callers expect that.
    document = relationship(
        "Document",
        backref=backref("blob", passive_deletes=True),
        passive_deletes=True,
    )

    def __repr__(self):
        """
        Return a short debug representation.

        Shows the first 8 characters of the document id and the blob size.
        Safe to call on an instance whose ``document_id`` has not been
        assigned yet (previously that raised ``TypeError`` from slicing
        ``None``).
        """
        size = len(self.pdf_binary) if self.pdf_binary else 0
        if self.document_id is None:
            doc_id = "None"
        else:
            doc_id = f"{self.document_id[:8]}..."
        return f"<DocumentBlob(document_id='{doc_id}', size={size})>"

350 

351 

class Collection(Base):
    """
    A named grouping of documents.

    'Library' is the default collection for research downloads; users can
    create additional custom collections for their own organization.
    """

    __tablename__ = "collections"

    # UUID stored as a string.
    id = Column(String(36), primary_key=True)
    name = Column(String(255), nullable=False)
    description = Column(Text)

    # Collection type (default_library, user_collection, linked_folder).
    collection_type = Column(String(50), default="user_collection")

    # Whether this is the default library collection.
    is_default = Column(Boolean, default=False)

    # Embedding model used for this collection (stored when first indexed),
    # e.g. 'all-MiniLM-L6-v2', 'nomic-embed-text:latest'.
    embedding_model = Column(String(100), nullable=True)
    # The enum is persisted using each member's value.
    embedding_model_type = Column(
        Enum(
            EmbeddingProvider,
            values_callable=lambda enum_cls: [
                member.value for member in enum_cls
            ],
        ),
        nullable=True,
    )
    # Vector dimension.
    embedding_dimension = Column(Integer, nullable=True)
    # Chunk size used.
    chunk_size = Column(Integer, nullable=True)
    # Chunk overlap used.
    chunk_overlap = Column(Integer, nullable=True)

    # --- Advanced embedding configuration options (Issue #1054) ---
    # Splitter type: 'recursive', 'semantic', 'token', 'sentence'.
    splitter_type = Column(String(50), nullable=True)
    # Text separators for chunking, e.g. ["\n\n", "\n", ". ", " ", ""].
    text_separators = Column(JSON, nullable=True)
    # Distance metric: 'cosine', 'l2', 'dot_product'.
    distance_metric = Column(String(50), nullable=True)
    # Whether to normalize embeddings with L2.
    normalize_vectors = Column(Boolean, nullable=True)
    # FAISS index type: 'flat', 'hnsw', 'ivf'.
    index_type = Column(String(50), nullable=True)

    # --- Timestamps ---
    created_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    updated_at = Column(
        UtcDateTime, default=utcnow(), onupdate=utcnow(), nullable=False
    )

    # --- Relationships: both child sets are deleted with the collection ---
    document_links = relationship(
        "DocumentCollection",
        back_populates="collection",
        cascade="all, delete-orphan",
    )
    linked_folders = relationship(
        "CollectionFolder",
        back_populates="collection",
        cascade="all, delete-orphan",
    )

    def __repr__(self):
        """Return a short debug representation with id, name and type."""
        return f"<Collection(id='{self.id}', name='{self.name}', type='{self.collection_type}')>"

423 

424 

class DocumentCollection(Base):
    """
    Association table linking documents to collections (many-to-many).

    Indexing status is tracked on the link itself, because a document can
    belong to several collections and be indexed independently in each.
    """

    __tablename__ = "document_collections"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # Foreign keys; a link row is removed when either side is deleted.
    document_id = Column(
        String(36),
        ForeignKey("documents.id", ondelete="CASCADE"),
        index=True,
        nullable=False,
    )
    collection_id = Column(
        String(36),
        ForeignKey("collections.id", ondelete="CASCADE"),
        index=True,
        nullable=False,
    )

    # --- Indexing status (per collection!) ---
    # Whether the document is indexed for this collection.
    indexed = Column(Boolean, default=False)
    # Number of chunks in this collection.
    chunk_count = Column(Integer, default=0)
    last_indexed_at = Column(UtcDateTime, nullable=True)

    # Timestamps
    added_at = Column(UtcDateTime, default=utcnow(), nullable=False)

    # Relationships
    document = relationship("Document", back_populates="collections")
    collection = relationship("Collection", back_populates="document_links")

    # At most one link row per (document, collection) pair.
    __table_args__ = (
        UniqueConstraint(
            "document_id", "collection_id", name="uix_document_collection"
        ),
        Index("idx_collection_indexed", "collection_id", "indexed"),
    )

    def __repr__(self):
        """Return a short debug representation with both ids and the indexed flag."""
        return f"<DocumentCollection(doc_id={self.document_id}, coll_id={self.collection_id}, indexed={self.indexed})>"

475 

476 

class DocumentChunk(Base):
    """
    Text chunks used for RAG, regardless of which source they came from.

    Chunk text lives in the encrypted database and is used for semantic
    search.
    """

    __tablename__ = "document_chunks"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # --- Chunk identification ---
    # SHA256 used for deduplication.
    chunk_hash = Column(String(64), nullable=False, index=True)

    # --- Source tracking (points at the unified Document table) ---
    # 'document', 'folder_file'
    source_type = Column(String(20), nullable=False, index=True)
    # Document.id (UUID as string).
    source_id = Column(String(36), nullable=True, index=True)
    # File path if the source is a local collection.
    source_path = Column(Text, nullable=True)
    # collection_<uuid>
    collection_name = Column(String(100), nullable=False, index=True)

    # --- Chunk content (encrypted in the SQLCipher DB) ---
    # The actual chunk text.
    chunk_text = Column(Text, nullable=False)
    # Position in the source document.
    chunk_index = Column(Integer, nullable=False)
    # Start character position.
    start_char = Column(Integer, nullable=False)
    # End character position.
    end_char = Column(Integer, nullable=False)
    # Number of words in the chunk.
    word_count = Column(Integer, nullable=False)

    # --- Embedding metadata ---
    # UUID for FAISS vector mapping.
    embedding_id = Column(String(36), nullable=False, unique=True, index=True)
    # e.g. 'all-MiniLM-L6-v2'
    embedding_model = Column(String(100), nullable=False)
    # The enum is persisted using each member's value.
    embedding_model_type = Column(
        Enum(
            EmbeddingProvider,
            values_callable=lambda enum_cls: [
                member.value for member in enum_cls
            ],
        ),
        nullable=False,
    )
    # Vector dimension.
    embedding_dimension = Column(Integer, nullable=True)

    # --- Document metadata (for context) ---
    # Title of the source document.
    document_title = Column(Text, nullable=True)
    # Additional metadata from the source.
    document_metadata = Column(JSON, nullable=True)

    # --- Timestamps ---
    created_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    last_accessed = Column(UtcDateTime, nullable=True)

    # A given chunk hash may appear only once per collection.
    # NOTE(review): idx_chunk_embedding repeats the index already requested
    # by index=True on embedding_id — looks redundant, confirm before removing.
    __table_args__ = (
        UniqueConstraint(
            "chunk_hash", "collection_name", name="uix_chunk_collection"
        ),
        Index("idx_chunk_source", "source_type", "source_id"),
        Index("idx_chunk_collection", "collection_name", "created_at"),
        Index("idx_chunk_embedding", "embedding_id"),
    )

    def __repr__(self):
        """Return a short debug representation of the chunk's origin and size."""
        return f"<DocumentChunk(collection='{self.collection_name}', source_type='{self.source_type}', index={self.chunk_index}, words={self.word_count})>"

551 

552 

class DownloadQueue(Base):
    """
    Pending document downloads waiting to be processed.

    Renamed from LibraryDownloadQueue for consistency.
    """

    __tablename__ = "download_queue"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # --- What to download ---
    # unique=True: one queue entry per resource.
    resource_id = Column(
        Integer,
        ForeignKey("research_resources.id", ondelete="CASCADE"),
        nullable=False,
        unique=True,
    )
    research_id = Column(String(36), nullable=False, index=True)

    # Target collection (defaults to the Library collection).
    collection_id = Column(
        String(36),
        ForeignKey("collections.id", ondelete="SET NULL"),
        nullable=True,
        index=True,
    )

    # --- Queue management ---
    # Higher = more important.
    priority = Column(Integer, default=0)
    # The enum is persisted using each member's value.
    status = Column(
        Enum(
            DocumentStatus,
            values_callable=lambda enum_cls: [
                member.value for member in enum_cls
            ],
        ),
        nullable=False,
        default=DocumentStatus.PENDING,
    )
    attempts = Column(Integer, default=0)
    max_attempts = Column(Integer, default=3)

    # --- Error tracking ---
    last_error = Column(Text, nullable=True)
    last_attempt_at = Column(UtcDateTime, nullable=True)

    # --- Timestamps ---
    queued_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    completed_at = Column(UtcDateTime, nullable=True)

    # --- Relationships (each also installs a backref on the target) ---
    resource = relationship("ResearchResource", backref="download_queue")
    collection = relationship("Collection", backref="download_queue_items")

    def __repr__(self):
        """Return a short debug representation with resource, status and attempts."""
        return f"<DownloadQueue(resource_id={self.resource_id}, status={self.status}, attempts={self.attempts})>"

606 

607 

class LibraryStatistics(Base):
    """
    Aggregated library statistics.

    Recomputed periodically so the dashboard can display them.
    """

    __tablename__ = "library_statistics"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # --- Document counts ---
    total_documents = Column(Integer, default=0)
    total_pdfs = Column(Integer, default=0)
    total_html = Column(Integer, default=0)
    total_other = Column(Integer, default=0)

    # --- Storage metrics ---
    total_size_bytes = Column(Integer, default=0)
    average_document_size = Column(Integer, default=0)

    # --- Research metrics ---
    total_researches_with_downloads = Column(Integer, default=0)
    average_documents_per_research = Column(Integer, default=0)

    # --- Download metrics ---
    total_download_attempts = Column(Integer, default=0)
    successful_downloads = Column(Integer, default=0)
    failed_downloads = Column(Integer, default=0)
    pending_downloads = Column(Integer, default=0)

    # --- Academic sources breakdown ---
    arxiv_count = Column(Integer, default=0)
    pubmed_count = Column(Integer, default=0)
    doi_count = Column(Integer, default=0)
    other_count = Column(Integer, default=0)

    # --- Timestamps ---
    calculated_at = Column(UtcDateTime, default=utcnow(), nullable=False)

    def __repr__(self):
        """Return a short debug representation with document count and total size."""
        return f"<LibraryStatistics(documents={self.total_documents}, size={self.total_size_bytes})>"

649 

650 

class RAGIndex(Base):
    """
    Bookkeeping for the FAISS indices behind RAG collections.

    Every collection + embedding model combination gets its own FAISS index.
    """

    __tablename__ = "rag_indices"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # --- Collection and model identification ---
    # 'collection_<uuid>'
    collection_name = Column(String(100), nullable=False, index=True)
    # e.g. 'all-MiniLM-L6-v2'
    embedding_model = Column(String(100), nullable=False)
    # The enum is persisted using each member's value.
    embedding_model_type = Column(
        Enum(
            EmbeddingProvider,
            values_callable=lambda enum_cls: [
                member.value for member in enum_cls
            ],
        ),
        nullable=False,
    )
    # Vector dimension.
    embedding_dimension = Column(Integer, nullable=False)

    # --- Index file location ---
    # Path to the .faiss file.
    index_path = Column(Text, nullable=False)
    # SHA256 of collection+model, for uniqueness.
    index_hash = Column(String(64), nullable=False, unique=True, index=True)

    # --- Chunking parameters used ---
    chunk_size = Column(Integer, nullable=False)
    chunk_overlap = Column(Integer, nullable=False)

    # --- Advanced embedding configuration options (Issue #1054) ---
    # Splitter type: 'recursive', 'semantic', 'token', 'sentence'.
    splitter_type = Column(String(50), nullable=True)
    # Text separators for chunking, e.g. ["\n\n", "\n", ". ", " ", ""].
    text_separators = Column(JSON, nullable=True)
    # Distance metric: 'cosine', 'l2', 'dot_product'.
    distance_metric = Column(String(50), nullable=True)
    # Whether to normalize embeddings with L2.
    normalize_vectors = Column(Boolean, nullable=True)
    # FAISS index type: 'flat', 'hnsw', 'ivf'.
    index_type = Column(String(50), nullable=True)

    # --- Index statistics ---
    # Number of chunks in this index.
    chunk_count = Column(Integer, default=0)
    # Number of source documents.
    total_documents = Column(Integer, default=0)

    # --- Status ---
    status = Column(
        Enum(
            RAGIndexStatus,
            values_callable=lambda enum_cls: [
                member.value for member in enum_cls
            ],
        ),
        nullable=False,
        default=RAGIndexStatus.ACTIVE,
    )
    # Whether this is the current index for its collection.
    is_current = Column(Boolean, default=True)

    # --- Timestamps ---
    created_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    last_updated_at = Column(
        UtcDateTime, default=utcnow(), onupdate=utcnow(), nullable=False
    )
    # Last time the index was searched.
    last_used_at = Column(UtcDateTime, nullable=True)

    # One row per collection + model + provider. The constraint does not
    # involve `status`, so it applies to rows in any status, not only
    # active ones.
    __table_args__ = (
        UniqueConstraint(
            "collection_name",
            "embedding_model",
            "embedding_model_type",
            name="uix_collection_model",
        ),
        Index("idx_collection_current", "collection_name", "is_current"),
    )

    def __repr__(self):
        """Return a short debug representation with collection, model and chunk count."""
        return f"<RAGIndex(collection='{self.collection_name}', model='{self.embedding_model}', chunks={self.chunk_count})>"

742 

743 

class RagDocumentStatus(Base):
    """
    Tracks which documents have been indexed for RAG.

    Row existence = document is indexed. No row = not indexed.
    Simple and avoids ORM caching issues.
    """

    __tablename__ = "rag_document_status"

    # Composite primary key: (document_id, collection_id). Rows are removed
    # by the database when either parent is deleted (ON DELETE CASCADE).
    document_id = Column(
        String(36),
        ForeignKey("documents.id", ondelete="CASCADE"),
        primary_key=True,
        nullable=False,
    )
    collection_id = Column(
        String(36),
        ForeignKey("collections.id", ondelete="CASCADE"),
        primary_key=True,
        nullable=False,
    )

    # Which RAG index was used (tracks the embedding model indirectly)
    rag_index_id = Column(
        Integer,
        ForeignKey("rag_indices.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )

    # Metadata
    chunk_count = Column(Integer, nullable=False)
    indexed_at = Column(UtcDateTime, nullable=False, default=utcnow())

    # Indexes for fast lookups.
    # NOTE(review): idx_rag_status_index repeats the index already requested
    # by index=True on rag_index_id — looks redundant, confirm before removing.
    __table_args__ = (
        Index("idx_rag_status_collection", "collection_id"),
        Index("idx_rag_status_index", "rag_index_id"),
    )

    def __repr__(self):
        """
        Return a short debug representation.

        Shows the first 8 characters of each id plus the chunk count. Safe
        to call on an instance whose ids have not been assigned yet
        (previously that raised ``TypeError`` from slicing ``None``).
        """
        if self.document_id is None:
            doc = "None"
        else:
            doc = f"{self.document_id[:8]}..."
        if self.collection_id is None:
            coll = "None"
        else:
            coll = f"{self.collection_id[:8]}..."
        return f"<RagDocumentStatus(doc='{doc}', coll='{coll}', chunks={self.chunk_count})>"

787 

788 

class CollectionFolder(Base):
    """
    Local folders linked to a collection for indexing.
    """

    __tablename__ = "collection_folders"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # Collection association; the row is removed by the database when the
    # collection is deleted (ON DELETE CASCADE).
    collection_id = Column(
        String(36),
        ForeignKey("collections.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )

    # Folder configuration
    folder_path = Column(Text, nullable=False)  # Absolute path to folder
    # File patterns to include. The default is a callable so that every row
    # receives its own fresh list; a bare list literal would be one shared
    # mutable object reused as the default for all rows.
    include_patterns = Column(
        JSON, default=lambda: ["*.pdf", "*.txt", "*.md", "*.html"]
    )
    exclude_patterns = Column(
        JSON
    )  # Patterns to exclude (e.g., ["**/node_modules/**"])
    recursive = Column(Boolean, default=True)  # Search subfolders

    # Monitoring
    watch_enabled = Column(
        Boolean, default=False
    )  # Auto-reindex on changes (future)
    last_scanned_at = Column(UtcDateTime, nullable=True)
    file_count = Column(Integer, default=0)  # Total files found
    indexed_file_count = Column(Integer, default=0)  # Files indexed

    # Timestamps
    created_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    updated_at = Column(
        UtcDateTime, default=utcnow(), onupdate=utcnow(), nullable=False
    )

    # Relationships; tracked files are deleted together with the folder.
    collection = relationship("Collection", back_populates="linked_folders")
    files = relationship(
        "CollectionFolderFile",
        back_populates="folder",
        cascade="all, delete-orphan",
    )

    def __repr__(self):
        """Return a short debug representation with the folder path and file count."""
        return f"<CollectionFolder(path='{self.folder_path}', files={self.file_count})>"

840 

841 

class CollectionFolderFile(Base):
    """
    A file discovered inside a linked folder.

    Kept lightweight: only what is needed for deduplication and for
    tracking indexing status.
    """

    __tablename__ = "collection_folder_files"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # Owning folder; the row is removed when the folder is deleted.
    folder_id = Column(
        Integer,
        ForeignKey("collection_folders.id", ondelete="CASCADE"),
        index=True,
        nullable=False,
    )

    # --- File identification ---
    # Path relative to the folder's folder_path.
    relative_path = Column(Text, nullable=False)
    # SHA256 used for deduplication.
    file_hash = Column(String(64), index=True)
    # Size in bytes.
    file_size = Column(Integer)
    # File extension.
    file_type = Column(String(50))

    # --- File metadata ---
    # File modification time.
    last_modified = Column(UtcDateTime)

    # --- Indexing status ---
    indexed = Column(Boolean, default=False)
    chunk_count = Column(Integer, default=0)
    last_indexed_at = Column(UtcDateTime, nullable=True)
    # Error text if indexing failed.
    index_error = Column(Text, nullable=True)

    # --- Timestamps ---
    discovered_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    updated_at = Column(
        UtcDateTime, default=utcnow(), onupdate=utcnow(), nullable=False
    )

    # --- Relationships ---
    folder = relationship("CollectionFolder", back_populates="files")

    # At most one row per (folder, relative path).
    __table_args__ = (
        UniqueConstraint("folder_id", "relative_path", name="uix_folder_file"),
        Index("idx_folder_indexed", "folder_id", "indexed"),
    )

    def __repr__(self):
        """Return a short debug representation with the relative path and indexed flag."""
        return f"<CollectionFolderFile(path='{self.relative_path}', indexed={self.indexed})>"