Coverage for src / local_deep_research / database / models / library.py: 95%

306 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

1""" 

2Library and document models - Unified architecture. 

3All documents (research downloads and user uploads) are stored in one table. 

4Collections organize documents, with "Library" as the default collection. 

5""" 

6 

7import enum 

8 

9from sqlalchemy import ( 

10 JSON, 

11 Boolean, 

12 Column, 

13 Date, 

14 Enum, 

15 ForeignKey, 

16 Index, 

17 Integer, 

18 LargeBinary, 

19 String, 

20 Text, 

21 UniqueConstraint, 

22) 

23from sqlalchemy.orm import backref, relationship 

24from sqlalchemy_utc import UtcDateTime, utcnow 

25 

26from .base import Base 

27 

28 

class RAGIndexStatus(enum.Enum):
    """Allowed values for the ``status`` column of a RAG index row.

    The lowercase string on the right-hand side is the member's ``value``;
    the Enum columns in this module are built with a ``values_callable`` that
    yields ``e.value`` for each member.
    """

    ACTIVE = "active"
    REBUILDING = "rebuilding"
    DEPRECATED = "deprecated"

35 

36 

class DocumentStatus(enum.Enum):
    """Processing/download states shared by documents and queue entries.

    Used as the Enum type of the ``status`` columns on ``Document`` and
    ``DownloadQueue`` in this module.
    """

    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"

44 

45 

class EmbeddingProvider(enum.Enum):
    """Kinds of embedding model provider.

    Used as the Enum type of the ``embedding_model_type`` columns in this
    module.
    """

    SENTENCE_TRANSFORMERS = "sentence_transformers"
    OLLAMA = "ollama"

51 

52 

class ExtractionMethod(str, enum.Enum):
    """How the text of a document was extracted.

    Members also subclass ``str``, so each one compares equal to its plain
    string value.
    """

    PDF_EXTRACTION = "pdf_extraction"
    NATIVE_API = "native_api"
    UNKNOWN = "unknown"

59 

60 

class ExtractionSource(str, enum.Enum):
    """Where extracted text came from.

    Members also subclass ``str``, so each one compares equal to its plain
    string value.
    """

    ARXIV_API = "arxiv_api"
    PUBMED_API = "pubmed_api"
    PDFPLUMBER = "pdfplumber"
    PDFPLUMBER_FALLBACK = "pdfplumber_fallback"
    LOCAL_PDF = "local_pdf"
    LEGACY_FILE = "legacy_file"

70 

71 

class ExtractionQuality(str, enum.Enum):
    """Quality grade assigned to extracted text.

    Members also subclass ``str``, so each one compares equal to its plain
    string value.
    """

    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"

78 

79 

class DistanceMetric(str, enum.Enum):
    """Vector distance metrics available for similarity search.

    Members also subclass ``str``, so each one compares equal to its plain
    string value.
    """

    COSINE = "cosine"
    L2 = "l2"
    DOT_PRODUCT = "dot_product"

86 

87 

class IndexType(str, enum.Enum):
    """FAISS index layouts selectable for RAG.

    Members also subclass ``str``, so each one compares equal to its plain
    string value.
    """

    FLAT = "flat"
    HNSW = "hnsw"
    IVF = "ivf"

94 

95 

class SplitterType(str, enum.Enum):
    """Text splitter strategies used when chunking documents.

    Members also subclass ``str``, so each one compares equal to its plain
    string value.
    """

    RECURSIVE = "recursive"
    SEMANTIC = "semantic"
    TOKEN = "token"
    SENTENCE = "sentence"

103 

104 

class PDFStorageMode(str, enum.Enum):
    """Where (if anywhere) the PDF binary of a document is kept.

    Members also subclass ``str``, so each one compares equal to its plain
    string value.
    """

    # Don't store PDFs, text-only
    NONE = "none"
    # Store PDFs unencrypted on filesystem
    FILESYSTEM = "filesystem"
    # Store PDFs encrypted in database
    DATABASE = "database"

111 

112 

class SourceType(Base):
    """
    Normalized lookup table of document source types.

    Each row names one category (research_download, user_upload,
    manual_entry, etc.) so documents are categorized consistently.
    """

    __tablename__ = "source_types"

    # UUID
    id = Column(String(36), primary_key=True)
    name = Column(String(50), nullable=False, unique=True, index=True)
    display_name = Column(String(100), nullable=False)
    description = Column(Text)
    # Icon name for UI
    icon = Column(String(50))

    # Timestamps
    created_at = Column(UtcDateTime, default=utcnow(), nullable=False)

    def __repr__(self):
        """Return a short debug string with the name and display name."""
        text = f"<SourceType(name='{self.name}', display='{self.display_name}')>"
        return text

134 

135 

class UploadBatch(Base):
    """
    A batch of user-uploaded files.

    Uploads are grouped under one batch row for traceability and so that
    batch operations can address them together.
    """

    __tablename__ = "upload_batches"

    # UUID
    id = Column(String(36), primary_key=True)
    collection_id = Column(
        String(36),
        ForeignKey("collections.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )
    uploaded_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    file_count = Column(Integer, default=0)
    # Total bytes
    total_size = Column(Integer, default=0)

    # Relationships (each backref adds the reverse attribute on the target)
    collection = relationship("Collection", backref="upload_batches")
    documents = relationship("Document", backref="upload_batch")

    def __repr__(self):
        """Return a short debug string with id, file count and total size."""
        text = f"<UploadBatch(id='{self.id}', files={self.file_count}, size={self.total_size})>"
        return text

161 

162 

class Document(Base):
    """
    Single table holding every document, whether it was downloaded by a
    research run or uploaded by the user.
    """

    __tablename__ = "documents"

    # UUID as string
    id = Column(String(36), primary_key=True)

    # Source type (research_download, user_upload, etc.)
    source_type_id = Column(
        String(36),
        ForeignKey("source_types.id"),
        nullable=False,
        index=True,
    )

    # Link to original research resource (for research downloads) - nullable for uploads
    resource_id = Column(
        Integer,
        ForeignKey("research_resources.id", ondelete="CASCADE"),
        nullable=True,
        index=True,
    )

    # Link to research (for research downloads) - nullable for uploads
    research_id = Column(
        String(36),
        ForeignKey("research_history.id", ondelete="CASCADE"),
        nullable=True,
        index=True,
    )

    # Link to upload batch (for user uploads) - nullable for research downloads
    upload_batch_id = Column(
        String(36),
        ForeignKey("upload_batches.id", ondelete="SET NULL"),
        nullable=True,
        index=True,
    )

    # --- Document identification ---
    # SHA256 for deduplication
    document_hash = Column(String(64), nullable=False, unique=True, index=True)
    # Source URL (for downloads)
    original_url = Column(Text, nullable=True)
    # Display name (for uploads)
    filename = Column(String(500), nullable=True)
    # Original upload name
    original_filename = Column(String(500), nullable=True)

    # --- File information ---
    # Path relative to library/uploads root
    file_path = Column(Text, nullable=True)
    # Size in bytes
    file_size = Column(Integer, nullable=False)
    # pdf, txt, md, html, etc.
    file_type = Column(String(50), nullable=False)
    # MIME type
    mime_type = Column(String(100), nullable=True)

    # --- Content storage - text always stored in DB ---
    # Extracted/uploaded text content
    text_content = Column(Text, nullable=True)

    # PDF storage mode (none, filesystem, database); holds a PDFStorageMode value
    storage_mode = Column(String(20), nullable=True, default="database")

    # --- Metadata ---
    # Document title
    title = Column(Text)
    # User description
    description = Column(Text)
    # List of authors (for research papers)
    authors = Column(JSON)
    # Publication date
    published_date = Column(Date, nullable=True)

    # --- Academic identifiers (for research papers) ---
    doi = Column(String(255), nullable=True, index=True)
    arxiv_id = Column(String(100), nullable=True, index=True)
    pmid = Column(String(50), nullable=True, index=True)
    pmcid = Column(String(50), nullable=True, index=True)
    isbn = Column(String(20), nullable=True)

    # --- Download/Upload information ---
    status = Column(
        # Persist each member's ``value`` rather than its name.
        Enum(
            DocumentStatus,
            values_callable=lambda members: [member.value for member in members],
        ),
        nullable=False,
        default=DocumentStatus.COMPLETED,
    )
    attempts = Column(Integer, default=1)
    error_message = Column(Text, nullable=True)
    processed_at = Column(UtcDateTime, nullable=False, default=utcnow())
    last_accessed = Column(UtcDateTime, nullable=True)

    # --- Text extraction metadata (for research downloads from PDFs) ---
    # pdf_extraction, native_api, etc.
    extraction_method = Column(String(50), nullable=True)
    # arxiv_api, pdfplumber, etc.
    extraction_source = Column(String(50), nullable=True)
    # high, medium, low
    extraction_quality = Column(String(20), nullable=True)
    has_formatting_issues = Column(Boolean, default=False)
    has_encoding_issues = Column(Boolean, default=False)
    character_count = Column(Integer, nullable=True)
    word_count = Column(Integer, nullable=True)

    # --- Organization ---
    # User-defined tags
    tags = Column(JSON)
    # User notes
    notes = Column(Text)
    favorite = Column(Boolean, default=False)

    # --- Timestamps ---
    created_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    updated_at = Column(
        UtcDateTime, default=utcnow(), onupdate=utcnow(), nullable=False
    )

    # --- Relationships ---
    source_type = relationship("SourceType", backref="documents")
    resource = relationship("ResearchResource", backref="documents")
    research = relationship("ResearchHistory", backref="documents")
    # Association rows linking this document to collections.
    collections = relationship(
        "DocumentCollection",
        back_populates="document",
        cascade="all, delete-orphan",
    )

    # Indexes for efficient queries
    # NOTE(review): document_hash already carries unique=True/index=True on the
    # column, so idx_document_hash looks redundant - confirm before dropping
    # it, since removing it changes the schema.
    __table_args__ = (
        Index("idx_source_type", "source_type_id", "status"),
        Index("idx_research_documents", "research_id", "status"),
        Index("idx_document_type", "file_type", "status"),
        Index("idx_document_hash", "document_hash"),
    )

    def __repr__(self):
        """Return a debug string: title (or filename) cut to 50 chars, type, size."""
        # First non-empty of title / filename, else the literal "Untitled".
        title_str = (self.title or self.filename or "Untitled")[:50]
        return f"<Document(title='{title_str}', type={self.file_type}, size={self.file_size})>"

307 

308 

class DocumentBlob(Base):
    """
    Separate table for storing PDF binary content.
    SQLite best practices: keep BLOBs in separate table for better query performance.
    Stored in encrypted SQLCipher database for security.
    """

    __tablename__ = "document_blobs"

    # Primary key references Document.id
    document_id = Column(
        String(36),
        ForeignKey("documents.id", ondelete="CASCADE"),
        primary_key=True,
        nullable=False,
    )

    # Binary PDF content
    pdf_binary = Column(LargeBinary, nullable=False)

    # Hash for integrity verification (SHA256)
    blob_hash = Column(String(64), nullable=True, index=True)

    # Timestamps
    stored_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    last_accessed = Column(UtcDateTime, nullable=True)

    # Relationship
    # passive_deletes=True on both sides leaves row removal to the FK's
    # ON DELETE CASCADE.
    # NOTE(review): the "blob" backref does not set uselist=False, so
    # Document.blob is a list even though document_id is the primary key
    # here - confirm whether callers rely on that.
    document = relationship(
        "Document",
        backref=backref("blob", passive_deletes=True),
        passive_deletes=True,
    )

    def __repr__(self):
        """Return a debug string with a shortened document id and blob size.

        Safe to call on an instance whose ``document_id`` has not been set
        yet: the id is only sliced when present, so ``repr()`` never raises.
        """
        size = len(self.pdf_binary) if self.pdf_binary else 0
        # Slicing None would raise TypeError; fall back to None for unset ids.
        short_id = self.document_id[:8] if self.document_id else None
        return f"<DocumentBlob(document_id='{short_id}...', size={size})>"

346 

347 

class Collection(Base):
    """
    A named grouping of documents.

    'Library' is the default collection for research downloads; users can
    create further collections of their own for organization.
    """

    __tablename__ = "collections"

    # UUID as string
    id = Column(String(36), primary_key=True)
    name = Column(String(255), nullable=False)
    description = Column(Text)

    # Collection type (default_library, user_collection, linked_folder)
    collection_type = Column(String(50), default="user_collection")

    # Is this the default library collection?
    is_default = Column(Boolean, default=False)

    # --- Embedding model used for this collection (stored when first indexed) ---
    # e.g., 'all-MiniLM-L6-v2', 'nomic-embed-text:latest'
    embedding_model = Column(String(100), nullable=True)
    embedding_model_type = Column(
        # Persist each member's ``value`` rather than its name.
        Enum(
            EmbeddingProvider,
            values_callable=lambda members: [member.value for member in members],
        ),
        nullable=True,
    )
    # Vector dimension
    embedding_dimension = Column(Integer, nullable=True)
    # Chunk size used
    chunk_size = Column(Integer, nullable=True)
    # Chunk overlap used
    chunk_overlap = Column(Integer, nullable=True)

    # --- Advanced embedding configuration options (Issue #1054) ---
    # Splitter type: 'recursive', 'semantic', 'token', 'sentence'
    splitter_type = Column(String(50), nullable=True)
    # Text separators for chunking, e.g., ["\n\n", "\n", ". ", " ", ""]
    text_separators = Column(JSON, nullable=True)
    # Distance metric: 'cosine', 'l2', 'dot_product'
    distance_metric = Column(String(50), nullable=True)
    # Whether to normalize embeddings with L2
    normalize_vectors = Column(Boolean, nullable=True)
    # FAISS index type: 'flat', 'hnsw', 'ivf'
    index_type = Column(String(50), nullable=True)

    # --- Timestamps ---
    created_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    updated_at = Column(
        UtcDateTime, default=utcnow(), onupdate=utcnow(), nullable=False
    )

    # --- Relationships ---
    # Association rows linking documents to this collection.
    document_links = relationship(
        "DocumentCollection",
        back_populates="collection",
        cascade="all, delete-orphan",
    )
    linked_folders = relationship(
        "CollectionFolder",
        back_populates="collection",
        cascade="all, delete-orphan",
    )

    def __repr__(self):
        """Return a short debug string with id, name and collection type."""
        text = f"<Collection(id='{self.id}', name='{self.name}', type='{self.collection_type}')>"
        return text

419 

420 

class DocumentCollection(Base):
    """
    Association table between documents and collections (many-to-many).

    Indexing status is tracked on this row, i.e. per collection, because a
    document can belong to several collections.
    """

    __tablename__ = "document_collections"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # --- Foreign keys ---
    document_id = Column(
        String(36),
        ForeignKey("documents.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )
    collection_id = Column(
        String(36),
        ForeignKey("collections.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )

    # --- Indexing status (per collection!) ---
    # Whether indexed for this collection
    indexed = Column(Boolean, default=False)
    # Number of chunks in this collection
    chunk_count = Column(Integer, default=0)
    last_indexed_at = Column(UtcDateTime, nullable=True)

    # --- Timestamps ---
    added_at = Column(UtcDateTime, default=utcnow(), nullable=False)

    # --- Relationships ---
    document = relationship("Document", back_populates="collections")
    collection = relationship("Collection", back_populates="document_links")

    # Ensure one entry per document-collection pair
    __table_args__ = (
        UniqueConstraint(
            "document_id", "collection_id", name="uix_document_collection"
        ),
        Index("idx_collection_indexed", "collection_id", "indexed"),
    )

    def __repr__(self):
        """Return a short debug string with both ids and the indexed flag."""
        text = f"<DocumentCollection(doc_id={self.document_id}, coll_id={self.collection_id}, indexed={self.indexed})>"
        return text

471 

472 

class DocumentChunk(Base):
    """
    Chunk storage for RAG, shared by all sources.

    Text chunks are kept in the encrypted database for semantic search.
    """

    __tablename__ = "document_chunks"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # --- Chunk identification ---
    # SHA256 for deduplication
    chunk_hash = Column(String(64), nullable=False, index=True)

    # --- Source tracking - now points to unified Document table ---
    # 'document', 'folder_file'
    source_type = Column(String(20), nullable=False, index=True)
    # Document.id (UUID as string)
    source_id = Column(String(36), nullable=True, index=True)
    # File path if local collection source
    source_path = Column(Text, nullable=True)
    # collection_<uuid>
    collection_name = Column(String(100), nullable=False, index=True)

    # --- Chunk content (encrypted in SQLCipher DB) ---
    # The actual chunk text
    chunk_text = Column(Text, nullable=False)
    # Position in source document
    chunk_index = Column(Integer, nullable=False)
    # Start character position
    start_char = Column(Integer, nullable=False)
    # End character position
    end_char = Column(Integer, nullable=False)
    # Number of words in chunk
    word_count = Column(Integer, nullable=False)

    # --- Embedding metadata ---
    # UUID for FAISS vector mapping
    embedding_id = Column(String(36), nullable=False, unique=True, index=True)
    # e.g., 'all-MiniLM-L6-v2'
    embedding_model = Column(String(100), nullable=False)
    embedding_model_type = Column(
        # Persist each member's ``value`` rather than its name.
        Enum(
            EmbeddingProvider,
            values_callable=lambda members: [member.value for member in members],
        ),
        nullable=False,
    )
    # Vector dimension
    embedding_dimension = Column(Integer, nullable=True)

    # --- Document metadata (for context) ---
    # Title of source document
    document_title = Column(Text, nullable=True)
    # Additional metadata from source
    document_metadata = Column(JSON, nullable=True)

    # --- Timestamps ---
    created_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    last_accessed = Column(UtcDateTime, nullable=True)

    # Indexes for efficient queries
    # NOTE(review): embedding_id already carries unique=True/index=True on the
    # column, so idx_chunk_embedding looks redundant - confirm before dropping
    # it, since removing it changes the schema.
    __table_args__ = (
        UniqueConstraint(
            "chunk_hash", "collection_name", name="uix_chunk_collection"
        ),
        Index("idx_chunk_source", "source_type", "source_id"),
        Index("idx_chunk_collection", "collection_name", "created_at"),
        Index("idx_chunk_embedding", "embedding_id"),
    )

    def __repr__(self):
        """Return a debug string with collection, source type, index and word count."""
        text = f"<DocumentChunk(collection='{self.collection_name}', source_type='{self.source_type}', index={self.chunk_index}, words={self.word_count})>"
        return text

547 

548 

class DownloadQueue(Base):
    """
    Pending document downloads.

    Renamed from LibraryDownloadQueue for consistency.
    """

    __tablename__ = "download_queue"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # --- What to download ---
    resource_id = Column(
        Integer,
        ForeignKey("research_resources.id", ondelete="CASCADE"),
        nullable=False,
        unique=True,  # One queue entry per resource
    )
    # NOTE(review): unlike Document.research_id this column declares no
    # ForeignKey - confirm that is intentional.
    research_id = Column(String(36), nullable=False, index=True)

    # Target collection (defaults to Library collection)
    collection_id = Column(
        String(36),
        ForeignKey("collections.id", ondelete="SET NULL"),
        nullable=True,
        index=True,
    )

    # --- Queue management ---
    # Higher = more important
    priority = Column(Integer, default=0)
    status = Column(
        # Persist each member's ``value`` rather than its name.
        Enum(
            DocumentStatus,
            values_callable=lambda members: [member.value for member in members],
        ),
        nullable=False,
        default=DocumentStatus.PENDING,
    )
    attempts = Column(Integer, default=0)
    max_attempts = Column(Integer, default=3)

    # --- Error tracking ---
    last_error = Column(Text, nullable=True)
    last_attempt_at = Column(UtcDateTime, nullable=True)

    # --- Timestamps ---
    queued_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    completed_at = Column(UtcDateTime, nullable=True)

    # --- Relationships ---
    resource = relationship("ResearchResource", backref="download_queue")
    collection = relationship("Collection", backref="download_queue_items")

    def __repr__(self):
        """Return a short debug string with resource id, status and attempts."""
        text = f"<DownloadQueue(resource_id={self.resource_id}, status={self.status}, attempts={self.attempts})>"
        return text

602 

603 

class LibraryStatistics(Base):
    """
    Aggregate library statistics.

    Rows are refreshed periodically and read for dashboard display.
    """

    __tablename__ = "library_statistics"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # --- Document counts ---
    total_documents = Column(Integer, default=0)
    total_pdfs = Column(Integer, default=0)
    total_html = Column(Integer, default=0)
    total_other = Column(Integer, default=0)

    # --- Storage metrics ---
    total_size_bytes = Column(Integer, default=0)
    average_document_size = Column(Integer, default=0)

    # --- Research metrics ---
    total_researches_with_downloads = Column(Integer, default=0)
    average_documents_per_research = Column(Integer, default=0)

    # --- Download metrics ---
    total_download_attempts = Column(Integer, default=0)
    successful_downloads = Column(Integer, default=0)
    failed_downloads = Column(Integer, default=0)
    pending_downloads = Column(Integer, default=0)

    # --- Academic sources breakdown ---
    arxiv_count = Column(Integer, default=0)
    pubmed_count = Column(Integer, default=0)
    doi_count = Column(Integer, default=0)
    other_count = Column(Integer, default=0)

    # --- Timestamps ---
    calculated_at = Column(UtcDateTime, default=utcnow(), nullable=False)

    def __repr__(self):
        """Return a short debug string with document count and total size."""
        text = f"<LibraryStatistics(documents={self.total_documents}, size={self.total_size_bytes})>"
        return text

645 

646 

class RAGIndex(Base):
    """
    Bookkeeping for the FAISS indices behind RAG collections.

    Every collection+embedding_model combination gets its own FAISS index.
    """

    __tablename__ = "rag_indices"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # --- Collection and model identification ---
    # 'collection_<uuid>'
    collection_name = Column(String(100), nullable=False, index=True)
    # e.g., 'all-MiniLM-L6-v2'
    embedding_model = Column(String(100), nullable=False)
    embedding_model_type = Column(
        # Persist each member's ``value`` rather than its name.
        Enum(
            EmbeddingProvider,
            values_callable=lambda members: [member.value for member in members],
        ),
        nullable=False,
    )
    # Vector dimension
    embedding_dimension = Column(Integer, nullable=False)

    # --- Index file location ---
    # Path to .faiss file
    index_path = Column(Text, nullable=False)
    # SHA256 of collection+model for uniqueness
    index_hash = Column(String(64), nullable=False, unique=True, index=True)

    # --- Chunking parameters used ---
    chunk_size = Column(Integer, nullable=False)
    chunk_overlap = Column(Integer, nullable=False)

    # --- Advanced embedding configuration options (Issue #1054) ---
    # Splitter type: 'recursive', 'semantic', 'token', 'sentence'
    splitter_type = Column(String(50), nullable=True)
    # Text separators for chunking, e.g., ["\n\n", "\n", ". ", " ", ""]
    text_separators = Column(JSON, nullable=True)
    # Distance metric: 'cosine', 'l2', 'dot_product'
    distance_metric = Column(String(50), nullable=True)
    # Whether to normalize embeddings with L2
    normalize_vectors = Column(Boolean, nullable=True)
    # FAISS index type: 'flat', 'hnsw', 'ivf'
    index_type = Column(String(50), nullable=True)

    # --- Index statistics ---
    # Number of chunks in this index
    chunk_count = Column(Integer, default=0)
    # Number of source documents
    total_documents = Column(Integer, default=0)

    # --- Status ---
    status = Column(
        # Persist each member's ``value`` rather than its name.
        Enum(
            RAGIndexStatus,
            values_callable=lambda members: [member.value for member in members],
        ),
        nullable=False,
        default=RAGIndexStatus.ACTIVE,
    )
    # Whether this is the current index for this collection
    is_current = Column(Boolean, default=True)

    # --- Timestamps ---
    created_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    last_updated_at = Column(
        UtcDateTime, default=utcnow(), onupdate=utcnow(), nullable=False
    )
    # Last time index was searched
    last_used_at = Column(UtcDateTime, nullable=True)

    # At most one row per (collection_name, embedding_model,
    # embedding_model_type); the constraint does not look at status.
    __table_args__ = (
        UniqueConstraint(
            "collection_name",
            "embedding_model",
            "embedding_model_type",
            name="uix_collection_model",
        ),
        Index("idx_collection_current", "collection_name", "is_current"),
    )

    def __repr__(self):
        """Return a short debug string with collection, model and chunk count."""
        text = f"<RAGIndex(collection='{self.collection_name}', model='{self.embedding_model}', chunks={self.chunk_count})>"
        return text

738 

739 

class RagDocumentStatus(Base):
    """
    Tracks which documents have been indexed for RAG.
    Row existence = document is indexed. No row = not indexed.
    Simple and avoids ORM caching issues.
    """

    __tablename__ = "rag_document_status"

    # Composite primary key: (document_id, collection_id)
    document_id = Column(
        String(36),
        ForeignKey("documents.id", ondelete="CASCADE"),
        primary_key=True,
        nullable=False,
    )
    collection_id = Column(
        String(36),
        ForeignKey("collections.id", ondelete="CASCADE"),
        primary_key=True,
        nullable=False,
    )

    # Which RAG index was used (tracks embedding model indirectly)
    rag_index_id = Column(
        Integer,
        ForeignKey("rag_indices.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )

    # Metadata
    chunk_count = Column(Integer, nullable=False)
    indexed_at = Column(UtcDateTime, nullable=False, default=utcnow())

    # Indexes for fast lookups
    # NOTE(review): rag_index_id already has index=True on the column, so
    # idx_rag_status_index looks redundant - confirm before dropping it,
    # since removing it changes the schema.
    __table_args__ = (
        Index("idx_rag_status_collection", "collection_id"),
        Index("idx_rag_status_index", "rag_index_id"),
    )

    def __repr__(self):
        """Return a debug string with shortened ids and the chunk count.

        Safe to call on an instance whose key columns have not been set yet:
        each id is only sliced when present, so ``repr()`` never raises.
        """
        # Slicing None would raise TypeError; fall back to None for unset ids.
        doc = self.document_id[:8] if self.document_id else None
        coll = self.collection_id[:8] if self.collection_id else None
        return f"<RagDocumentStatus(doc='{doc}...', coll='{coll}...', chunks={self.chunk_count})>"

783 

784 

class CollectionFolder(Base):
    """
    Local folders linked to a collection for indexing.
    """

    __tablename__ = "collection_folders"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # Collection association
    collection_id = Column(
        String(36),
        ForeignKey("collections.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )

    # Folder configuration
    folder_path = Column(Text, nullable=False)  # Absolute path to folder
    # File patterns to include. The default is a callable so that every
    # insert gets its own fresh list; a literal list here would be one shared
    # object handed to every row that relies on the default.
    include_patterns = Column(
        JSON, default=lambda: ["*.pdf", "*.txt", "*.md", "*.html"]
    )
    exclude_patterns = Column(
        JSON
    )  # Patterns to exclude (e.g., ["**/node_modules/**"])
    recursive = Column(Boolean, default=True)  # Search subfolders

    # Monitoring
    watch_enabled = Column(
        Boolean, default=False
    )  # Auto-reindex on changes (future)
    last_scanned_at = Column(UtcDateTime, nullable=True)
    file_count = Column(Integer, default=0)  # Total files found
    indexed_file_count = Column(Integer, default=0)  # Files indexed

    # Timestamps
    created_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    updated_at = Column(
        UtcDateTime, default=utcnow(), onupdate=utcnow(), nullable=False
    )

    # Relationships
    collection = relationship("Collection", back_populates="linked_folders")
    files = relationship(
        "CollectionFolderFile",
        back_populates="folder",
        cascade="all, delete-orphan",
    )

    def __repr__(self):
        """Return a short debug string with the folder path and file count."""
        return f"<CollectionFolder(path='{self.folder_path}', files={self.file_count})>"

836 

837 

class CollectionFolderFile(Base):
    """
    A file discovered inside a linked folder.

    Kept lightweight: just enough to deduplicate files and record whether
    each one has been indexed.
    """

    __tablename__ = "collection_folder_files"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # --- Folder association ---
    folder_id = Column(
        Integer,
        ForeignKey("collection_folders.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )

    # --- File identification ---
    # Path relative to folder_path
    relative_path = Column(Text, nullable=False)
    # SHA256 for deduplication
    file_hash = Column(String(64), index=True)
    # Size in bytes
    file_size = Column(Integer)
    # Extension
    file_type = Column(String(50))

    # --- File metadata ---
    # File modification time
    last_modified = Column(UtcDateTime)

    # --- Indexing status ---
    indexed = Column(Boolean, default=False)
    chunk_count = Column(Integer, default=0)
    last_indexed_at = Column(UtcDateTime, nullable=True)
    # Error if indexing failed
    index_error = Column(Text, nullable=True)

    # --- Timestamps ---
    discovered_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    updated_at = Column(
        UtcDateTime, default=utcnow(), onupdate=utcnow(), nullable=False
    )

    # --- Relationships ---
    folder = relationship("CollectionFolder", back_populates="files")

    # Ensure one entry per file in folder
    __table_args__ = (
        UniqueConstraint("folder_id", "relative_path", name="uix_folder_file"),
        Index("idx_folder_indexed", "folder_id", "indexed"),
    )

    def __repr__(self):
        """Return a short debug string with the relative path and indexed flag."""
        text = f"<CollectionFolderFile(path='{self.relative_path}', indexed={self.indexed})>"
        return text