Coverage for src/local_deep_research/database/models/library.py: 99%

307 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-03 23:15 +0000

1""" 

2Library and document models - Unified architecture. 

3All documents (research downloads and user uploads) are stored in one table. 

4Collections organize documents, with "Library" as the default collection. 

5""" 

6 

7import enum 

8 

9from sqlalchemy import ( 

10 JSON, 

11 Boolean, 

12 Column, 

13 Date, 

14 Enum, 

15 ForeignKey, 

16 Index, 

17 Integer, 

18 LargeBinary, 

19 String, 

20 Text, 

21 UniqueConstraint, 

22) 

23from sqlalchemy.orm import backref, relationship 

24from sqlalchemy_utc import UtcDateTime, utcnow 

25 

26from .base import Base 

27 

28 

class RAGIndexStatus(enum.Enum):
    """Lifecycle states recorded for a RAG index."""

    ACTIVE = "active"
    REBUILDING = "rebuilding"
    DEPRECATED = "deprecated"

35 

36 

class DocumentStatus(enum.Enum):
    """Processing/download states shared by documents and queue entries."""

    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"

44 

45 

class EmbeddingProvider(enum.Enum):
    """Kinds of embedding model provider.

    ``OPENAI`` stands for the OpenAI cloud API as well as any
    OpenAI-compatible endpoint (LM Studio, vLLM, llama.cpp server, etc.).
    The underlying provider class reads ``embeddings.openai.base_url``:
    when it is set a local server is targeted, otherwise the OpenAI cloud
    is used.
    """

    SENTENCE_TRANSFORMERS = "sentence_transformers"
    OLLAMA = "ollama"
    OPENAI = "openai"

58 

59 

class ExtractionMethod(str, enum.Enum):
    """How the text of a document was extracted."""

    PDF_EXTRACTION = "pdf_extraction"
    NATIVE_API = "native_api"
    UNKNOWN = "unknown"

66 

67 

class ExtractionSource(str, enum.Enum):
    """Where the extracted text of a document came from."""

    ARXIV_API = "arxiv_api"
    PUBMED_API = "pubmed_api"
    PDFPLUMBER = "pdfplumber"
    PDFPLUMBER_FALLBACK = "pdfplumber_fallback"
    LOCAL_PDF = "local_pdf"
    LEGACY_FILE = "legacy_file"

77 

78 

class ExtractionQuality(str, enum.Enum):
    """Quality grade assigned to extracted text."""

    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"

85 

86 

class DistanceMetric(str, enum.Enum):
    """Distance metrics available for vector similarity search."""

    COSINE = "cosine"
    L2 = "l2"
    DOT_PRODUCT = "dot_product"

93 

94 

class IndexType(str, enum.Enum):
    """FAISS index types usable for RAG."""

    FLAT = "flat"
    HNSW = "hnsw"
    IVF = "ivf"

101 

102 

class SplitterType(str, enum.Enum):
    """Text splitter types used when chunking documents."""

    RECURSIVE = "recursive"
    SEMANTIC = "semantic"
    TOKEN = "token"
    SENTENCE = "sentence"

110 

111 

class PDFStorageMode(str, enum.Enum):
    """Where (if anywhere) PDF files are kept."""

    # PDFs are not stored at all; only text is kept.
    NONE = "none"
    # PDFs are stored unencrypted on the filesystem.
    FILESYSTEM = "filesystem"
    # PDFs are stored encrypted in the database.
    DATABASE = "database"

118 

119 

class SourceType(Base):
    """
    Lookup table of document source types.

    Typical entries are research_download, user_upload and manual_entry.
    Keeping them in a normalized table gives documents a consistent
    categorization.
    """

    __tablename__ = "source_types"

    # UUID stored as a 36-character string.
    id = Column(String(36), primary_key=True)
    name = Column(String(50), nullable=False, unique=True, index=True)
    display_name = Column(String(100), nullable=False)
    description = Column(Text)
    # Name of the icon the UI shows for this source type.
    icon = Column(String(50))

    # Timestamps
    created_at = Column(UtcDateTime, default=utcnow(), nullable=False)

    def __repr__(self):
        return (
            f"<SourceType(name='{self.name}', display='{self.display_name}')>"
        )

141 

142 

class UploadBatch(Base):
    """
    A batch of user-uploaded files.

    Batches group uploads so they can be traced and operated on together.

    TODO: As of 2026-05 this table is dormant — no code path creates
    UploadBatch rows or sets Document.upload_batch_id. Wiring it up needs
    a product decision on what defines a "batch" (per upload submit, per
    UI session, etc.) and surfacing the grouping in the upload routes /
    UI. The schema is left in place because it is harmless and removing
    it would require a migration against every user's per-user encrypted
    DB.
    """

    __tablename__ = "upload_batches"

    # UUID stored as a 36-character string.
    id = Column(String(36), primary_key=True)
    collection_id = Column(
        String(36),
        ForeignKey("collections.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )
    uploaded_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    file_count = Column(Integer, default=0)
    # Total size of the batch in bytes.
    total_size = Column(Integer, default=0)

    # Relationships (each one also adds a backref on the target model).
    collection = relationship("Collection", backref="upload_batches")
    documents = relationship("Document", backref="upload_batch")

    def __repr__(self):
        return f"<UploadBatch(id='{self.id}', files={self.file_count}, size={self.total_size})>"

176 

177 

class Document(Base):
    """
    One table for every document, covering both research downloads and
    user uploads.
    """

    __tablename__ = "documents"

    # UUID stored as a string.
    id = Column(String(36), primary_key=True)

    # Source type (research_download, user_upload, etc.).
    source_type_id = Column(
        String(36),
        ForeignKey("source_types.id"),
        nullable=False,
        index=True,
    )

    # Original research resource (research downloads); nullable for uploads.
    resource_id = Column(
        Integer,
        ForeignKey("research_resources.id", ondelete="SET NULL"),
        nullable=True,
        index=True,
    )

    # Owning research (research downloads); nullable for uploads.
    research_id = Column(
        String(36),
        ForeignKey("research_history.id", ondelete="CASCADE"),
        nullable=True,
        index=True,
    )

    # Upload batch (user uploads); nullable for research downloads.
    upload_batch_id = Column(
        String(36),
        ForeignKey("upload_batches.id", ondelete="SET NULL"),
        nullable=True,
        index=True,
    )

    # --- Document identification ---
    # SHA256 used for deduplication.
    document_hash = Column(
        String(64), nullable=False, unique=True, index=True
    )
    # Source URL (for downloads).
    original_url = Column(Text, nullable=True)
    # Display name (for uploads).
    filename = Column(String(500), nullable=True)
    # Name the file had when it was uploaded.
    original_filename = Column(String(500), nullable=True)

    # --- File information ---
    # Path relative to the library/uploads root.
    file_path = Column(Text, nullable=True)
    # Size in bytes.
    file_size = Column(Integer, nullable=False)
    # pdf, txt, md, html, etc.
    file_type = Column(String(50), nullable=False)
    mime_type = Column(String(100), nullable=True)

    # --- Content storage: text is always stored in the DB ---
    # Extracted or uploaded text content.
    text_content = Column(Text, nullable=True)

    # PDF storage mode; holds a PDFStorageMode value
    # (none, filesystem, database).
    storage_mode = Column(String(20), nullable=True, default="database")

    # --- Metadata ---
    title = Column(Text)
    # Description supplied by the user.
    description = Column(Text)
    # List of authors (for research papers).
    authors = Column(JSON)
    published_date = Column(Date, nullable=True)

    # --- Academic identifiers (for research papers) ---
    doi = Column(String(255), nullable=True, index=True)
    arxiv_id = Column(String(100), nullable=True, index=True)
    pmid = Column(String(50), nullable=True, index=True)
    pmcid = Column(String(50), nullable=True, index=True)
    isbn = Column(String(20), nullable=True)

    # --- Download/upload information ---
    # Persisted using the enum members' values rather than their names.
    status = Column(
        Enum(
            DocumentStatus,
            values_callable=lambda members: [m.value for m in members],
        ),
        nullable=False,
        default=DocumentStatus.COMPLETED,
    )
    attempts = Column(Integer, default=1)
    error_message = Column(Text, nullable=True)
    processed_at = Column(UtcDateTime, nullable=False, default=utcnow())
    last_accessed = Column(UtcDateTime, nullable=True)

    # --- Text extraction metadata (research downloads from PDFs) ---
    # pdf_extraction, native_api, etc.
    extraction_method = Column(String(50), nullable=True)
    # arxiv_api, pdfplumber, etc.
    extraction_source = Column(String(50), nullable=True)
    # high, medium, low
    extraction_quality = Column(String(20), nullable=True)
    has_formatting_issues = Column(Boolean, default=False)
    has_encoding_issues = Column(Boolean, default=False)
    character_count = Column(Integer, nullable=True)
    word_count = Column(Integer, nullable=True)

    # --- Organization ---
    # User-defined tags.
    tags = Column(JSON)
    # User notes.
    notes = Column(Text)
    favorite = Column(Boolean, default=False)

    # --- Timestamps ---
    created_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    updated_at = Column(
        UtcDateTime, default=utcnow(), onupdate=utcnow(), nullable=False
    )

    # --- Relationships ---
    source_type = relationship("SourceType", backref="documents")
    resource = relationship(
        "ResearchResource",
        foreign_keys="[Document.resource_id]",
        backref="documents",
    )
    research = relationship("ResearchHistory", backref="documents")
    # Association rows linking this document to collections.
    collections = relationship(
        "DocumentCollection",
        back_populates="document",
        cascade="all, delete-orphan",
    )

    # Composite indexes for common filters.
    # NOTE(review): idx_document_hash covers the same single column that
    # already sets unique=True/index=True above — looks redundant; confirm
    # before dropping, since removal needs a migration.
    __table_args__ = (
        Index("idx_source_type", "source_type_id", "status"),
        Index("idx_research_documents", "research_id", "status"),
        Index("idx_document_type", "file_type", "status"),
        Index("idx_document_hash", "document_hash"),
    )

    def __repr__(self):
        # Prefer the title, fall back to the filename, then to a placeholder;
        # whichever text is used is cut to 50 characters.
        label = self.title or self.filename
        title_str = label[:50] if label else "Untitled"
        return f"<Document(title='{title_str}', type={self.file_type}, size={self.file_size})>"

326 

327 

class DocumentBlob(Base):
    """
    Separate table for storing PDF binary content.

    SQLite best practice: keep BLOBs in their own table for better query
    performance. Stored in the encrypted SQLCipher database for security.
    """

    __tablename__ = "document_blobs"

    # Primary key references Document.id.
    document_id = Column(
        String(36),
        ForeignKey("documents.id", ondelete="CASCADE"),
        primary_key=True,
        nullable=False,
    )

    # Binary PDF content.
    pdf_binary = Column(LargeBinary, nullable=False)

    # SHA256 hash for integrity verification.
    blob_hash = Column(String(64), nullable=True, index=True)

    # Timestamps
    stored_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    last_accessed = Column(UtcDateTime, nullable=True)

    # Relationship; passive_deletes leaves row removal to the FK's
    # ON DELETE CASCADE.
    document = relationship(
        "Document",
        backref=backref("blob", passive_deletes=True),
        passive_deletes=True,
    )

    def __repr__(self):
        """Return a short debug string; safe on instances with no key yet."""
        size = len(self.pdf_binary) if self.pdf_binary else 0
        # str() keeps repr() from raising TypeError while document_id is
        # still None (e.g. an instance that has not been given its key).
        short_id = str(self.document_id)[:8]
        return f"<DocumentBlob(document_id='{short_id}...', size={size})>"

365 

366 

class Collection(Base):
    """
    A named grouping of documents.

    'Library' is the default collection for research downloads; users can
    create additional custom collections for organization.
    """

    __tablename__ = "collections"

    # UUID stored as a string.
    id = Column(String(36), primary_key=True)
    name = Column(String(255), nullable=False)
    description = Column(Text)

    # Collection type (default_library, user_collection, linked_folder).
    collection_type = Column(String(50), default="user_collection")

    # Whether this is the default library collection.
    is_default = Column(Boolean, default=False)

    # Embedding model used for this collection (stored when first indexed),
    # e.g. 'all-MiniLM-L6-v2', 'nomic-embed-text:latest'.
    embedding_model = Column(String(100), nullable=True)
    # Persisted using the enum members' values rather than their names.
    embedding_model_type = Column(
        Enum(
            EmbeddingProvider,
            values_callable=lambda members: [m.value for m in members],
        ),
        nullable=True,
    )
    # Vector dimension.
    embedding_dimension = Column(Integer, nullable=True)
    # Chunk size used.
    chunk_size = Column(Integer, nullable=True)
    # Chunk overlap used.
    chunk_overlap = Column(Integer, nullable=True)

    # Advanced embedding configuration options (Issue #1054)
    # Splitter type: 'recursive', 'semantic', 'token', 'sentence'.
    splitter_type = Column(String(50), nullable=True)
    # Text separators for chunking, e.g. ["\n\n", "\n", ". ", " ", ""].
    text_separators = Column(JSON, nullable=True)
    # Distance metric: 'cosine', 'l2', 'dot_product'.
    distance_metric = Column(String(50), nullable=True)
    # Whether to normalize embeddings with L2.
    normalize_vectors = Column(Boolean, nullable=True)
    # FAISS index type: 'flat', 'hnsw', 'ivf'.
    index_type = Column(String(50), nullable=True)

    # Timestamps
    created_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    updated_at = Column(
        UtcDateTime, default=utcnow(), onupdate=utcnow(), nullable=False
    )

    # Relationships
    document_links = relationship(
        "DocumentCollection",
        back_populates="collection",
        cascade="all, delete-orphan",
    )
    linked_folders = relationship(
        "CollectionFolder",
        back_populates="collection",
        cascade="all, delete-orphan",
    )

    def __repr__(self):
        return f"<Collection(id='{self.id}', name='{self.name}', type='{self.collection_type}')>"

438 

439 

class DocumentCollection(Base):
    """
    Association between documents and collections (many-to-many).

    Indexing status is tracked per collection, because a document can
    belong to several collections.
    """

    __tablename__ = "document_collections"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # Foreign keys
    document_id = Column(
        String(36),
        ForeignKey("documents.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )
    collection_id = Column(
        String(36),
        ForeignKey("collections.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )

    # Indexing status (per collection!)
    # Whether the document is indexed for this collection.
    indexed = Column(Boolean, default=False)
    # Number of chunks in this collection.
    chunk_count = Column(Integer, default=0)
    last_indexed_at = Column(UtcDateTime, nullable=True)

    # Timestamps
    added_at = Column(UtcDateTime, default=utcnow(), nullable=False)

    # Relationships
    document = relationship("Document", back_populates="collections")
    collection = relationship("Collection", back_populates="document_links")

    # One row per (document, collection) pair, plus an index on
    # (collection_id, indexed).
    __table_args__ = (
        UniqueConstraint(
            "document_id", "collection_id", name="uix_document_collection"
        ),
        Index("idx_collection_indexed", "collection_id", "indexed"),
    )

    def __repr__(self):
        return f"<DocumentCollection(doc_id={self.document_id}, coll_id={self.collection_id}, indexed={self.indexed})>"

490 

491 

class DocumentChunk(Base):
    """
    Universal chunk storage for RAG across all sources.

    Text chunks are kept in the encrypted database for semantic search.
    """

    __tablename__ = "document_chunks"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # --- Chunk identification ---
    # SHA256 used for deduplication.
    chunk_hash = Column(String(64), nullable=False, index=True)

    # --- Source tracking (points to the unified Document table) ---
    # 'document', 'folder_file'
    source_type = Column(String(20), nullable=False, index=True)
    # Document.id (UUID as string).
    source_id = Column(String(36), nullable=True, index=True)
    # File path if the source is a local collection.
    source_path = Column(Text, nullable=True)
    # collection_<uuid>
    collection_name = Column(String(100), nullable=False, index=True)

    # --- Chunk content (encrypted in SQLCipher DB) ---
    # The actual chunk text.
    chunk_text = Column(Text, nullable=False)
    # Position in the source document.
    chunk_index = Column(Integer, nullable=False)
    # Start character position.
    start_char = Column(Integer, nullable=False)
    # End character position.
    end_char = Column(Integer, nullable=False)
    # Number of words in the chunk.
    word_count = Column(Integer, nullable=False)

    # --- Embedding metadata ---
    # UUID for FAISS vector mapping.
    embedding_id = Column(String(36), nullable=False, unique=True, index=True)
    # e.g. 'all-MiniLM-L6-v2'
    embedding_model = Column(String(100), nullable=False)
    # Persisted using the enum members' values rather than their names.
    embedding_model_type = Column(
        Enum(
            EmbeddingProvider,
            values_callable=lambda members: [m.value for m in members],
        ),
        nullable=False,
    )
    # Vector dimension.
    embedding_dimension = Column(Integer, nullable=True)

    # --- Document metadata (for context) ---
    # Title of the source document.
    document_title = Column(Text, nullable=True)
    # Additional metadata from the source.
    document_metadata = Column(JSON, nullable=True)

    # --- Timestamps ---
    created_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    last_accessed = Column(UtcDateTime, nullable=True)

    # A chunk hash may appear once per collection; the indexes support
    # lookups by source, by collection and by embedding id.
    # NOTE(review): idx_chunk_embedding covers the same single column that
    # already sets unique=True/index=True above — looks redundant; confirm
    # before dropping, since removal needs a migration.
    __table_args__ = (
        UniqueConstraint(
            "chunk_hash", "collection_name", name="uix_chunk_collection"
        ),
        Index("idx_chunk_source", "source_type", "source_id"),
        Index("idx_chunk_collection", "collection_name", "created_at"),
        Index("idx_chunk_embedding", "embedding_id"),
    )

    def __repr__(self):
        return f"<DocumentChunk(collection='{self.collection_name}', source_type='{self.source_type}', index={self.chunk_index}, words={self.word_count})>"

566 

567 

class DownloadQueue(Base):
    """
    Pending document downloads.

    Renamed from LibraryDownloadQueue for consistency.
    """

    __tablename__ = "download_queue"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # --- What to download ---
    # unique=True: one queue entry per resource.
    resource_id = Column(
        Integer,
        ForeignKey("research_resources.id", ondelete="CASCADE"),
        nullable=False,
        unique=True,
    )
    research_id = Column(String(36), nullable=False, index=True)

    # Target collection (defaults to Library collection).
    collection_id = Column(
        String(36),
        ForeignKey("collections.id", ondelete="SET NULL"),
        nullable=True,
        index=True,
    )

    # --- Queue management ---
    # Higher = more important.
    priority = Column(Integer, default=0)
    # Persisted using the enum members' values rather than their names.
    status = Column(
        Enum(
            DocumentStatus,
            values_callable=lambda members: [m.value for m in members],
        ),
        nullable=False,
        default=DocumentStatus.PENDING,
    )
    attempts = Column(Integer, default=0)
    max_attempts = Column(Integer, default=3)

    # --- Error tracking ---
    last_error = Column(Text, nullable=True)
    last_attempt_at = Column(UtcDateTime, nullable=True)

    # --- Timestamps ---
    queued_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    completed_at = Column(UtcDateTime, nullable=True)

    # --- Relationships ---
    resource = relationship("ResearchResource", backref="download_queue")
    collection = relationship("Collection", backref="download_queue_items")

    def __repr__(self):
        return f"<DownloadQueue(resource_id={self.resource_id}, status={self.status}, attempts={self.attempts})>"

621 

622 

class LibraryStatistics(Base):
    """
    Aggregated library statistics.

    Updated periodically for display on the dashboard.
    """

    __tablename__ = "library_statistics"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # --- Document counts ---
    total_documents = Column(Integer, default=0)
    total_pdfs = Column(Integer, default=0)
    total_html = Column(Integer, default=0)
    total_other = Column(Integer, default=0)

    # --- Storage metrics ---
    total_size_bytes = Column(Integer, default=0)
    average_document_size = Column(Integer, default=0)

    # --- Research metrics ---
    total_researches_with_downloads = Column(Integer, default=0)
    average_documents_per_research = Column(Integer, default=0)

    # --- Download metrics ---
    total_download_attempts = Column(Integer, default=0)
    successful_downloads = Column(Integer, default=0)
    failed_downloads = Column(Integer, default=0)
    pending_downloads = Column(Integer, default=0)

    # --- Academic sources breakdown ---
    arxiv_count = Column(Integer, default=0)
    pubmed_count = Column(Integer, default=0)
    doi_count = Column(Integer, default=0)
    other_count = Column(Integer, default=0)

    # --- Timestamps ---
    calculated_at = Column(UtcDateTime, default=utcnow(), nullable=False)

    def __repr__(self):
        return f"<LibraryStatistics(documents={self.total_documents}, size={self.total_size_bytes})>"

664 

665 

class RAGIndex(Base):
    """
    Bookkeeping for the FAISS indices of RAG collections.

    Each collection + embedding model combination has its own FAISS index.
    """

    __tablename__ = "rag_indices"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # --- Collection and model identification ---
    # 'collection_<uuid>'
    collection_name = Column(String(100), nullable=False, index=True)
    # e.g. 'all-MiniLM-L6-v2'
    embedding_model = Column(String(100), nullable=False)
    # Persisted using the enum members' values rather than their names.
    embedding_model_type = Column(
        Enum(
            EmbeddingProvider,
            values_callable=lambda members: [m.value for m in members],
        ),
        nullable=False,
    )
    # Vector dimension.
    embedding_dimension = Column(Integer, nullable=False)

    # --- Index file location ---
    # Path to the .faiss file.
    index_path = Column(Text, nullable=False)
    # SHA256 of collection+model for uniqueness.
    index_hash = Column(String(64), nullable=False, unique=True, index=True)

    # --- Chunking parameters used ---
    chunk_size = Column(Integer, nullable=False)
    chunk_overlap = Column(Integer, nullable=False)

    # --- Advanced embedding configuration options (Issue #1054) ---
    # Splitter type: 'recursive', 'semantic', 'token', 'sentence'.
    splitter_type = Column(String(50), nullable=True)
    # Text separators for chunking, e.g. ["\n\n", "\n", ". ", " ", ""].
    text_separators = Column(JSON, nullable=True)
    # Distance metric: 'cosine', 'l2', 'dot_product'.
    distance_metric = Column(String(50), nullable=True)
    # Whether to normalize embeddings with L2.
    normalize_vectors = Column(Boolean, nullable=True)
    # FAISS index type: 'flat', 'hnsw', 'ivf'.
    index_type = Column(String(50), nullable=True)

    # --- Index statistics ---
    # Number of chunks in this index.
    chunk_count = Column(Integer, default=0)
    # Number of source documents.
    total_documents = Column(Integer, default=0)

    # --- Status ---
    # Persisted using the enum members' values rather than their names.
    status = Column(
        Enum(
            RAGIndexStatus,
            values_callable=lambda members: [m.value for m in members],
        ),
        nullable=False,
        default=RAGIndexStatus.ACTIVE,
    )
    # Whether this is the current index for this collection.
    is_current = Column(Boolean, default=True)

    # --- Timestamps ---
    created_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    last_updated_at = Column(
        UtcDateTime, default=utcnow(), onupdate=utcnow(), nullable=False
    )
    # Last time the index was searched.
    last_used_at = Column(UtcDateTime, nullable=True)

    # At most one row per collection + model + provider combination.
    __table_args__ = (
        UniqueConstraint(
            "collection_name",
            "embedding_model",
            "embedding_model_type",
            name="uix_collection_model",
        ),
        Index("idx_collection_current", "collection_name", "is_current"),
    )

    def __repr__(self):
        return f"<RAGIndex(collection='{self.collection_name}', model='{self.embedding_model}', chunks={self.chunk_count})>"

757 

758 

class RagDocumentStatus(Base):
    """
    Tracks which documents have been indexed for RAG.

    Row existence = document is indexed. No row = not indexed.
    Simple and avoids ORM caching issues.
    """

    __tablename__ = "rag_document_status"

    # Composite primary key: (document_id, collection_id).
    document_id = Column(
        String(36),
        ForeignKey("documents.id", ondelete="CASCADE"),
        primary_key=True,
        nullable=False,
    )
    collection_id = Column(
        String(36),
        ForeignKey("collections.id", ondelete="CASCADE"),
        primary_key=True,
        nullable=False,
    )

    # Which RAG index was used (tracks embedding model indirectly).
    rag_index_id = Column(
        Integer,
        ForeignKey("rag_indices.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )

    # Metadata
    chunk_count = Column(Integer, nullable=False)
    indexed_at = Column(UtcDateTime, nullable=False, default=utcnow())

    # Indexes for fast lookups.
    # NOTE(review): rag_index_id already sets index=True above, so
    # idx_rag_status_index looks redundant — confirm before dropping,
    # since removal needs a migration.
    __table_args__ = (
        Index("idx_rag_status_collection", "collection_id"),
        Index("idx_rag_status_index", "rag_index_id"),
    )

    def __repr__(self):
        """Return a short debug string; safe on instances with no keys yet."""
        # str() keeps repr() from raising TypeError while either key is
        # still None (e.g. an instance that has not been given its keys).
        doc = str(self.document_id)[:8]
        coll = str(self.collection_id)[:8]
        return f"<RagDocumentStatus(doc='{doc}...', coll='{coll}...', chunks={self.chunk_count})>"

802 

803 

class CollectionFolder(Base):
    """
    Local folders linked to a collection for indexing.
    """

    __tablename__ = "collection_folders"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # Collection association
    collection_id = Column(
        String(36),
        ForeignKey("collections.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )

    # --- Folder configuration ---
    # Absolute path to the folder.
    folder_path = Column(Text, nullable=False)
    # File patterns to include. The default is a callable so that every
    # row gets its own fresh list; a bare list literal would be a single
    # object shared by all inserts, and an in-place mutation on one
    # instance would then alter the default for later rows.
    include_patterns = Column(
        JSON, default=lambda: ["*.pdf", "*.txt", "*.md", "*.html"]
    )
    # Patterns to exclude (e.g., ["**/node_modules/**"]).
    exclude_patterns = Column(JSON)
    # Search subfolders.
    recursive = Column(Boolean, default=True)

    # --- Monitoring ---
    # Auto-reindex on changes (future).
    watch_enabled = Column(Boolean, default=False)
    last_scanned_at = Column(UtcDateTime, nullable=True)
    # Total files found.
    file_count = Column(Integer, default=0)
    # Files indexed.
    indexed_file_count = Column(Integer, default=0)

    # --- Timestamps ---
    created_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    updated_at = Column(
        UtcDateTime, default=utcnow(), onupdate=utcnow(), nullable=False
    )

    # --- Relationships ---
    collection = relationship("Collection", back_populates="linked_folders")
    files = relationship(
        "CollectionFolderFile",
        back_populates="folder",
        cascade="all, delete-orphan",
    )

    def __repr__(self):
        return f"<CollectionFolder(path='{self.folder_path}', files={self.file_count})>"

855 

856 

class CollectionFolderFile(Base):
    """
    A file discovered inside a linked folder.

    Lightweight record used for deduplication and indexing status.
    """

    __tablename__ = "collection_folder_files"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # Folder association
    folder_id = Column(
        Integer,
        ForeignKey("collection_folders.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )

    # --- File identification ---
    # Path relative to folder_path.
    relative_path = Column(Text, nullable=False)
    # SHA256 used for deduplication.
    file_hash = Column(String(64), index=True)
    # Size in bytes.
    file_size = Column(Integer)
    # Extension.
    file_type = Column(String(50))

    # --- File metadata ---
    # File modification time.
    last_modified = Column(UtcDateTime)

    # --- Indexing status ---
    indexed = Column(Boolean, default=False)
    chunk_count = Column(Integer, default=0)
    last_indexed_at = Column(UtcDateTime, nullable=True)
    # Error message if indexing failed.
    index_error = Column(Text, nullable=True)

    # --- Timestamps ---
    discovered_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    updated_at = Column(
        UtcDateTime, default=utcnow(), onupdate=utcnow(), nullable=False
    )

    # --- Relationships ---
    folder = relationship("CollectionFolder", back_populates="files")

    # One row per (folder, relative path), plus an index on
    # (folder_id, indexed).
    __table_args__ = (
        UniqueConstraint("folder_id", "relative_path", name="uix_folder_file"),
        Index("idx_folder_indexed", "folder_id", "indexed"),
    )

    def __repr__(self):
        return f"<CollectionFolderFile(path='{self.relative_path}', indexed={self.indexed})>"