Coverage for src/local_deep_research/database/models/library.py: 99%

1"""

2Library and document models - Unified architecture.

3All documents (research downloads and user uploads) are stored in one table.

4Collections organize documents, with "Library" as the default collection.

5"""

7import enum

9from sqlalchemy import (

10 JSON,

11 Boolean,

12 Column,

13 Date,

14 Enum,

15 ForeignKey,

16 Index,

17 Integer,

18 LargeBinary,

19 String,

20 Text,

21 UniqueConstraint,

22)

23from sqlalchemy.orm import backref, relationship

24from sqlalchemy_utc import UtcDateTime, utcnow

26from .base import Base

class RAGIndexStatus(enum.Enum):
    """Possible status values of a RAG index."""

    ACTIVE = "active"
    REBUILDING = "rebuilding"
    DEPRECATED = "deprecated"

class DocumentStatus(enum.Enum):
    """Possible status values for document processing and downloads."""

    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"

class EmbeddingProvider(enum.Enum):
    """Types of embedding model provider."""

    SENTENCE_TRANSFORMERS = "sentence_transformers"
    OLLAMA = "ollama"

class ExtractionMethod(str, enum.Enum):
    """How text was extracted from a document (members are also ``str``)."""

    PDF_EXTRACTION = "pdf_extraction"
    NATIVE_API = "native_api"
    UNKNOWN = "unknown"

class ExtractionSource(str, enum.Enum):
    """Where extracted text came from (members are also ``str``)."""

    ARXIV_API = "arxiv_api"
    PUBMED_API = "pubmed_api"
    PDFPLUMBER = "pdfplumber"
    PDFPLUMBER_FALLBACK = "pdfplumber_fallback"
    LOCAL_PDF = "local_pdf"
    LEGACY_FILE = "legacy_file"

class ExtractionQuality(str, enum.Enum):
    """Quality level assigned to extracted text (members are also ``str``)."""

    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"

class DistanceMetric(str, enum.Enum):
    """Distance metric used for vector similarity search."""

    COSINE = "cosine"
    L2 = "l2"
    DOT_PRODUCT = "dot_product"

class IndexType(str, enum.Enum):
    """FAISS index type used for RAG."""

    FLAT = "flat"
    HNSW = "hnsw"
    IVF = "ivf"

class SplitterType(str, enum.Enum):
    """Text splitter type used when chunking."""

    RECURSIVE = "recursive"
    SEMANTIC = "semantic"
    TOKEN = "token"
    SENTENCE = "sentence"

103

104

class PDFStorageMode(str, enum.Enum):
    """Where (and whether) PDF files are stored."""

    # Don't store PDFs, text-only
    NONE = "none"
    # Store PDFs unencrypted on filesystem
    FILESYSTEM = "filesystem"
    # Store PDFs encrypted in database
    DATABASE = "database"

111

112

class SourceType(Base):
    """
    Normalized lookup table of document source types.

    Example names: research_download, user_upload, manual_entry.
    """

    __tablename__ = "source_types"

    # UUID
    id = Column(String(36), primary_key=True)
    name = Column(String(50), nullable=False, unique=True, index=True)
    display_name = Column(String(100), nullable=False)
    description = Column(Text)
    # Icon name for UI
    icon = Column(String(50))

    # Timestamps
    created_at = Column(UtcDateTime, default=utcnow(), nullable=False)

    def __repr__(self):
        """Return a short debug string with the name and display name."""
        name, display = self.name, self.display_name
        return f"<SourceType(name='{name}', display='{display}')>"

134

135

class UploadBatch(Base):
    """
    One batch of user-uploaded files.

    Groups uploads so they can be traced and operated on together.
    """

    __tablename__ = "upload_batches"

    # UUID
    id = Column(String(36), primary_key=True)
    collection_id = Column(
        String(36),
        ForeignKey("collections.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )
    uploaded_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    file_count = Column(Integer, default=0)
    # Total bytes
    total_size = Column(Integer, default=0)

    # Relationships
    collection = relationship("Collection", backref="upload_batches")
    documents = relationship("Document", backref="upload_batch")

    def __repr__(self):
        """Return a short debug string with id, file count and total size."""
        batch_id, files, size = self.id, self.file_count, self.total_size
        return f"<UploadBatch(id='{batch_id}', files={files}, size={size})>"

161

162

class Document(Base):
    """
    Unified table for every document: research downloads and user uploads.
    """

    __tablename__ = "documents"

    # UUID as string
    id = Column(String(36), primary_key=True)

    # Source type (research_download, user_upload, etc.)
    source_type_id = Column(
        String(36), ForeignKey("source_types.id"), nullable=False, index=True
    )

    # Original research resource (research downloads); nullable for uploads
    resource_id = Column(
        Integer,
        ForeignKey("research_resources.id", ondelete="CASCADE"),
        nullable=True,
        index=True,
    )

    # Owning research (research downloads); nullable for uploads
    research_id = Column(
        String(36),
        ForeignKey("research_history.id", ondelete="CASCADE"),
        nullable=True,
        index=True,
    )

    # Upload batch (user uploads); nullable for research downloads
    upload_batch_id = Column(
        String(36),
        ForeignKey("upload_batches.id", ondelete="SET NULL"),
        nullable=True,
        index=True,
    )

    # --- Document identification ---
    # SHA256 for deduplication
    document_hash = Column(String(64), nullable=False, unique=True, index=True)
    # Source URL (for downloads)
    original_url = Column(Text, nullable=True)
    # Display name (for uploads)
    filename = Column(String(500), nullable=True)
    # Original upload name
    original_filename = Column(String(500), nullable=True)

    # --- File information ---
    # Path relative to library/uploads root
    file_path = Column(Text, nullable=True)
    # Size in bytes
    file_size = Column(Integer, nullable=False)
    # pdf, txt, md, html, etc.
    file_type = Column(String(50), nullable=False)
    # MIME type
    mime_type = Column(String(100), nullable=True)

    # --- Content storage: text always stored in DB ---
    # Extracted/uploaded text content
    text_content = Column(Text, nullable=True)

    # PDF storage mode (none, filesystem, database); a PDFStorageMode value
    storage_mode = Column(String(20), nullable=True, default="database")

    # --- Metadata ---
    # Document title
    title = Column(Text)
    # User description
    description = Column(Text)
    # List of authors (for research papers)
    authors = Column(JSON)
    # Publication date
    published_date = Column(Date, nullable=True)

    # --- Academic identifiers (for research papers) ---
    doi = Column(String(255), nullable=True, index=True)
    arxiv_id = Column(String(100), nullable=True, index=True)
    pmid = Column(String(50), nullable=True, index=True)
    pmcid = Column(String(50), nullable=True, index=True)
    isbn = Column(String(20), nullable=True)

    # --- Download/Upload information ---
    status = Column(
        Enum(
            DocumentStatus,
            # Persist the enum values ("pending", ...) rather than member names
            values_callable=lambda enum_cls: [m.value for m in enum_cls],
        ),
        nullable=False,
        default=DocumentStatus.COMPLETED,
    )
    attempts = Column(Integer, default=1)
    error_message = Column(Text, nullable=True)
    processed_at = Column(UtcDateTime, nullable=False, default=utcnow())
    last_accessed = Column(UtcDateTime, nullable=True)

    # --- Text extraction metadata (for research downloads from PDFs) ---
    # pdf_extraction, native_api, etc.
    extraction_method = Column(String(50), nullable=True)
    # arxiv_api, pdfplumber, etc.
    extraction_source = Column(String(50), nullable=True)
    # high, medium, low
    extraction_quality = Column(String(20), nullable=True)
    has_formatting_issues = Column(Boolean, default=False)
    has_encoding_issues = Column(Boolean, default=False)
    character_count = Column(Integer, nullable=True)
    word_count = Column(Integer, nullable=True)

    # --- Organization ---
    # User-defined tags
    tags = Column(JSON)
    # User notes
    notes = Column(Text)
    favorite = Column(Boolean, default=False)

    # --- Timestamps ---
    created_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    updated_at = Column(
        UtcDateTime, default=utcnow(), onupdate=utcnow(), nullable=False
    )

    # --- Relationships ---
    source_type = relationship("SourceType", backref="documents")
    resource = relationship("ResearchResource", backref="documents")
    research = relationship("ResearchHistory", backref="documents")
    collections = relationship(
        "DocumentCollection",
        back_populates="document",
        cascade="all, delete-orphan",
    )

    # Indexes for efficient queries
    # NOTE(review): idx_document_hash looks redundant with the unique/indexed
    # document_hash column above - confirm before touching the schema.
    __table_args__ = (
        Index("idx_source_type", "source_type_id", "status"),
        Index("idx_research_documents", "research_id", "status"),
        Index("idx_document_type", "file_type", "status"),
        Index("idx_document_hash", "document_hash"),
    )

    def __repr__(self):
        """Return a debug string using the title, else filename, else 'Untitled'."""
        if self.title:
            label = self.title[:50]
        elif self.filename:
            label = self.filename[:50]
        else:
            label = "Untitled"
        return f"<Document(title='{label}', type={self.file_type}, size={self.file_size})>"

307

308

class DocumentBlob(Base):
    """
    Separate table for storing PDF binary content.

    SQLite best practice: keep BLOBs in their own table for better query
    performance. Stored in the encrypted SQLCipher database for security.
    """

    __tablename__ = "document_blobs"

    # Primary key references Document.id
    document_id = Column(
        String(36),
        ForeignKey("documents.id", ondelete="CASCADE"),
        primary_key=True,
        nullable=False,
    )

    # Binary PDF content
    pdf_binary = Column(LargeBinary, nullable=False)

    # Hash for integrity verification (SHA256)
    blob_hash = Column(String(64), nullable=True, index=True)

    # Timestamps
    stored_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    last_accessed = Column(UtcDateTime, nullable=True)

    # Relationship; passive_deletes leaves row removal to the FK's ON DELETE CASCADE
    document = relationship(
        "Document",
        backref=backref("blob", passive_deletes=True),
        passive_deletes=True,
    )

    def __repr__(self):
        """
        Return a debug string with the first 8 characters of the document id
        and the size of the stored binary.

        ``str()`` is applied before slicing so that an instance whose
        ``document_id`` has not been assigned yet (``None``) can still be
        printed instead of raising ``TypeError``.
        """
        size = len(self.pdf_binary) if self.pdf_binary else 0
        id_prefix = str(self.document_id)[:8]
        return f"<DocumentBlob(document_id='{id_prefix}...', size={size})>"

346

347

class Collection(Base):
    """
    A named grouping of documents.

    'Library' is the default collection for research downloads; users can
    create additional custom collections.
    """

    __tablename__ = "collections"

    # UUID as string
    id = Column(String(36), primary_key=True)
    name = Column(String(255), nullable=False)
    description = Column(Text)

    # Collection type (default_library, user_collection, linked_folder)
    collection_type = Column(String(50), default="user_collection")

    # Is this the default library collection?
    is_default = Column(Boolean, default=False)

    # Embedding model used for this collection (stored when first indexed),
    # e.g., 'all-MiniLM-L6-v2', 'nomic-embed-text:latest'
    embedding_model = Column(String(100), nullable=True)
    embedding_model_type = Column(
        Enum(
            EmbeddingProvider,
            # Persist the enum values rather than member names
            values_callable=lambda enum_cls: [m.value for m in enum_cls],
        ),
        nullable=True,
    )
    # Vector dimension
    embedding_dimension = Column(Integer, nullable=True)
    # Chunk size used
    chunk_size = Column(Integer, nullable=True)
    # Chunk overlap used
    chunk_overlap = Column(Integer, nullable=True)

    # Advanced embedding configuration options (Issue #1054)
    # Splitter type: 'recursive', 'semantic', 'token', 'sentence'
    splitter_type = Column(String(50), nullable=True)
    # Text separators for chunking, e.g., ["\n\n", "\n", ". ", " ", ""]
    text_separators = Column(JSON, nullable=True)
    # Distance metric: 'cosine', 'l2', 'dot_product'
    distance_metric = Column(String(50), nullable=True)
    # Whether to normalize embeddings with L2
    normalize_vectors = Column(Boolean, nullable=True)
    # FAISS index type: 'flat', 'hnsw', 'ivf'
    index_type = Column(String(50), nullable=True)

    # Timestamps
    created_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    updated_at = Column(
        UtcDateTime, default=utcnow(), onupdate=utcnow(), nullable=False
    )

    # Relationships
    document_links = relationship(
        "DocumentCollection",
        back_populates="collection",
        cascade="all, delete-orphan",
    )
    linked_folders = relationship(
        "CollectionFolder",
        back_populates="collection",
        cascade="all, delete-orphan",
    )

    def __repr__(self):
        """Return a short debug string with id, name and collection type."""
        coll_id, name, kind = self.id, self.name, self.collection_type
        return f"<Collection(id='{coll_id}', name='{name}', type='{kind}')>"

419

420

class DocumentCollection(Base):
    """
    Association row linking one document to one collection (many-to-many).

    Indexing status is tracked per collection, since a document can belong to
    several collections.
    """

    __tablename__ = "document_collections"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # Foreign keys
    document_id = Column(
        String(36),
        ForeignKey("documents.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )
    collection_id = Column(
        String(36),
        ForeignKey("collections.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )

    # Indexing status (per collection!)
    # Whether indexed for this collection
    indexed = Column(Boolean, default=False)
    # Number of chunks in this collection
    chunk_count = Column(Integer, default=0)
    last_indexed_at = Column(UtcDateTime, nullable=True)

    # Timestamps
    added_at = Column(UtcDateTime, default=utcnow(), nullable=False)

    # Relationships
    document = relationship("Document", back_populates="collections")
    collection = relationship("Collection", back_populates="document_links")

    # Ensure one entry per document-collection pair
    __table_args__ = (
        UniqueConstraint(
            "document_id", "collection_id", name="uix_document_collection"
        ),
        Index("idx_collection_indexed", "collection_id", "indexed"),
    )

    def __repr__(self):
        """Return a debug string with both ids and the indexed flag."""
        doc_id, coll_id, flag = self.document_id, self.collection_id, self.indexed
        return f"<DocumentCollection(doc_id={doc_id}, coll_id={coll_id}, indexed={flag})>"

471

472

class DocumentChunk(Base):
    """
    Chunk storage shared by all RAG sources.

    Text chunks are kept in the encrypted database for semantic search.
    """

    __tablename__ = "document_chunks"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # --- Chunk identification ---
    # SHA256 for deduplication
    chunk_hash = Column(String(64), nullable=False, index=True)

    # --- Source tracking: points to the unified Document table ---
    # 'document', 'folder_file'
    source_type = Column(String(20), nullable=False, index=True)
    # Document.id (UUID as string)
    source_id = Column(String(36), nullable=True, index=True)
    # File path if local collection source
    source_path = Column(Text, nullable=True)
    # collection_<uuid>
    collection_name = Column(String(100), nullable=False, index=True)

    # --- Chunk content (encrypted in SQLCipher DB) ---
    # The actual chunk text
    chunk_text = Column(Text, nullable=False)
    # Position in source document
    chunk_index = Column(Integer, nullable=False)
    # Start character position
    start_char = Column(Integer, nullable=False)
    # End character position
    end_char = Column(Integer, nullable=False)
    # Number of words in chunk
    word_count = Column(Integer, nullable=False)

    # --- Embedding metadata ---
    # UUID for FAISS vector mapping
    embedding_id = Column(String(36), nullable=False, unique=True, index=True)
    # e.g., 'all-MiniLM-L6-v2'
    embedding_model = Column(String(100), nullable=False)
    embedding_model_type = Column(
        Enum(
            EmbeddingProvider,
            # Persist the enum values rather than member names
            values_callable=lambda enum_cls: [m.value for m in enum_cls],
        ),
        nullable=False,
    )
    # Vector dimension
    embedding_dimension = Column(Integer, nullable=True)

    # --- Document metadata (for context) ---
    # Title of source document
    document_title = Column(Text, nullable=True)
    # Additional metadata from source
    document_metadata = Column(JSON, nullable=True)

    # --- Timestamps ---
    created_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    last_accessed = Column(UtcDateTime, nullable=True)

    # Indexes for efficient queries
    __table_args__ = (
        UniqueConstraint(
            "chunk_hash", "collection_name", name="uix_chunk_collection"
        ),
        Index("idx_chunk_source", "source_type", "source_id"),
        Index("idx_chunk_collection", "collection_name", "created_at"),
        Index("idx_chunk_embedding", "embedding_id"),
    )

    def __repr__(self):
        """Return a debug string with collection, source type, index and word count."""
        coll, src = self.collection_name, self.source_type
        idx, words = self.chunk_index, self.word_count
        return f"<DocumentChunk(collection='{coll}', source_type='{src}', index={idx}, words={words})>"

547

548

class DownloadQueue(Base):
    """
    Queue of document downloads that are still pending.

    Renamed from LibraryDownloadQueue for consistency.
    """

    __tablename__ = "download_queue"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # --- What to download ---
    # unique=True: one queue entry per resource
    resource_id = Column(
        Integer,
        ForeignKey("research_resources.id", ondelete="CASCADE"),
        nullable=False,
        unique=True,
    )
    research_id = Column(String(36), nullable=False, index=True)

    # Target collection (defaults to Library collection)
    collection_id = Column(
        String(36),
        ForeignKey("collections.id", ondelete="SET NULL"),
        nullable=True,
        index=True,
    )

    # --- Queue management ---
    # Higher = more important
    priority = Column(Integer, default=0)
    status = Column(
        Enum(
            DocumentStatus,
            # Persist the enum values rather than member names
            values_callable=lambda enum_cls: [m.value for m in enum_cls],
        ),
        nullable=False,
        default=DocumentStatus.PENDING,
    )
    attempts = Column(Integer, default=0)
    max_attempts = Column(Integer, default=3)

    # --- Error tracking ---
    last_error = Column(Text, nullable=True)
    last_attempt_at = Column(UtcDateTime, nullable=True)

    # --- Timestamps ---
    queued_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    completed_at = Column(UtcDateTime, nullable=True)

    # --- Relationships ---
    resource = relationship("ResearchResource", backref="download_queue")
    collection = relationship("Collection", backref="download_queue_items")

    def __repr__(self):
        """Return a debug string with resource id, status and attempt count."""
        res_id, state, tries = self.resource_id, self.status, self.attempts
        return f"<DownloadQueue(resource_id={res_id}, status={state}, attempts={tries})>"

602

603

class LibraryStatistics(Base):
    """
    Aggregated library statistics.

    Updated periodically for dashboard display.
    """

    __tablename__ = "library_statistics"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # --- Document counts ---
    total_documents = Column(Integer, default=0)
    total_pdfs = Column(Integer, default=0)
    total_html = Column(Integer, default=0)
    total_other = Column(Integer, default=0)

    # --- Storage metrics ---
    total_size_bytes = Column(Integer, default=0)
    average_document_size = Column(Integer, default=0)

    # --- Research metrics ---
    total_researches_with_downloads = Column(Integer, default=0)
    average_documents_per_research = Column(Integer, default=0)

    # --- Download metrics ---
    total_download_attempts = Column(Integer, default=0)
    successful_downloads = Column(Integer, default=0)
    failed_downloads = Column(Integer, default=0)
    pending_downloads = Column(Integer, default=0)

    # --- Academic sources breakdown ---
    arxiv_count = Column(Integer, default=0)
    pubmed_count = Column(Integer, default=0)
    doi_count = Column(Integer, default=0)
    other_count = Column(Integer, default=0)

    # --- Timestamps ---
    calculated_at = Column(UtcDateTime, default=utcnow(), nullable=False)

    def __repr__(self):
        """Return a debug string with the document count and total size."""
        docs, size = self.total_documents, self.total_size_bytes
        return f"<LibraryStatistics(documents={docs}, size={size})>"

645

646

class RAGIndex(Base):
    """
    Bookkeeping row for a FAISS index backing a RAG collection.

    Each collection+embedding_model combination has its own FAISS index.
    """

    __tablename__ = "rag_indices"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # --- Collection and model identification ---
    # 'collection_<uuid>'
    collection_name = Column(String(100), nullable=False, index=True)
    # e.g., 'all-MiniLM-L6-v2'
    embedding_model = Column(String(100), nullable=False)
    embedding_model_type = Column(
        Enum(
            EmbeddingProvider,
            # Persist the enum values rather than member names
            values_callable=lambda enum_cls: [m.value for m in enum_cls],
        ),
        nullable=False,
    )
    # Vector dimension
    embedding_dimension = Column(Integer, nullable=False)

    # --- Index file location ---
    # Path to .faiss file
    index_path = Column(Text, nullable=False)
    # SHA256 of collection+model for uniqueness
    index_hash = Column(String(64), nullable=False, unique=True, index=True)

    # --- Chunking parameters used ---
    chunk_size = Column(Integer, nullable=False)
    chunk_overlap = Column(Integer, nullable=False)

    # --- Advanced embedding configuration options (Issue #1054) ---
    # Splitter type: 'recursive', 'semantic', 'token', 'sentence'
    splitter_type = Column(String(50), nullable=True)
    # Text separators for chunking, e.g., ["\n\n", "\n", ". ", " ", ""]
    text_separators = Column(JSON, nullable=True)
    # Distance metric: 'cosine', 'l2', 'dot_product'
    distance_metric = Column(String(50), nullable=True)
    # Whether to normalize embeddings with L2
    normalize_vectors = Column(Boolean, nullable=True)
    # FAISS index type: 'flat', 'hnsw', 'ivf'
    index_type = Column(String(50), nullable=True)

    # --- Index statistics ---
    # Number of chunks in this index
    chunk_count = Column(Integer, default=0)
    # Number of source documents
    total_documents = Column(Integer, default=0)

    # --- Status ---
    status = Column(
        Enum(
            RAGIndexStatus,
            # Persist the enum values rather than member names
            values_callable=lambda enum_cls: [m.value for m in enum_cls],
        ),
        nullable=False,
        default=RAGIndexStatus.ACTIVE,
    )
    # Whether this is the current index for this collection
    is_current = Column(Boolean, default=True)

    # --- Timestamps ---
    created_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    last_updated_at = Column(
        UtcDateTime, default=utcnow(), onupdate=utcnow(), nullable=False
    )
    # Last time index was searched
    last_used_at = Column(UtcDateTime, nullable=True)

    # Ensure one active index per collection+model
    __table_args__ = (
        UniqueConstraint(
            "collection_name",
            "embedding_model",
            "embedding_model_type",
            name="uix_collection_model",
        ),
        Index("idx_collection_current", "collection_name", "is_current"),
    )

    def __repr__(self):
        """Return a debug string with collection name, model and chunk count."""
        coll, model, chunks = (
            self.collection_name,
            self.embedding_model,
            self.chunk_count,
        )
        return f"<RAGIndex(collection='{coll}', model='{model}', chunks={chunks})>"

738

739

class RagDocumentStatus(Base):
    """
    Tracks which documents have been indexed for RAG.

    Row existence = document is indexed. No row = not indexed.
    Simple and avoids ORM caching issues.
    """

    __tablename__ = "rag_document_status"

    # Composite primary key: (document_id, collection_id)
    document_id = Column(
        String(36),
        ForeignKey("documents.id", ondelete="CASCADE"),
        primary_key=True,
        nullable=False,
    )
    collection_id = Column(
        String(36),
        ForeignKey("collections.id", ondelete="CASCADE"),
        primary_key=True,
        nullable=False,
    )

    # Which RAG index was used (tracks embedding model indirectly)
    rag_index_id = Column(
        Integer,
        ForeignKey("rag_indices.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )

    # Metadata
    chunk_count = Column(Integer, nullable=False)
    indexed_at = Column(UtcDateTime, nullable=False, default=utcnow())

    # Indexes for fast lookups
    __table_args__ = (
        Index("idx_rag_status_collection", "collection_id"),
        Index("idx_rag_status_index", "rag_index_id"),
    )

    def __repr__(self):
        """
        Return a debug string with 8-character prefixes of both ids and the
        chunk count.

        ``str()`` is applied before slicing so that an instance whose ids have
        not been assigned yet (``None``) can still be printed instead of
        raising ``TypeError``.
        """
        doc_prefix = str(self.document_id)[:8]
        coll_prefix = str(self.collection_id)[:8]
        return f"<RagDocumentStatus(doc='{doc_prefix}...', coll='{coll_prefix}...', chunks={self.chunk_count})>"

783

784

class CollectionFolder(Base):
    """
    Local folders linked to a collection for indexing.
    """

    __tablename__ = "collection_folders"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # Collection association
    collection_id = Column(
        String(36),
        ForeignKey("collections.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )

    # --- Folder configuration ---
    # Absolute path to folder
    folder_path = Column(Text, nullable=False)
    # File patterns to include. The default is a callable so every insert
    # gets its own fresh list; a list literal here would be one object shared
    # by all rows that fall back to the default, and mutating it in place
    # would silently change the default for later inserts.
    include_patterns = Column(
        JSON, default=lambda: ["*.pdf", "*.txt", "*.md", "*.html"]
    )
    # Patterns to exclude (e.g., ["**/node_modules/**"])
    exclude_patterns = Column(JSON)
    # Search subfolders
    recursive = Column(Boolean, default=True)

    # --- Monitoring ---
    # Auto-reindex on changes (future)
    watch_enabled = Column(Boolean, default=False)
    last_scanned_at = Column(UtcDateTime, nullable=True)
    # Total files found
    file_count = Column(Integer, default=0)
    # Files indexed
    indexed_file_count = Column(Integer, default=0)

    # --- Timestamps ---
    created_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    updated_at = Column(
        UtcDateTime, default=utcnow(), onupdate=utcnow(), nullable=False
    )

    # --- Relationships ---
    collection = relationship("Collection", back_populates="linked_folders")
    files = relationship(
        "CollectionFolderFile",
        back_populates="folder",
        cascade="all, delete-orphan",
    )

    def __repr__(self):
        """Return a debug string with the folder path and file count."""
        return f"<CollectionFolder(path='{self.folder_path}', files={self.file_count})>"

836

837

class CollectionFolderFile(Base):
    """
    A file discovered inside a linked folder.

    Lightweight tracking for deduplication and indexing status.
    """

    __tablename__ = "collection_folder_files"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # Folder association
    folder_id = Column(
        Integer,
        ForeignKey("collection_folders.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )

    # --- File identification ---
    # Path relative to folder_path
    relative_path = Column(Text, nullable=False)
    # SHA256 for deduplication
    file_hash = Column(String(64), index=True)
    # Size in bytes
    file_size = Column(Integer)
    # Extension
    file_type = Column(String(50))

    # --- File metadata ---
    # File modification time
    last_modified = Column(UtcDateTime)

    # --- Indexing status ---
    indexed = Column(Boolean, default=False)
    chunk_count = Column(Integer, default=0)
    last_indexed_at = Column(UtcDateTime, nullable=True)
    # Error if indexing failed
    index_error = Column(Text, nullable=True)

    # --- Timestamps ---
    discovered_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    updated_at = Column(
        UtcDateTime, default=utcnow(), onupdate=utcnow(), nullable=False
    )

    # --- Relationships ---
    folder = relationship("CollectionFolder", back_populates="files")

    # Ensure one entry per file in folder
    __table_args__ = (
        UniqueConstraint("folder_id", "relative_path", name="uix_folder_file"),
        Index("idx_folder_indexed", "folder_id", "indexed"),
    )

    def __repr__(self):
        """Return a debug string with the relative path and indexed flag."""
        path, flag = self.relative_path, self.indexed
        return f"<CollectionFolderFile(path='{path}', indexed={flag})>"

Coverage for src / local_deep_research / database / models / library.py: 99%

306 statements