Coverage for src/local_deep_research/database/models/download_tracker.py: 100%
48 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
1"""
2Download tracking models for deduplication and efficient checking.
3Separate from library models to keep tracking lightweight.
4"""
6from sqlalchemy import (
7 Boolean,
8 Column,
9 ForeignKey,
10 Index,
11 Integer,
12 String,
13 Text,
14 UniqueConstraint,
15)
16from sqlalchemy_utc import UtcDateTime, utcnow
18from .base import Base
class DownloadTracker(Base):
    """
    Lightweight table to track which URLs have been downloaded.

    Used for quick deduplication checks before attempting downloads.
    Uniqueness of ``url_hash`` is enforced by the table-level constraint
    declared in ``__table_args__``.
    """

    __tablename__ = "download_tracker"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # URL tracking
    url = Column(Text, nullable=False)  # Original URL
    url_hash = Column(
        String(64), nullable=False
    )  # SHA256 of normalized URL — UNIQUE backing comes from __table_args__

    # Resource tracking (can be multiple resources with same URL)
    first_resource_id = Column(
        Integer, ForeignKey("research_resources.id"), nullable=False
    )

    # File tracking
    file_hash = Column(
        String(64), nullable=True, index=True
    )  # SHA256 of downloaded content
    file_path = Column(
        Text, nullable=True
    )  # Relative path from library root (e.g., "2024/12/arxiv_2401_12345.pdf")
    # NOTE: Absolute path removed - will be computed at runtime from library root + relative path
    file_name = Column(
        String(255), nullable=True, index=True
    )  # Just the filename for searching
    file_size = Column(Integer, nullable=True)

    # Status
    is_downloaded = Column(Boolean, default=False, nullable=False, index=True)
    is_accessible = Column(Boolean, default=True)  # False if 404, 403, etc.

    # Timestamps
    first_seen = Column(UtcDateTime, default=utcnow(), nullable=False)
    downloaded_at = Column(UtcDateTime, nullable=True)
    last_checked = Column(UtcDateTime, default=utcnow(), nullable=False)

    # Link to full document record if it exists. documents.id is String(36) UUID;
    # the previous Integer declaration produced an FK type mismatch that SQLite
    # silently coerced as TEXT due to type-affinity rules.
    library_document_id = Column(
        String(36),
        ForeignKey("documents.id", ondelete="SET NULL"),
        nullable=True,
    )

    # Inline UNIQUE so it lands in CREATE TABLE; column-level unique=True
    # produces a separate CREATE UNIQUE INDEX which SQLCipher does not accept
    # as an FK target for url_hash references.
    __table_args__ = (
        UniqueConstraint("url_hash", name="uq_download_tracker_url_hash"),
    )

    def __repr__(self):
        """
        Return a short debug string with the hash prefix and download status.

        The first 8 characters of ``url_hash`` are shown. An instance whose
        ``url_hash`` has not been assigned yet is rendered with ``None``
        instead of raising ``TypeError`` on the slice.
        """
        status = "downloaded" if self.is_downloaded else "not downloaded"
        # Guard the slice: url_hash is None until the attribute is populated.
        hash_prefix = self.url_hash[:8] if self.url_hash is not None else None
        return f"<DownloadTracker(url_hash={hash_prefix}..., status={status})>"
class DownloadDuplicates(Base):
    """
    Track duplicate URLs across different resources.

    Helps identify when multiple researches reference the same source.
    Each (``url_hash``, ``resource_id``) pair may appear at most once, as
    enforced by the ``uix_url_resource`` constraint.
    """

    __tablename__ = "download_duplicates"

    id = Column(Integer, primary_key=True, autoincrement=True)
    # References the tracked URL by its hash rather than by surrogate id.
    url_hash = Column(
        String(64),
        ForeignKey("download_tracker.url_hash"),
        nullable=False,
        index=True,
    )
    resource_id = Column(
        Integer, ForeignKey("research_resources.id"), nullable=False
    )
    research_id = Column(String(36), nullable=False, index=True)

    added_at = Column(UtcDateTime, default=utcnow(), nullable=False)

    __table_args__ = (
        UniqueConstraint("url_hash", "resource_id", name="uix_url_resource"),
        Index("idx_research_duplicates", "research_id", "url_hash"),
    )

    def __repr__(self):
        """
        Return a short debug string with the hash prefix and resource id.

        The first 8 characters of ``url_hash`` are shown. An instance whose
        ``url_hash`` has not been assigned yet is rendered with ``None``
        instead of raising ``TypeError`` on the slice.
        """
        # Guard the slice: url_hash is None until the attribute is populated.
        hash_prefix = self.url_hash[:8] if self.url_hash is not None else None
        return f"<DownloadDuplicates(url_hash={hash_prefix}..., resource_id={self.resource_id})>"
class DownloadAttempt(Base):
    """
    Log of download attempts for debugging and retry logic.

    Each row records one attempt against a tracked URL (referenced by
    ``url_hash``), together with its outcome and timing.
    """

    __tablename__ = "download_attempts"

    id = Column(Integer, primary_key=True, autoincrement=True)
    url_hash = Column(
        String(64),
        ForeignKey("download_tracker.url_hash"),
        nullable=False,
        index=True,
    )

    # Attempt details
    attempt_number = Column(Integer, nullable=False)
    # HTTP status code
    status_code = Column(Integer, nullable=True)
    # timeout, connection, etc.
    error_type = Column(String(100), nullable=True)
    error_message = Column(Text, nullable=True)

    # Timing
    attempted_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    duration_ms = Column(Integer, nullable=True)

    # Success tracking
    succeeded = Column(Boolean, default=False, nullable=False)
    bytes_downloaded = Column(Integer, nullable=True)

    def __repr__(self):
        """
        Return a short debug string with the attempt number and outcome.

        A failed attempt shows the status code when it is truthy, otherwise
        the error type.
        """
        if self.succeeded:
            outcome = "success"
        else:
            reason = self.status_code or self.error_type
            outcome = f"failed ({reason})"
        return f"<DownloadAttempt(attempt={self.attempt_number}, status={outcome})>"