Coverage for src / local_deep_research / database / models / download_tracker.py: 89%
47 statements
« prev ^ index » next coverage.py v7.12.0, created at 2026-01-11 00:51 +0000
« prev ^ index » next coverage.py v7.12.0, created at 2026-01-11 00:51 +0000
1"""
2Download tracking models for deduplication and efficient checking.
3Separate from library models to keep tracking lightweight.
4"""
6from sqlalchemy import (
7 Boolean,
8 Column,
9 ForeignKey,
10 Index,
11 Integer,
12 String,
13 Text,
14 UniqueConstraint,
15)
16from sqlalchemy_utc import UtcDateTime, utcnow
18from .base import Base
class DownloadTracker(Base):
    """
    Lightweight table to track which URLs have been downloaded.
    Used for quick deduplication checks before attempting downloads.

    Each row is keyed by ``url_hash`` (declared unique and indexed), and
    carries optional file metadata, status flags, timestamps, and an
    optional link to a ``documents`` row.
    """

    __tablename__ = "download_tracker"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # URL tracking
    url = Column(Text, nullable=False)  # Original URL
    url_hash = Column(
        String(64), nullable=False, unique=True, index=True
    )  # SHA256 of normalized URL

    # Resource tracking (can be multiple resources with same URL)
    first_resource_id = Column(
        Integer, ForeignKey("research_resources.id"), nullable=False
    )

    # File tracking
    file_hash = Column(
        String(64), nullable=True, index=True
    )  # SHA256 of downloaded content
    file_path = Column(
        Text, nullable=True
    )  # Relative path from library root (e.g., "2024/12/arxiv_2401_12345.pdf")
    # NOTE: Absolute path removed - will be computed at runtime from library root + relative path
    file_name = Column(
        String(255), nullable=True, index=True
    )  # Just the filename for searching
    file_size = Column(Integer, nullable=True)

    # Status
    is_downloaded = Column(Boolean, default=False, nullable=False, index=True)
    is_accessible = Column(Boolean, default=True)  # False if 404, 403, etc.

    # Timestamps
    first_seen = Column(UtcDateTime, default=utcnow(), nullable=False)
    downloaded_at = Column(UtcDateTime, nullable=True)
    last_checked = Column(UtcDateTime, default=utcnow(), nullable=False)

    # Link to full document record if it exists
    # (foreign key declared with ON DELETE SET NULL)
    library_document_id = Column(
        Integer,
        ForeignKey("documents.id", ondelete="SET NULL"),
        nullable=True,
    )

    def __repr__(self):
        """Return a short debug string with the hash prefix and status.

        An instance whose ``url_hash`` has not been assigned yet is shown
        as ``url_hash=None`` instead of raising ``TypeError`` on the slice.
        """
        status = "downloaded" if self.is_downloaded else "not downloaded"
        if self.url_hash is None:
            # Attribute not populated yet; slicing None would raise.
            hash_repr = "None"
        else:
            hash_repr = f"{self.url_hash[:8]}..."
        return f"<DownloadTracker(url_hash={hash_repr}, status={status})>"
class DownloadDuplicates(Base):
    """
    Track duplicate URLs across different resources.
    Helps identify when multiple researches reference the same source.

    A (url_hash, resource_id) pair may appear only once (unique constraint
    ``uix_url_resource``); a composite index covers
    (research_id, url_hash).
    """

    __tablename__ = "download_duplicates"

    id = Column(Integer, primary_key=True, autoincrement=True)
    # References the tracker row through its unique url_hash column.
    url_hash = Column(
        String(64),
        ForeignKey("download_tracker.url_hash"),
        nullable=False,
        index=True,
    )
    resource_id = Column(
        Integer, ForeignKey("research_resources.id"), nullable=False
    )
    research_id = Column(String(36), nullable=False, index=True)

    added_at = Column(UtcDateTime, default=utcnow(), nullable=False)

    __table_args__ = (
        UniqueConstraint("url_hash", "resource_id", name="uix_url_resource"),
        Index("idx_research_duplicates", "research_id", "url_hash"),
    )

    def __repr__(self):
        """Return a short debug string with the hash prefix and resource id.

        An instance whose ``url_hash`` has not been assigned yet is shown
        as ``url_hash=None`` instead of raising ``TypeError`` on the slice.
        """
        if self.url_hash is None:
            # Attribute not populated yet; slicing None would raise.
            hash_repr = "None"
        else:
            hash_repr = f"{self.url_hash[:8]}..."
        return f"<DownloadDuplicates(url_hash={hash_repr}, resource_id={self.resource_id})>"
class DownloadAttempt(Base):
    """
    Per-attempt download log, kept for debugging and retry logic.
    """

    __tablename__ = "download_attempts"

    id = Column(Integer, primary_key=True, autoincrement=True)
    # Points at the tracker row via its url_hash column.
    url_hash = Column(
        String(64),
        ForeignKey("download_tracker.url_hash"),
        nullable=False,
        index=True,
    )

    # What happened on this attempt
    attempt_number = Column(Integer, nullable=False)
    status_code = Column(Integer, nullable=True)  # HTTP status code
    error_type = Column(String(100), nullable=True)  # timeout, connection, etc.
    error_message = Column(Text, nullable=True)

    # When it ran and how long it took
    attempted_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    duration_ms = Column(Integer, nullable=True)

    # Outcome
    succeeded = Column(Boolean, default=False, nullable=False)
    bytes_downloaded = Column(Integer, nullable=True)

    def __repr__(self):
        """Return a short debug string with the attempt number and outcome."""
        if self.succeeded:
            outcome = "success"
        else:
            # Prefer the status code; fall back to the error type when the
            # status code is falsy.
            reason = self.status_code or self.error_type
            outcome = f"failed ({reason})"
        return f"<DownloadAttempt(attempt={self.attempt_number}, status={outcome})>"