Coverage for src/local_deep_research/database/models/download_tracker.py: 100%

48 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-03 23:15 +0000

1""" 

2Download tracking models for deduplication and efficient checking. 

3Separate from library models to keep tracking lightweight. 

4""" 

5 

6from sqlalchemy import ( 

7 Boolean, 

8 Column, 

9 ForeignKey, 

10 Index, 

11 Integer, 

12 String, 

13 Text, 

14 UniqueConstraint, 

15) 

16from sqlalchemy_utc import UtcDateTime, utcnow 

17 

18from .base import Base 

19 

20 

class DownloadTracker(Base):
    """
    Lightweight table to track which URLs have been downloaded.
    Used for quick deduplication checks before attempting downloads.

    Each row is identified by ``url_hash``, which carries a table-level
    UNIQUE constraint so that other tables can reference it with a
    foreign key.
    """

    __tablename__ = "download_tracker"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # URL tracking
    url = Column(Text, nullable=False)  # Original URL
    url_hash = Column(
        String(64), nullable=False
    )  # SHA256 of normalized URL — UNIQUE backing comes from __table_args__

    # Resource tracking (can be multiple resources with same URL)
    first_resource_id = Column(
        Integer, ForeignKey("research_resources.id"), nullable=False
    )

    # File tracking
    file_hash = Column(
        String(64), nullable=True, index=True
    )  # SHA256 of downloaded content
    file_path = Column(
        Text, nullable=True
    )  # Relative path from library root (e.g., "2024/12/arxiv_2401_12345.pdf")
    # NOTE: Absolute path removed - will be computed at runtime from library root + relative path
    file_name = Column(
        String(255), nullable=True, index=True
    )  # Just the filename for searching
    file_size = Column(Integer, nullable=True)

    # Status
    is_downloaded = Column(Boolean, default=False, nullable=False, index=True)
    is_accessible = Column(Boolean, default=True)  # False if 404, 403, etc.

    # Timestamps
    first_seen = Column(UtcDateTime, default=utcnow(), nullable=False)
    downloaded_at = Column(UtcDateTime, nullable=True)
    last_checked = Column(UtcDateTime, default=utcnow(), nullable=False)

    # Link to full document record if it exists. documents.id is String(36) UUID;
    # the previous Integer declaration produced an FK type mismatch that SQLite
    # silently coerced as TEXT due to type-affinity rules.
    library_document_id = Column(
        String(36),
        ForeignKey("documents.id", ondelete="SET NULL"),
        nullable=True,
    )

    # Inline UNIQUE so it lands in CREATE TABLE; column-level unique=True
    # produces a separate CREATE UNIQUE INDEX which SQLCipher does not accept
    # as an FK target for url_hash references.
    __table_args__ = (
        UniqueConstraint("url_hash", name="uq_download_tracker_url_hash"),
    )

    def __repr__(self):
        """Return a short debug string with the hash prefix and download status."""
        status = "downloaded" if self.is_downloaded else "not downloaded"
        # url_hash is None until it has been assigned; slicing None would make
        # repr() itself raise TypeError, so fall back to an empty prefix.
        hash_prefix = (self.url_hash or "")[:8]
        return f"<DownloadTracker(url_hash={hash_prefix}..., status={status})>"

83 

84 

class DownloadDuplicates(Base):
    """
    Track duplicate URLs across different resources.
    Helps identify when multiple researches reference the same source.

    A given (url_hash, resource_id) pair can appear only once; see the
    UNIQUE constraint in ``__table_args__``.
    """

    __tablename__ = "download_duplicates"

    id = Column(Integer, primary_key=True, autoincrement=True)
    # Points at the tracker row for this URL via its hash.
    url_hash = Column(
        String(64),
        ForeignKey("download_tracker.url_hash"),
        nullable=False,
        index=True,
    )
    resource_id = Column(
        Integer, ForeignKey("research_resources.id"), nullable=False
    )
    research_id = Column(String(36), nullable=False, index=True)

    added_at = Column(UtcDateTime, default=utcnow(), nullable=False)

    __table_args__ = (
        UniqueConstraint("url_hash", "resource_id", name="uix_url_resource"),
        # Composite index on research_id followed by url_hash.
        Index("idx_research_duplicates", "research_id", "url_hash"),
    )

    def __repr__(self):
        """Return a short debug string with the hash prefix and resource id."""
        # url_hash is None until it has been assigned; slicing None would make
        # repr() itself raise TypeError, so fall back to an empty prefix.
        hash_prefix = (self.url_hash or "")[:8]
        return f"<DownloadDuplicates(url_hash={hash_prefix}..., resource_id={self.resource_id})>"

114 

115 

class DownloadAttempt(Base):
    """
    Record of a single download try, kept for debugging and retry logic.
    """

    __tablename__ = "download_attempts"

    id = Column(Integer, autoincrement=True, primary_key=True)
    # Hash of the URL this attempt belongs to (FK to the tracker table).
    url_hash = Column(
        String(64),
        ForeignKey("download_tracker.url_hash"),
        index=True,
        nullable=False,
    )

    # What happened on this attempt
    attempt_number = Column(Integer, nullable=False)
    status_code = Column(Integer, nullable=True)  # HTTP status code
    error_type = Column(
        String(100), nullable=True
    )  # timeout, connection, etc.
    error_message = Column(Text, nullable=True)

    # When it ran and how long it took
    attempted_at = Column(UtcDateTime, nullable=False, default=utcnow())
    duration_ms = Column(Integer, nullable=True)

    # Outcome
    succeeded = Column(Boolean, nullable=False, default=False)
    bytes_downloaded = Column(Integer, nullable=True)

    def __repr__(self):
        """Return a short debug string with the attempt number and outcome."""
        if self.succeeded:
            outcome = "success"
        else:
            # Prefer the HTTP status code; fall back to the error type when
            # the status code is missing (or falsy).
            reason = self.status_code or self.error_type
            outcome = f"failed ({reason})"
        return f"<DownloadAttempt(attempt={self.attempt_number}, status={outcome})>"