Coverage for src / local_deep_research / database / models / download_tracker.py: 89%

47 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

1""" 

2Download tracking models for deduplication and efficient checking. 

3Separate from library models to keep tracking lightweight. 

4""" 

5 

6from sqlalchemy import ( 

7 Boolean, 

8 Column, 

9 ForeignKey, 

10 Index, 

11 Integer, 

12 String, 

13 Text, 

14 UniqueConstraint, 

15) 

16from sqlalchemy_utc import UtcDateTime, utcnow 

17 

18from .base import Base 

19 

20 

class DownloadTracker(Base):
    """
    Lightweight table to track which URLs have been downloaded.

    Used for quick deduplication checks before attempting downloads.
    Rows are keyed for lookup by ``url_hash``, which carries a unique
    constraint and an index.
    """

    __tablename__ = "download_tracker"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # URL tracking
    url = Column(Text, nullable=False)  # Original URL
    url_hash = Column(
        String(64), nullable=False, unique=True, index=True
    )  # SHA256 of normalized URL

    # Resource tracking (can be multiple resources with same URL)
    first_resource_id = Column(
        Integer, ForeignKey("research_resources.id"), nullable=False
    )

    # File tracking
    file_hash = Column(
        String(64), nullable=True, index=True
    )  # SHA256 of downloaded content
    file_path = Column(
        Text, nullable=True
    )  # Relative path from library root (e.g., "2024/12/arxiv_2401_12345.pdf")
    # NOTE: Absolute path removed - will be computed at runtime from library root + relative path
    file_name = Column(
        String(255), nullable=True, index=True
    )  # Just the filename for searching
    file_size = Column(Integer, nullable=True)

    # Status
    is_downloaded = Column(Boolean, default=False, nullable=False, index=True)
    is_accessible = Column(Boolean, default=True)  # False if 404, 403, etc.

    # Timestamps
    # NOTE(review): utcnow() here builds a sqlalchemy_utc SQL expression used
    # as the column default, not a Python datetime captured at import time --
    # confirm against the sqlalchemy_utc docs.
    first_seen = Column(UtcDateTime, default=utcnow(), nullable=False)
    downloaded_at = Column(UtcDateTime, nullable=True)
    last_checked = Column(UtcDateTime, default=utcnow(), nullable=False)

    # Link to full document record if it exists
    library_document_id = Column(
        Integer,
        ForeignKey("documents.id", ondelete="SET NULL"),
        nullable=True,
    )

    def __repr__(self):
        """Return a short debug string with the hash prefix and download status.

        Safe to call on an instance whose ``url_hash`` has not been set yet:
        the hash is then rendered as ``None`` instead of raising ``TypeError``
        from slicing ``None``.
        """
        status = "downloaded" if self.is_downloaded else "not downloaded"
        # Only the first 8 characters of the hash are shown to keep it short.
        if self.url_hash is None:
            hash_label = "None"
        else:
            hash_label = f"{self.url_hash[:8]}..."
        return f"<DownloadTracker(url_hash={hash_label}, status={status})>"

74 

75 

class DownloadDuplicates(Base):
    """
    Track duplicate URLs across different resources.

    Helps identify when multiple researches reference the same source.
    Each (``url_hash``, ``resource_id``) pair may appear at most once,
    enforced by the ``uix_url_resource`` unique constraint.
    """

    __tablename__ = "download_duplicates"

    id = Column(Integer, primary_key=True, autoincrement=True)
    # References the tracker row through its unique url_hash column.
    url_hash = Column(
        String(64),
        ForeignKey("download_tracker.url_hash"),
        nullable=False,
        index=True,
    )
    resource_id = Column(
        Integer, ForeignKey("research_resources.id"), nullable=False
    )
    research_id = Column(String(36), nullable=False, index=True)

    # NOTE(review): utcnow() builds a sqlalchemy_utc SQL expression used as
    # the column default -- confirm against the sqlalchemy_utc docs.
    added_at = Column(UtcDateTime, default=utcnow(), nullable=False)

    __table_args__ = (
        UniqueConstraint("url_hash", "resource_id", name="uix_url_resource"),
        # Composite index over (research_id, url_hash).
        Index("idx_research_duplicates", "research_id", "url_hash"),
    )

    def __repr__(self):
        """Return a short debug string with the hash prefix and resource id.

        Safe to call on an instance whose ``url_hash`` has not been set yet:
        the hash is then rendered as ``None`` instead of raising ``TypeError``
        from slicing ``None``.
        """
        if self.url_hash is None:
            hash_label = "None"
        else:
            hash_label = f"{self.url_hash[:8]}..."
        return f"<DownloadDuplicates(url_hash={hash_label}, resource_id={self.resource_id})>"

105 

106 

class DownloadAttempt(Base):
    """
    Log of download attempts for debugging and retry logic.

    Each row records one attempt against the tracker entry identified
    by ``url_hash``.
    """

    __tablename__ = "download_attempts"

    id = Column(Integer, primary_key=True, autoincrement=True)
    # References the tracker row through its url_hash column.
    url_hash = Column(
        String(64),
        ForeignKey("download_tracker.url_hash"),
        nullable=False,
        index=True,
    )

    # What happened during the attempt
    attempt_number = Column(Integer, nullable=False)
    status_code = Column(Integer, nullable=True)  # HTTP status code
    error_type = Column(String(100), nullable=True)  # timeout, connection, etc.
    error_message = Column(Text, nullable=True)

    # When it happened and how long it took
    attempted_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    duration_ms = Column(Integer, nullable=True)

    # Outcome
    succeeded = Column(Boolean, default=False, nullable=False)
    bytes_downloaded = Column(Integer, nullable=True)

    def __repr__(self):
        """Return a short debug string with the attempt number and outcome."""
        if self.succeeded:
            outcome = "success"
        else:
            # Show the status code when truthy, otherwise the error type.
            reason = self.status_code or self.error_type
            outcome = f"failed ({reason})"
        return f"<DownloadAttempt(attempt={self.attempt_number}, status={outcome})>"

144 )