Coverage for src/local_deep_research/database/models/download_tracker.py

1"""

2Download tracking models for deduplication and efficient checking.

3Separate from library models to keep tracking lightweight.

4"""

6from sqlalchemy import (

7 Boolean,

8 Column,

9 ForeignKey,

10 Index,

11 Integer,

12 String,

13 Text,

14 UniqueConstraint,

15)

16from sqlalchemy_utc import UtcDateTime, utcnow

18from .base import Base

class DownloadTracker(Base):
    """
    Lightweight table to track which URLs have been downloaded.
    Used for quick deduplication checks before attempting downloads.

    ``url_hash`` is declared unique, so each hashed URL has at most one row
    here; the ``download_duplicates`` and ``download_attempts`` tables point
    back to it through foreign keys on ``download_tracker.url_hash``.
    """

    __tablename__ = "download_tracker"

    id = Column(Integer, primary_key=True, autoincrement=True)

    # URL tracking
    url = Column(Text, nullable=False)  # Original URL
    url_hash = Column(
        String(64), nullable=False, unique=True, index=True
    )  # SHA256 of normalized URL

    # Resource tracking (can be multiple resources with same URL)
    first_resource_id = Column(
        Integer, ForeignKey("research_resources.id"), nullable=False
    )

    # File tracking
    file_hash = Column(
        String(64), nullable=True, index=True
    )  # SHA256 of downloaded content
    file_path = Column(
        Text, nullable=True
    )  # Relative path from library root (e.g., "2024/12/arxiv_2401_12345.pdf")
    # NOTE: Absolute path removed - will be computed at runtime from library root + relative path
    file_name = Column(
        String(255), nullable=True, index=True
    )  # Just the filename for searching
    file_size = Column(Integer, nullable=True)

    # Status
    is_downloaded = Column(Boolean, default=False, nullable=False, index=True)
    # No nullable=False here, so this column also accepts NULL.
    is_accessible = Column(Boolean, default=True)  # False if 404, 403, etc.

    # Timestamps
    # NOTE(review): utcnow() is sqlalchemy_utc's SQL expression object, not a
    # Python datetime captured at import time - confirm against its docs.
    first_seen = Column(UtcDateTime, default=utcnow(), nullable=False)
    downloaded_at = Column(UtcDateTime, nullable=True)
    last_checked = Column(UtcDateTime, default=utcnow(), nullable=False)

    # Link to full document record if it exists
    library_document_id = Column(
        Integer,
        ForeignKey("documents.id", ondelete="SET NULL"),
        nullable=True,
    )

    def __repr__(self):
        """Return a short debug string with the hash prefix and download status."""
        status = "downloaded" if self.is_downloaded else "not downloaded"
        # url_hash is None until it has been assigned; slicing None would
        # raise TypeError and make repr() unusable on such instances.
        if self.url_hash is not None:
            hash_prefix = f"{self.url_hash[:8]}..."
        else:
            hash_prefix = "None"
        return f"<DownloadTracker(url_hash={hash_prefix}, status={status})>"

class DownloadDuplicates(Base):
    """
    Track duplicate URLs across different resources.
    Helps identify when multiple researches reference the same source.

    Each row ties one ``url_hash`` (a foreign key to
    ``download_tracker.url_hash``) to one resource; the pair is unique.
    """

    __tablename__ = "download_duplicates"

    id = Column(Integer, primary_key=True, autoincrement=True)
    url_hash = Column(
        String(64),
        ForeignKey("download_tracker.url_hash"),
        nullable=False,
        index=True,
    )
    resource_id = Column(
        Integer, ForeignKey("research_resources.id"), nullable=False
    )
    # Plain string column (no foreign key declared on it here).
    research_id = Column(String(36), nullable=False, index=True)

    added_at = Column(UtcDateTime, default=utcnow(), nullable=False)

    __table_args__ = (
        # A given resource may be recorded against a given URL hash only once.
        UniqueConstraint("url_hash", "resource_id", name="uix_url_resource"),
        # Composite index over (research_id, url_hash).
        Index("idx_research_duplicates", "research_id", "url_hash"),
    )

    def __repr__(self):
        """Return a short debug string with the hash prefix and resource id."""
        # url_hash is None until it has been assigned; slicing None would
        # raise TypeError and make repr() unusable on such instances.
        if self.url_hash is not None:
            hash_prefix = f"{self.url_hash[:8]}..."
        else:
            hash_prefix = "None"
        return f"<DownloadDuplicates(url_hash={hash_prefix}, resource_id={self.resource_id})>"

105

106

class DownloadAttempt(Base):
    """
    Per-attempt download log, kept for debugging and retry logic.

    Rows reference the tracked URL through ``download_tracker.url_hash``.
    """

    __tablename__ = "download_attempts"

    id = Column(Integer, primary_key=True, autoincrement=True)
    url_hash = Column(
        String(64),
        ForeignKey("download_tracker.url_hash"),
        nullable=False,
        index=True,
    )

    # --- What happened on this attempt ---
    attempt_number = Column(Integer, nullable=False)
    # HTTP status code
    status_code = Column(Integer, nullable=True)
    # timeout, connection, etc.
    error_type = Column(String(100), nullable=True)
    error_message = Column(Text, nullable=True)

    # --- When it happened and how long it took ---
    attempted_at = Column(UtcDateTime, default=utcnow(), nullable=False)
    duration_ms = Column(Integer, nullable=True)

    # --- Outcome ---
    succeeded = Column(Boolean, default=False, nullable=False)
    bytes_downloaded = Column(Integer, nullable=True)

    def __repr__(self):
        """Return a short debug string with the attempt number and outcome."""
        if self.succeeded:
            outcome = "success"
        else:
            # Prefer the HTTP status code; fall back to the error type when
            # the status code is falsy (e.g. None).
            outcome = f"failed ({self.status_code or self.error_type})"
        return f"<DownloadAttempt(attempt={self.attempt_number}, status={outcome})>"

Coverage for src / local_deep_research / database / models / download_tracker.py: 89%

47 statements