Coverage for src/local_deep_research/library/download_management/failure_classifier.py

1"""

2Failure Classification System with Inheritance

4Provides base classes and specific failure types for download attempts.

5Uses inheritance to organize different failure categories and their retry behavior.

6"""

8from abc import ABC

9from datetime import datetime, timedelta, UTC

10from typing import Optional

11from urllib.parse import urlparse

13from loguru import logger

16class BaseFailure(ABC):

17 """Base class for all failure types with common functionality"""

19 def __init__(

20 self,

21 error_type: str,

22 message: str,

23 retry_after: Optional[timedelta] = None,

24 ):

25 """

26 Initialize a failure classification.

28 Args:

29 error_type: Machine-readable error identifier

30 message: Human-readable error description

31 retry_after: When this failure can be retried (None = permanent)

32 """

33 self.error_type = error_type

34 self.message = message

35 self.retry_after = retry_after

36 self.created_at = datetime.now(UTC)

38 logger.debug(

39 f"Created {self.__class__.__name__}: {error_type} - {message}"

40 )

42 def is_permanent(self) -> bool:

43 """Check if this is a permanent failure (never retry)"""

44 return self.retry_after is None

46 def can_retry_now(self) -> bool:

47 """Check if this resource can be retried right now"""

48 if self.is_permanent():

49 return False

50 return datetime.now(UTC) >= self.created_at + self.retry_after

52 def get_cooldown_remaining(self) -> Optional[timedelta]:

53 """Get remaining cooldown time, or None if no cooldown"""

54 if self.is_permanent():

55 return None

57 retry_time = self.created_at + self.retry_after

58 if datetime.now(UTC) < retry_time:

59 return retry_time - datetime.now(UTC)

60 return None

62 def to_dict(self) -> dict:

63 """Convert to dictionary for database storage"""

64 return {

65 "error_type": self.error_type,

66 "message": self.message,

67 "retry_after_timestamp": self.created_at + self.retry_after

68 if self.retry_after

69 else None,

70 "is_permanent": self.is_permanent(),

71 "created_at": self.created_at,

72 }

class PermanentFailure(BaseFailure):
    """Failure for a resource that must not be attempted again."""

    def __init__(self, error_type: str, message: str):
        # No retry window: retry_after=None is what marks the failure permanent.
        super().__init__(
            error_type=error_type,
            message=message,
            retry_after=None,
        )

class TemporaryFailure(BaseFailure):
    """Failure for a resource that may be attempted again after a cooldown."""

    def __init__(self, error_type: str, message: str, cooldown: timedelta):
        # The cooldown is stored by the base class as its retry_after window.
        super().__init__(
            error_type=error_type,
            message=message,
            retry_after=cooldown,
        )

class RateLimitFailure(TemporaryFailure):
    """Domain-specific rate limit handling with longer cooldowns

    The cooldown is chosen from ``DOMAIN_COOLDOWNS`` by host name. The lookup
    is case-insensitive, ignores any ``user@`` prefix and ``:port`` suffix,
    and also matches subdomains (``www.arxiv.org`` and ``export.arxiv.org``
    use the ``arxiv.org`` entry). Hosts without an entry get
    ``DEFAULT_COOLDOWN``.
    """

    # Cooldown for domains that have no specific entry below
    DEFAULT_COOLDOWN = timedelta(hours=1)

    # Domain-specific cooldown periods
    DOMAIN_COOLDOWNS = {
        # General arXiv rate limiting (reCAPTCHA handled separately)
        "arxiv.org": timedelta(hours=6),
        # PubMed rate limits
        "pubmed.ncbi.nlm.nih.gov": timedelta(hours=2),
        # BioRxiv rate limits
        "biorxiv.org": timedelta(hours=6),
        # Semantic Scholar rate limits
        "semanticscholar.org": timedelta(hours=4),
        # ResearchGate rate limits
        "researchgate.net": timedelta(hours=12),
    }

    def __init__(self, domain: str, details: str = ""):
        """
        Create a rate-limit failure for ``domain``.

        Args:
            domain: Host (or URL network location) that applied the limit.
                Stored unchanged on ``self.domain`` and used verbatim in the
                message; only the cooldown lookup normalizes it.
            details: Optional extra text appended to the message
        """
        cooldown = self._cooldown_for(domain)
        message = f"Rate limited by {domain}"
        if details:
            message += f" - {details}"

        super().__init__("rate_limited", message, cooldown)
        self.domain = domain

    @classmethod
    def _cooldown_for(cls, domain: str) -> timedelta:
        """Return the cooldown for ``domain``, matching known hosts and their subdomains."""
        host = (domain or "").strip().lower()
        # A URL netloc may carry "user:pass@" and ":port"; neither is part of
        # the host name the table is keyed on.
        host = host.rsplit("@", 1)[-1]
        host = host.partition(":")[0].rstrip(".")

        for known_host, cooldown in cls.DOMAIN_COOLDOWNS.items():
            if host == known_host or host.endswith("." + known_host):
                return cooldown
        return cls.DEFAULT_COOLDOWN

116

117

class FailureClassifier:
    """Classifies download failures into appropriate types based on error patterns"""

    @staticmethod
    def _domain_from_url(url: str) -> str:
        """Return the network location of ``url``, or "unknown" if none can be extracted."""
        if not url:
            return "unknown"
        try:
            netloc = urlparse(url).netloc
        except ValueError:
            # urlparse rejects some malformed URLs (e.g. unbalanced IPv6
            # brackets); a bad URL must not abort failure classification.
            return "unknown"
        # Scheme-less URLs such as "arxiv.org/abs/1" parse with an empty netloc.
        return netloc or "unknown"

    def classify_failure(
        self,
        error_type: str,
        status_code: Optional[int] = None,
        url: str = "",
        details: str = "",
    ) -> BaseFailure:
        """
        Classify a download failure based on error information.

        Classification order: known HTTP status codes first, then
        arXiv-specific patterns, then generic timeout / network patterns in
        ``error_type``. Anything else becomes an "unknown_error" temporary
        failure with a 1-hour cooldown and is logged as a warning.

        Args:
            error_type: Error type identifier
            status_code: HTTP status code if available
            url: URL that failed
            details: Additional error details

        Returns:
            Appropriate failure classification
        """
        # HTTP Status Code classifications
        if status_code:
            if status_code == 404:
                return PermanentFailure("not_found", "Resource not found (404)")
            elif status_code == 403:
                return PermanentFailure("forbidden", "Access forbidden (403)")
            elif status_code == 410:
                return PermanentFailure(
                    "gone", "Resource permanently removed (410)"
                )
            elif status_code == 429:
                return RateLimitFailure(self._domain_from_url(url), details)
            elif status_code == 503:
                return TemporaryFailure(
                    "server_error",
                    "Service temporarily unavailable (503)",
                    timedelta(hours=1),
                )

        # Error message pattern classifications. Tolerate None so a caller
        # passing a missing value gets a classification instead of an
        # AttributeError.
        error_lower = (error_type or "").lower()
        details_lower = (details or "").lower()

        # arXiv specific patterns
        if "arxiv" in error_lower or "arxiv" in details_lower:
            # "captcha" also covers "recaptcha"
            if "captcha" in details_lower:
                return TemporaryFailure(
                    "recaptcha_protection",
                    "Anti-bot protection active, retry after 3 days",
                    timedelta(days=3),
                )
            if "not a pdf file" in details_lower:
                return PermanentFailure(
                    "incompatible_format", "Content is not a PDF file"
                )
            if (
                "html" in details_lower
                and "application/pdf" not in details_lower
            ):
                return PermanentFailure(
                    "incompatible_format",
                    "Content returned HTML instead of PDF",
                )

        # Common timeout and network errors
        if "timeout" in error_lower:
            return TemporaryFailure(
                "timeout", "Request timed out", timedelta(minutes=30)
            )
        if "connection" in error_lower or "network" in error_lower:
            return TemporaryFailure(
                "network_error",
                "Network connectivity issue",
                timedelta(minutes=5),
            )

        # Default to temporary failure with 1-hour cooldown
        logger.warning(
            f"[FAILURE_CLASSIFIER] Unclassified error: {error_type} - {details}"
        )
        return TemporaryFailure(
            "unknown_error", f"Unknown error: {error_type}", timedelta(hours=1)
        )

    def classify_from_exception(
        self, exception: Exception, url: str = ""
    ) -> BaseFailure:
        """Classify failure from exception object

        The exception's class name is used as the error type and its string
        form as the details; no HTTP status code is passed on.
        """
        error_type = type(exception).__name__
        details = str(exception)
        return self.classify_failure(error_type, details=details, url=url)

Coverage for src / local_deep_research / library / download_management / failure_classifier.py: 100%

75 statements