Coverage for src / local_deep_research / library / download_management / failure_classifier.py: 60%

75 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

1""" 

2Failure Classification System with Inheritance 

3 

4Provides base classes and specific failure types for download attempts. 

5Uses inheritance to organize different failure categories and their retry behavior. 

6""" 

7 

8from abc import ABC 

9from datetime import datetime, timedelta, UTC 

10from typing import Optional 

11from urllib.parse import urlparse 

12 

13from loguru import logger 

14 

15 

16class BaseFailure(ABC): 

17 """Base class for all failure types with common functionality""" 

18 

19 def __init__( 

20 self, 

21 error_type: str, 

22 message: str, 

23 retry_after: Optional[timedelta] = None, 

24 ): 

25 """ 

26 Initialize a failure classification. 

27 

28 Args: 

29 error_type: Machine-readable error identifier 

30 message: Human-readable error description 

31 retry_after: When this failure can be retried (None = permanent) 

32 """ 

33 self.error_type = error_type 

34 self.message = message 

35 self.retry_after = retry_after 

36 self.created_at = datetime.now(UTC) 

37 

38 logger.debug( 

39 f"Created {self.__class__.__name__}: {error_type} - {message}" 

40 ) 

41 

42 def is_permanent(self) -> bool: 

43 """Check if this is a permanent failure (never retry)""" 

44 return self.retry_after is None 

45 

46 def can_retry_now(self) -> bool: 

47 """Check if this resource can be retried right now""" 

48 if self.is_permanent(): 

49 return False 

50 return datetime.now(UTC) >= self.created_at + self.retry_after 

51 

52 def get_cooldown_remaining(self) -> Optional[timedelta]: 

53 """Get remaining cooldown time, or None if no cooldown""" 

54 if self.is_permanent(): 

55 return None 

56 

57 retry_time = self.created_at + self.retry_after 

58 if datetime.now(UTC) < retry_time: 

59 return retry_time - datetime.now(UTC) 

60 return None 

61 

62 def to_dict(self) -> dict: 

63 """Convert to dictionary for database storage""" 

64 return { 

65 "error_type": self.error_type, 

66 "message": self.message, 

67 "retry_after_timestamp": self.created_at + self.retry_after 

68 if self.retry_after 

69 else None, 

70 "is_permanent": self.is_permanent(), 

71 "created_at": self.created_at, 

72 } 

73 

74 

class PermanentFailure(BaseFailure):
    """A failure that is final: the resource should never be retried."""

    def __init__(self, error_type: str, message: str):
        """
        Create a permanent failure.

        Args:
            error_type: Machine-readable error identifier
            message: Human-readable error description
        """
        # No cooldown is supplied; the base class is told explicitly that
        # there is no retry window.
        super().__init__(
            error_type=error_type,
            message=message,
            retry_after=None,
        )

80 

81 

class TemporaryFailure(BaseFailure):
    """A failure that may be retried once its cooldown has passed."""

    def __init__(self, error_type: str, message: str, cooldown: timedelta):
        """
        Create a retryable failure.

        Args:
            error_type: Machine-readable error identifier
            message: Human-readable error description
            cooldown: How long to wait before the resource may be retried
        """
        # The cooldown is handed to the base class as its retry window.
        super().__init__(
            error_type=error_type,
            message=message,
            retry_after=cooldown,
        )

87 

88 

class RateLimitFailure(TemporaryFailure):
    """Domain-specific rate limit handling with longer cooldowns.

    The cooldown is chosen from ``DOMAIN_COOLDOWNS`` by host name. Matching
    is case-insensitive, ignores any userinfo or port, and also applies to
    subdomains of a listed domain (``www.arxiv.org`` uses the ``arxiv.org``
    entry). Hosts that match nothing get ``DEFAULT_COOLDOWN``.
    """

    # Cooldown for domains without a specific entry
    DEFAULT_COOLDOWN = timedelta(hours=1)

    # Domain-specific cooldown periods
    DOMAIN_COOLDOWNS = {
        # General arXiv rate limiting (reCAPTCHA handled separately)
        "arxiv.org": timedelta(hours=6),
        # PubMed rate limits
        "pubmed.ncbi.nlm.nih.gov": timedelta(hours=2),
        # BioRxiv rate limits
        "biorxiv.org": timedelta(hours=6),
        # Semantic Scholar rate limits
        "semanticscholar.org": timedelta(hours=4),
        # ResearchGate rate limits
        "researchgate.net": timedelta(hours=12),
    }

    def __init__(self, domain: str, details: str = ""):
        """
        Create a rate-limit failure for a domain.

        Args:
            domain: Host that imposed the rate limit (a URL netloc is
                accepted); stored unchanged on ``self.domain``
            details: Optional extra text appended to the message
        """
        cooldown = self._cooldown_for(domain)
        message = f"Rate limited by {domain}"
        if details:
            message += f" - {details}"

        super().__init__("rate_limited", message, cooldown)
        self.domain = domain

    @classmethod
    def _cooldown_for(cls, domain: Optional[str]) -> timedelta:
        """Return the cooldown for ``domain``, falling back to the default."""
        host = (domain or "").strip().lower()
        # A netloc may carry "user:password@" before the host.
        host = host.rpartition("@")[2]
        # Drop a ":port" suffix; bracketed IPv6 literals contain colons and
        # never match a listed domain, so leave them untouched.
        if not host.startswith("["):
            host = host.partition(":")[0]
        # Fully-qualified names may end with a trailing dot.
        host = host.rstrip(".")

        for known, cooldown in cls.DOMAIN_COOLDOWNS.items():
            # The leading dot keeps "biorxiv.org" from matching "arxiv.org".
            if host == known or host.endswith("." + known):
                return cooldown
        return cls.DEFAULT_COOLDOWN

116 

117 

class FailureClassifier:
    """Classifies download failures into appropriate types based on error patterns"""

    def classify_failure(
        self,
        error_type: str,
        status_code: Optional[int] = None,
        url: str = "",
        details: str = "",
    ) -> "BaseFailure":
        """
        Classify a download failure based on error information.

        Checks are applied in order: known HTTP status codes, arXiv-specific
        text patterns, generic timeout/network patterns, and finally a
        default temporary failure with a 1-hour cooldown.

        Args:
            error_type: Error type identifier
            status_code: HTTP status code if available
            url: URL that failed
            details: Additional error details

        Returns:
            Appropriate failure classification
        """
        # HTTP Status Code classifications
        if status_code:
            if status_code == 404:
                return PermanentFailure("not_found", "Resource not found (404)")
            elif status_code == 403:
                return PermanentFailure("forbidden", "Access forbidden (403)")
            elif status_code == 410:
                return PermanentFailure(
                    "gone", "Resource permanently removed (410)"
                )
            elif status_code == 429:
                domain = self._extract_domain(url)
                return RateLimitFailure(domain, details)
            elif status_code == 503:
                return TemporaryFailure(
                    "server_error",
                    "Service temporarily unavailable (503)",
                    timedelta(hours=1),
                )
            # Any other status code falls through to the text patterns.

        # Error message pattern classifications
        error_lower = error_type.lower()
        details_lower = details.lower()

        # arXiv specific patterns
        if "arxiv" in error_lower or "arxiv" in details_lower:
            if "recaptcha" in details_lower or "captcha" in details_lower:
                return TemporaryFailure(
                    "recaptcha_protection",
                    "Anti-bot protection active, retry after 3 days",
                    timedelta(days=3),
                )
            if "not a pdf file" in details_lower:
                return PermanentFailure(
                    "incompatible_format", "Content is not a PDF file"
                )
            if (
                "html" in details_lower
                and "application/pdf" not in details_lower
            ):
                return PermanentFailure(
                    "incompatible_format",
                    "Content returned HTML instead of PDF",
                )

        # Common timeout and network errors
        if "timeout" in error_lower:
            return TemporaryFailure(
                "timeout", "Request timed out", timedelta(minutes=30)
            )
        if "connection" in error_lower or "network" in error_lower:
            return TemporaryFailure(
                "network_error",
                "Network connectivity issue",
                timedelta(minutes=5),
            )

        # Default to temporary failure with 1-hour cooldown
        logger.warning(
            f"[FAILURE_CLASSIFIER] Unclassified error: {error_type} - {details}"
        )
        return TemporaryFailure(
            "unknown_error", f"Unknown error: {error_type}", timedelta(hours=1)
        )

    def classify_from_exception(
        self, exception: Exception, url: str = ""
    ) -> "BaseFailure":
        """Classify failure from exception object.

        The exception's class name is used as the error type and its string
        form as the details. If the exception exposes an integer
        ``response.status_code``, that code is forwarded so HTTP errors
        raised as exceptions get the same status-based classification as
        direct calls to ``classify_failure``.

        Args:
            exception: The exception raised by the download attempt
            url: URL that failed

        Returns:
            Appropriate failure classification
        """
        error_type = type(exception).__name__
        details = str(exception)
        return self.classify_failure(
            error_type,
            status_code=self._status_code_from(exception),
            url=url,
            details=details,
        )

    @staticmethod
    def _extract_domain(url: str) -> str:
        """Return the lower-cased host of ``url``, or "unknown" if it has none."""
        if not url:
            return "unknown"
        try:
            # hostname (unlike netloc) excludes userinfo and port, so
            # credentials embedded in a URL never reach messages or logs.
            host = urlparse(url).hostname
        except ValueError:
            # urlparse rejects some malformed URLs (e.g. an unbalanced
            # "[" in the host); classification must not crash on them.
            return "unknown"
        # Scheme-less URLs such as "arxiv.org/abs/1" parse with no host.
        return host or "unknown"

    @staticmethod
    def _status_code_from(exception: BaseException) -> Optional[int]:
        """Return ``exception.response.status_code`` when it is an int, else None."""
        response = getattr(exception, "response", None)
        code = getattr(response, "status_code", None)
        # bool is a subclass of int; never treat True/False as a status code.
        if isinstance(code, int) and not isinstance(code, bool):
            return code
        return None

211 return self.classify_failure(error_type, details=details, url=url)