Coverage for src/local_deep_research/library/download_management/failure_classifier.py: 94%

68 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-03 23:15 +0000

1""" 

2Failure Classification System with Inheritance 

3 

4Provides base classes and specific failure types for download attempts. 

5Uses inheritance to organize different failure categories and their retry behavior. 

6""" 

7 

8from abc import ABC 

9from datetime import datetime, timedelta, UTC 

10from typing import Optional 

11from urllib.parse import urlparse 

12 

13from loguru import logger 

14 

15 

16class BaseFailure(ABC): 

17 """Base class for all failure types with common functionality""" 

18 

19 def __init__( 

20 self, 

21 error_type: str, 

22 message: str, 

23 retry_after: Optional[timedelta] = None, 

24 ): 

25 """ 

26 Initialize a failure classification. 

27 

28 Args: 

29 error_type: Machine-readable error identifier 

30 message: Human-readable error description 

31 retry_after: When this failure can be retried (None = permanent) 

32 """ 

33 self.error_type = error_type 

34 self.message = message 

35 self.retry_after = retry_after 

36 self.created_at = datetime.now(UTC) 

37 

38 logger.debug( 

39 f"Created {self.__class__.__name__}: {error_type} - {message}" 

40 ) 

41 

42 def is_permanent(self) -> bool: 

43 """Check if this is a permanent failure (never retry)""" 

44 return self.retry_after is None 

45 

46 

class PermanentFailure(BaseFailure):
    """Failure for a resource that must never be retried."""

    def __init__(self, error_type: str, message: str):
        """Create a failure with no retry delay (``retry_after`` is None)."""
        super().__init__(
            error_type=error_type,
            message=message,
            retry_after=None,
        )

52 

53 

class TemporaryFailure(BaseFailure):
    """Failure for a resource that may be retried once a cooldown elapses."""

    def __init__(self, error_type: str, message: str, cooldown: timedelta):
        """Create a failure whose ``retry_after`` is the given cooldown."""
        super().__init__(
            error_type=error_type,
            message=message,
            retry_after=cooldown,
        )

59 

60 

class RateLimitFailure(TemporaryFailure):
    """Domain-specific rate limit handling with longer cooldowns.

    The cooldown is chosen from a table of known hosts. Matching is
    case-insensitive, ignores any ``user@`` prefix and ``:port`` suffix, and
    also accepts subdomains (``www.arxiv.org`` and ``export.arxiv.org`` both
    use the ``arxiv.org`` entry). Unknown hosts get the default cooldown.
    """

    # Cooldown applied when the domain matches no known entry.
    _DEFAULT_COOLDOWN = timedelta(hours=1)

    # Domain-specific cooldown periods, keyed by lower-case host name.
    _DOMAIN_COOLDOWNS = {
        # General arXiv rate limiting (reCAPTCHA handled separately)
        "arxiv.org": timedelta(hours=6),
        # PubMed rate limits
        "pubmed.ncbi.nlm.nih.gov": timedelta(hours=2),
        # BioRxiv rate limits
        "biorxiv.org": timedelta(hours=6),
        # Semantic Scholar rate limits
        "semanticscholar.org": timedelta(hours=4),
        # ResearchGate rate limits
        "researchgate.net": timedelta(hours=12),
    }

    def __init__(self, domain: str, details: str = ""):
        """
        Create a rate-limit failure for ``domain``.

        Args:
            domain: Host (or netloc) that rate limited the request. Stored
                unchanged on ``self.domain`` and used verbatim in the message.
            details: Optional extra text appended to the message.
        """
        cooldown = self._cooldown_for(domain)
        message = f"Rate limited by {domain}"
        if details:
            message += f" - {details}"

        super().__init__("rate_limited", message, cooldown)
        self.domain = domain

    @classmethod
    def _cooldown_for(cls, domain: str) -> timedelta:
        """Return the cooldown for ``domain``, matching known hosts and their subdomains."""
        host = (domain or "").strip().lower()
        # Callers may pass a raw netloc: drop "user:pass@" and ":port".
        host = host.rsplit("@", 1)[-1].split(":", 1)[0].rstrip(".")
        for known_host, cooldown in cls._DOMAIN_COOLDOWNS.items():
            if host == known_host or host.endswith("." + known_host):
                return cooldown
        return cls._DEFAULT_COOLDOWN

88 

89 

class FailureClassifier:
    """Classifies download failures into appropriate types based on error patterns.

    Classification order: explicit HTTP status codes first, then
    arXiv-specific text patterns, then generic text patterns
    (paywall/login, HTML-instead-of-PDF, access denied, timeout, network),
    and finally a 1-hour temporary "unknown_error" fallback.
    """

    # Substrings of the lower-cased details that indicate gated content.
    _PAYWALL_PATTERNS = (
        "requires login",
        "subscription",
        "paywall",
        "requires authentication",
    )

    # Substrings of the lower-cased details that indicate a 403-style refusal
    # reported only through the error text (no status code available).
    _ACCESS_DENIED_PATTERNS = (
        "access denied",
        "403 forbidden",
        "http 403",
    )

    @staticmethod
    def _rate_limit_domain(url: str) -> str:
        """Return the lower-cased host of ``url``, or "unknown" if none can be parsed.

        ``hostname`` is used rather than ``netloc`` so that credentials and
        the port never end up in the failure message or the cooldown lookup.
        """
        if not url:
            return "unknown"
        try:
            host = urlparse(url).hostname
        except ValueError:
            # urlparse rejects some malformed URLs (e.g. a broken IPv6 literal).
            host = None
        return host or "unknown"

    def classify_failure(
        self,
        error_type: str,
        status_code: Optional[int] = None,
        url: str = "",
        details: str = "",
    ) -> BaseFailure:
        """
        Classify a download failure based on error information.

        Args:
            error_type: Error type identifier
            status_code: HTTP status code if available
            url: URL that failed
            details: Additional error details

        Returns:
            Appropriate failure classification
        """
        # HTTP Status Code classifications
        if status_code:
            if status_code == 404:
                return PermanentFailure("not_found", "Resource not found (404)")
            if status_code == 403:
                return PermanentFailure("forbidden", "Access forbidden (403)")
            if status_code == 410:
                return PermanentFailure(
                    "gone", "Resource permanently removed (410)"
                )
            if status_code == 429:
                return RateLimitFailure(self._rate_limit_domain(url), details)
            if status_code == 503:
                return TemporaryFailure(
                    "server_error",
                    "Service temporarily unavailable (503)",
                    timedelta(hours=1),
                )

        # Error message pattern classifications. Tolerate None so that a
        # caller with missing information still gets a classification.
        error_lower = (error_type or "").lower()
        details_lower = (details or "").lower()

        # arXiv specific patterns
        if "arxiv" in error_lower or "arxiv" in details_lower:
            if "recaptcha" in details_lower or "captcha" in details_lower:
                return TemporaryFailure(
                    "recaptcha_protection",
                    "Anti-bot protection active, retry after 3 days",
                    timedelta(days=3),
                )
            if "not a pdf file" in details_lower:
                return PermanentFailure(
                    "incompatible_format", "Content is not a PDF file"
                )
            if (
                "html" in details_lower
                and "application/pdf" not in details_lower
            ):
                return PermanentFailure(
                    "incompatible_format",
                    "Content returned HTML instead of PDF",
                )

        # Login/subscription/paywall errors — these are permanent
        if any(
            pattern in details_lower for pattern in self._PAYWALL_PATTERNS
        ):
            return PermanentFailure(
                "paywall_or_login",
                "Requires login or subscription",
            )

        # HTML content instead of PDF (non-arXiv) — permanent
        if "text/html" in details_lower and (
            "unexpected content type" in details_lower
            or "not a pdf" in details_lower
        ):
            return PermanentFailure(
                "incompatible_format",
                "Content returned HTML instead of PDF",
            )

        # Access denied patterns in error messages (without status code)
        if any(
            pattern in details_lower
            for pattern in self._ACCESS_DENIED_PATTERNS
        ):
            return PermanentFailure(
                "forbidden", "Access denied based on error message"
            )

        # Common timeout and network errors
        if "timeout" in error_lower or "timed out" in details_lower:
            return TemporaryFailure(
                "timeout", "Request timed out", timedelta(minutes=30)
            )
        if "connection" in error_lower or "network" in error_lower:
            return TemporaryFailure(
                "network_error",
                "Network connectivity issue",
                timedelta(minutes=5),
            )

        # Default to temporary failure with 1-hour cooldown
        logger.warning(
            f"[FAILURE_CLASSIFIER] Unclassified error: {error_type} - {details}"
        )
        return TemporaryFailure(
            "unknown_error", f"Unknown error: {error_type}", timedelta(hours=1)
        )

    def classify_from_exception(
        self, exception: Exception, url: str = ""
    ) -> BaseFailure:
        """Classify failure from exception object.

        The exception's class name is used as the error type and its string
        form as the details; no HTTP status code is passed along.
        """
        error_type = type(exception).__name__
        details = str(exception)
        return self.classify_failure(error_type, details=details, url=url)

218 return self.classify_failure(error_type, details=details, url=url)