Coverage for src / local_deep_research / library / download_management / failure_classifier.py: 95%
81 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
1"""
2Failure Classification System with Inheritance
4Provides base classes and specific failure types for download attempts.
5Uses inheritance to organize different failure categories and their retry behavior.
6"""
8from abc import ABC
9from datetime import datetime, timedelta, UTC
10from typing import Optional
11from urllib.parse import urlparse
13from loguru import logger
16class BaseFailure(ABC):
17 """Base class for all failure types with common functionality"""
19 def __init__(
20 self,
21 error_type: str,
22 message: str,
23 retry_after: Optional[timedelta] = None,
24 ):
25 """
26 Initialize a failure classification.
28 Args:
29 error_type: Machine-readable error identifier
30 message: Human-readable error description
31 retry_after: When this failure can be retried (None = permanent)
32 """
33 self.error_type = error_type
34 self.message = message
35 self.retry_after = retry_after
36 self.created_at = datetime.now(UTC)
38 logger.debug(
39 f"Created {self.__class__.__name__}: {error_type} - {message}"
40 )
42 def is_permanent(self) -> bool:
43 """Check if this is a permanent failure (never retry)"""
44 return self.retry_after is None
46 def can_retry_now(self) -> bool:
47 """Check if this resource can be retried right now"""
48 if self.retry_after is None:
49 return False
50 return datetime.now(UTC) >= self.created_at + self.retry_after
52 def get_cooldown_remaining(self) -> Optional[timedelta]:
53 """Get remaining cooldown time, or None if no cooldown"""
54 if self.retry_after is None:
55 return None
57 retry_time = self.created_at + self.retry_after
58 if datetime.now(UTC) < retry_time:
59 return retry_time - datetime.now(UTC)
60 return None
62 def to_dict(self) -> dict:
63 """Convert to dictionary for database storage"""
64 return {
65 "error_type": self.error_type,
66 "message": self.message,
67 "retry_after_timestamp": self.created_at + self.retry_after
68 if self.retry_after
69 else None,
70 "is_permanent": self.is_permanent(),
71 "created_at": self.created_at,
72 }
class PermanentFailure(BaseFailure):
    """A failure with no retry window: the resource is never retried."""

    def __init__(self, error_type: str, message: str):
        # retry_after=None is what marks the failure as permanent.
        super().__init__(
            error_type=error_type,
            message=message,
            retry_after=None,
        )
class TemporaryFailure(BaseFailure):
    """A failure that becomes retryable once its cooldown has elapsed."""

    def __init__(self, error_type: str, message: str, cooldown: timedelta):
        # The cooldown is stored as the base class's retry_after interval.
        super().__init__(
            error_type=error_type,
            message=message,
            retry_after=cooldown,
        )
class RateLimitFailure(TemporaryFailure):
    """Domain-specific rate limit handling with longer cooldowns.

    The cooldown is chosen from a per-domain table. Matching is
    case-insensitive, ignores any ``user@`` prefix, port and trailing dot,
    and also applies to subdomains (``export.arxiv.org`` uses the
    ``arxiv.org`` entry). Unknown domains get the default cooldown.
    """

    def __init__(self, domain: str, details: str = ""):
        """
        Args:
            domain: Host that issued the rate limit (stored unchanged on
                ``self.domain`` and used verbatim in the message)
            details: Optional extra text appended to the message
        """
        # Domain-specific cooldown periods
        domain_cooldowns = {
            # General arXiv rate limiting (reCAPTCHA handled separately)
            "arxiv.org": timedelta(hours=6),
            # PubMed rate limits
            "pubmed.ncbi.nlm.nih.gov": timedelta(hours=2),
            # BioRxiv rate limits
            "biorxiv.org": timedelta(hours=6),
            # Semantic Scholar rate limits
            "semanticscholar.org": timedelta(hours=4),
            # ResearchGate rate limits
            "researchgate.net": timedelta(hours=12),
            # Default cooldown for unknown domains
            "default": timedelta(hours=1),
        }

        # Normalize only for the lookup; callers often pass a URL netloc,
        # which may carry credentials, a port, or mixed case.
        host = (domain or "").strip().lower().rpartition("@")[2]
        if not host.startswith("["):  # leave bracketed IPv6 literals intact
            host = host.partition(":")[0]
        host = host.rstrip(".")

        cooldown = domain_cooldowns["default"]
        for known_domain, period in domain_cooldowns.items():
            if host == known_domain or host.endswith("." + known_domain):
                cooldown = period
                break

        message = f"Rate limited by {domain}"
        if details:
            message += f" - {details}"

        super().__init__("rate_limited", message, cooldown)
        self.domain = domain
class FailureClassifier:
    """Classifies download failures into appropriate types based on error patterns"""

    @staticmethod
    def _extract_domain(url: str) -> str:
        """Return the lowercase hostname of ``url``, or "unknown".

        Uses ``hostname`` rather than ``netloc`` so that ports and
        ``user:password@`` credentials never reach the failure message or
        the per-domain cooldown lookup. Malformed URLs yield "unknown"
        instead of raising from inside the classifier.
        """
        if not url:
            return "unknown"
        try:
            host = urlparse(url).hostname
            if not host:
                # Scheme-less input such as "arxiv.org/abs/1234" parses
                # entirely as a path; retry as a network-path reference.
                host = urlparse("//" + url.lstrip("/")).hostname
        except ValueError:
            return "unknown"
        return host or "unknown"

    def classify_failure(
        self,
        error_type: str,
        status_code: Optional[int] = None,
        url: str = "",
        details: str = "",
    ) -> BaseFailure:
        """
        Classify a download failure based on error information.

        HTTP status codes are checked first, then text patterns in
        ``error_type`` / ``details``; anything unmatched becomes a
        temporary "unknown_error" with a 1-hour cooldown.

        Args:
            error_type: Error type identifier
            status_code: HTTP status code if available
            url: URL that failed
            details: Additional error details

        Returns:
            Appropriate failure classification
        """
        # Tolerate None so classifying a failure never fails itself.
        error_type = error_type or ""
        details = details or ""

        # HTTP Status Code classifications
        if status_code:
            if status_code == 404:
                return PermanentFailure("not_found", "Resource not found (404)")
            if status_code == 403:
                return PermanentFailure("forbidden", "Access forbidden (403)")
            if status_code == 410:
                return PermanentFailure(
                    "gone", "Resource permanently removed (410)"
                )
            if status_code == 429:
                return RateLimitFailure(self._extract_domain(url), details)
            if status_code == 503:
                return TemporaryFailure(
                    "server_error",
                    "Service temporarily unavailable (503)",
                    timedelta(hours=1),
                )

        # Error message pattern classifications
        error_lower = error_type.lower()
        details_lower = details.lower()

        # arXiv specific patterns
        if "arxiv" in error_lower or "arxiv" in details_lower:
            if "recaptcha" in details_lower or "captcha" in details_lower:
                return TemporaryFailure(
                    "recaptcha_protection",
                    "Anti-bot protection active, retry after 3 days",
                    timedelta(days=3),
                )
            if "not a pdf file" in details_lower:
                return PermanentFailure(
                    "incompatible_format", "Content is not a PDF file"
                )
            if (
                "html" in details_lower
                and "application/pdf" not in details_lower
            ):
                return PermanentFailure(
                    "incompatible_format",
                    "Content returned HTML instead of PDF",
                )

        # Login/subscription/paywall errors — these are permanent
        if any(
            pattern in details_lower
            for pattern in (
                "requires login",
                "subscription",
                "paywall",
                "requires authentication",
            )
        ):
            return PermanentFailure(
                "paywall_or_login",
                "Requires login or subscription",
            )

        # HTML content instead of PDF (non-arXiv) — permanent
        if "text/html" in details_lower and (
            "unexpected content type" in details_lower
            or "not a pdf" in details_lower
        ):
            return PermanentFailure(
                "incompatible_format",
                "Content returned HTML instead of PDF",
            )

        # Access denied patterns in error messages (without status code)
        if (
            "access denied" in details_lower
            or "403 forbidden" in details_lower
            or "http 403" in details_lower
        ):
            return PermanentFailure(
                "forbidden", "Access denied based on error message"
            )

        # Common timeout and network errors
        if "timeout" in error_lower or "timed out" in details_lower:
            return TemporaryFailure(
                "timeout", "Request timed out", timedelta(minutes=30)
            )
        if "connection" in error_lower or "network" in error_lower:
            return TemporaryFailure(
                "network_error",
                "Network connectivity issue",
                timedelta(minutes=5),
            )

        # Default to temporary failure with 1-hour cooldown
        logger.warning(
            f"[FAILURE_CLASSIFIER] Unclassified error: {error_type} - {details}"
        )
        return TemporaryFailure(
            "unknown_error", f"Unknown error: {error_type}", timedelta(hours=1)
        )

    def classify_from_exception(
        self, exception: Exception, url: str = ""
    ) -> BaseFailure:
        """Classify failure from exception object.

        The exception's class name is used as the error type and its
        string form as the details; no status code is supplied, so only
        the text-pattern rules apply.
        """
        error_type = type(exception).__name__
        details = str(exception)
        return self.classify_failure(error_type, details=details, url=url)