Coverage for src / local_deep_research / library / download_management / failure_classifier.py: 60%
75 statements
« prev ^ index » next coverage.py v7.12.0, created at 2026-01-11 00:51 +0000
« prev ^ index » next coverage.py v7.12.0, created at 2026-01-11 00:51 +0000
1"""
2Failure Classification System with Inheritance
4Provides base classes and specific failure types for download attempts.
5Uses inheritance to organize different failure categories and their retry behavior.
6"""
8from abc import ABC
9from datetime import datetime, timedelta, UTC
10from typing import Optional
11from urllib.parse import urlparse
13from loguru import logger
16class BaseFailure(ABC):
17 """Base class for all failure types with common functionality"""
19 def __init__(
20 self,
21 error_type: str,
22 message: str,
23 retry_after: Optional[timedelta] = None,
24 ):
25 """
26 Initialize a failure classification.
28 Args:
29 error_type: Machine-readable error identifier
30 message: Human-readable error description
31 retry_after: When this failure can be retried (None = permanent)
32 """
33 self.error_type = error_type
34 self.message = message
35 self.retry_after = retry_after
36 self.created_at = datetime.now(UTC)
38 logger.debug(
39 f"Created {self.__class__.__name__}: {error_type} - {message}"
40 )
42 def is_permanent(self) -> bool:
43 """Check if this is a permanent failure (never retry)"""
44 return self.retry_after is None
46 def can_retry_now(self) -> bool:
47 """Check if this resource can be retried right now"""
48 if self.is_permanent():
49 return False
50 return datetime.now(UTC) >= self.created_at + self.retry_after
52 def get_cooldown_remaining(self) -> Optional[timedelta]:
53 """Get remaining cooldown time, or None if no cooldown"""
54 if self.is_permanent():
55 return None
57 retry_time = self.created_at + self.retry_after
58 if datetime.now(UTC) < retry_time:
59 return retry_time - datetime.now(UTC)
60 return None
62 def to_dict(self) -> dict:
63 """Convert to dictionary for database storage"""
64 return {
65 "error_type": self.error_type,
66 "message": self.message,
67 "retry_after_timestamp": self.created_at + self.retry_after
68 if self.retry_after
69 else None,
70 "is_permanent": self.is_permanent(),
71 "created_at": self.created_at,
72 }
class PermanentFailure(BaseFailure):
    """Failure for a resource that must never be retried.

    Always initializes the base class with no retry delay, which is how the
    base class marks a failure as permanent.
    """

    def __init__(self, error_type: str, message: str):
        # No cooldown is passed on purpose: None means "permanent".
        super().__init__(error_type, message, None)
class TemporaryFailure(BaseFailure):
    """Failure for a resource that may be retried once a cooldown has passed."""

    def __init__(self, error_type: str, message: str, cooldown: timedelta):
        # The cooldown is stored by the base class as its retry delay.
        retry_delay = cooldown
        super().__init__(error_type, message, retry_delay)
class RateLimitFailure(TemporaryFailure):
    """Domain-specific rate limit handling with longer cooldowns.

    The cooldown is chosen from ``DOMAIN_COOLDOWNS`` by matching the given
    domain (or any of its parent domains) against the known entries; anything
    else gets ``DEFAULT_COOLDOWN``.
    """

    # Domain-specific cooldown periods, keyed by lower-case host name.
    DOMAIN_COOLDOWNS = {
        # General arXiv rate limiting (reCAPTCHA handled separately)
        "arxiv.org": timedelta(hours=6),
        # PubMed rate limits
        "pubmed.ncbi.nlm.nih.gov": timedelta(hours=2),
        # BioRxiv rate limits
        "biorxiv.org": timedelta(hours=6),
        # Semantic Scholar rate limits
        "semanticscholar.org": timedelta(hours=4),
        # ResearchGate rate limits
        "researchgate.net": timedelta(hours=12),
    }

    # Default cooldown for unknown domains
    DEFAULT_COOLDOWN = timedelta(hours=1)

    def __init__(self, domain: str, details: str = ""):
        """
        Create a rate-limit failure for ``domain``.

        Args:
            domain: Host that rate limited the request; may be a full netloc
                (mixed case, with port or sub-domain such as "www.")
            details: Optional extra text appended to the message
        """
        cooldown = self._cooldown_for(domain)

        message = f"Rate limited by {domain}"
        if details:
            message += f" - {details}"

        super().__init__("rate_limited", message, cooldown)
        # Keep the caller's original value; only the lookup is normalized.
        self.domain = domain

    @classmethod
    def _cooldown_for(cls, domain: str) -> timedelta:
        """Return the cooldown for ``domain``, matching sub-domains too."""
        host = (domain or "").strip().lower()
        # Drop "user:password@" if a raw netloc was passed in.
        host = host.rpartition("@")[2]
        # Drop ":port"; bracketed IPv6 literals contain colons, leave them.
        if not host.startswith("["):
            host = host.partition(":")[0]
        host = host.rstrip(".")

        # An exact-only lookup would miss "www.arxiv.org" or
        # "export.arxiv.org", so also accept any sub-domain of a known entry.
        for known, cooldown in cls.DOMAIN_COOLDOWNS.items():
            if host == known or host.endswith("." + known):
                return cooldown
        return cls.DEFAULT_COOLDOWN
class FailureClassifier:
    """Classifies download failures into appropriate types based on error patterns"""

    def classify_failure(
        self,
        error_type: str,
        status_code: Optional[int] = None,
        url: str = "",
        details: str = "",
    ) -> BaseFailure:
        """
        Classify a download failure based on error information.

        The HTTP status code is checked first; if it is absent or not one of
        the recognised codes, substring patterns in ``error_type`` and
        ``details`` are used.  Anything still unmatched is logged and treated
        as a temporary failure with a 1-hour cooldown.

        Args:
            error_type: Error type identifier
            status_code: HTTP status code if available
            url: URL that failed
            details: Additional error details

        Returns:
            Appropriate failure classification
        """
        # Tolerate None so that classifying an error never raises on its own.
        details = details or ""
        error_lower = (error_type or "").lower()
        details_lower = details.lower()

        # HTTP Status Code classifications
        if status_code:
            if status_code == 404:
                return PermanentFailure("not_found", "Resource not found (404)")
            if status_code == 403:
                return PermanentFailure("forbidden", "Access forbidden (403)")
            if status_code == 410:
                return PermanentFailure(
                    "gone", "Resource permanently removed (410)"
                )
            if status_code == 429:
                return RateLimitFailure(self._domain_from_url(url), details)
            if status_code == 503:
                return TemporaryFailure(
                    "server_error",
                    "Service temporarily unavailable (503)",
                    timedelta(hours=1),
                )

        # Error message pattern classifications

        # arXiv specific patterns
        if "arxiv" in error_lower or "arxiv" in details_lower:
            # "captcha" also matches "recaptcha"
            if "captcha" in details_lower:
                return TemporaryFailure(
                    "recaptcha_protection",
                    "Anti-bot protection active, retry after 3 days",
                    timedelta(days=3),
                )
            if "not a pdf file" in details_lower:
                return PermanentFailure(
                    "incompatible_format", "Content is not a PDF file"
                )
            if (
                "html" in details_lower
                and "application/pdf" not in details_lower
            ):
                return PermanentFailure(
                    "incompatible_format",
                    "Content returned HTML instead of PDF",
                )

        # Common timeout and network errors
        if "timeout" in error_lower:
            return TemporaryFailure(
                "timeout", "Request timed out", timedelta(minutes=30)
            )
        if "connection" in error_lower or "network" in error_lower:
            return TemporaryFailure(
                "network_error",
                "Network connectivity issue",
                timedelta(minutes=5),
            )

        # Default to temporary failure with 1-hour cooldown
        logger.warning(
            f"[FAILURE_CLASSIFIER] Unclassified error: {error_type} - {details}"
        )
        return TemporaryFailure(
            "unknown_error", f"Unknown error: {error_type}", timedelta(hours=1)
        )

    def classify_from_exception(
        self, exception: Exception, url: str = ""
    ) -> BaseFailure:
        """Classify failure from exception object.

        The exception's class name is used as the error type and its string
        form as the details; no HTTP status code is passed along.
        """
        error_type = type(exception).__name__
        details = str(exception)
        return self.classify_failure(error_type, details=details, url=url)

    @staticmethod
    def _domain_from_url(url: str) -> str:
        """Return the network location of ``url``, or "unknown" if it has none."""
        if not url:
            return "unknown"
        try:
            netloc = urlparse(url).netloc
        except ValueError:
            # urlparse rejects some malformed URLs (e.g. a broken IPv6
            # literal); a classifier should not fail because of that.
            return "unknown"
        # A URL without a scheme ("arxiv.org/abs/1") parses with an empty
        # netloc; report it as unknown rather than as an empty domain.
        return netloc or "unknown"