Coverage for src/local_deep_research/library/download_management/failure_classifier.py: 94%
68 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
1"""
2Failure Classification System with Inheritance
4Provides base classes and specific failure types for download attempts.
5Uses inheritance to organize different failure categories and their retry behavior.
6"""
8from abc import ABC
9from datetime import datetime, timedelta, UTC
10from typing import Optional
11from urllib.parse import urlparse
13from loguru import logger
16class BaseFailure(ABC):
17 """Base class for all failure types with common functionality"""
19 def __init__(
20 self,
21 error_type: str,
22 message: str,
23 retry_after: Optional[timedelta] = None,
24 ):
25 """
26 Initialize a failure classification.
28 Args:
29 error_type: Machine-readable error identifier
30 message: Human-readable error description
31 retry_after: When this failure can be retried (None = permanent)
32 """
33 self.error_type = error_type
34 self.message = message
35 self.retry_after = retry_after
36 self.created_at = datetime.now(UTC)
38 logger.debug(
39 f"Created {self.__class__.__name__}: {error_type} - {message}"
40 )
42 def is_permanent(self) -> bool:
43 """Check if this is a permanent failure (never retry)"""
44 return self.retry_after is None
class PermanentFailure(BaseFailure):
    """Failure for resources that must never be retried."""

    def __init__(self, error_type: str, message: str):
        """
        Create a failure with no retry delay.

        Args:
            error_type: Machine-readable error identifier
            message: Human-readable error description
        """
        # retry_after=None is what marks the failure as permanent.
        super().__init__(
            error_type=error_type, message=message, retry_after=None
        )
class TemporaryFailure(BaseFailure):
    """Failure for resources that may be retried once a cooldown elapses."""

    def __init__(self, error_type: str, message: str, cooldown: timedelta):
        """
        Create a retryable failure.

        Args:
            error_type: Machine-readable error identifier
            message: Human-readable error description
            cooldown: Delay stored as the failure's ``retry_after``
        """
        super().__init__(
            error_type=error_type, message=message, retry_after=cooldown
        )
class RateLimitFailure(TemporaryFailure):
    """Domain-specific rate limit handling with longer cooldowns.

    The cooldown is chosen from a table of known hosts; any host that is
    not listed gets the one-hour default.
    """

    # Cooldowns keyed by base domain. Subdomains (``www.``, ``export.``,
    # ``api.`` ...) resolve to the same entry, see ``_cooldown_for``.
    _DOMAIN_COOLDOWNS = {
        # General arXiv rate limiting (reCAPTCHA handled separately)
        "arxiv.org": timedelta(hours=6),
        "pubmed.ncbi.nlm.nih.gov": timedelta(hours=2),  # PubMed rate limits
        "biorxiv.org": timedelta(hours=6),  # BioRxiv rate limits
        # Semantic Scholar rate limits
        "semanticscholar.org": timedelta(hours=4),
        "researchgate.net": timedelta(hours=12),  # ResearchGate rate limits
    }

    # Default cooldown for unknown domains
    _DEFAULT_COOLDOWN = timedelta(hours=1)

    def __init__(self, domain: str, details: str = ""):
        """
        Create a rate-limit failure for ``domain``.

        Args:
            domain: Host that rate limited the request. A bare host name or
                a URL network location (optionally with userinfo and port)
                is accepted.
            details: Optional extra text appended to the failure message
        """
        cooldown = self._cooldown_for(domain)

        message = f"Rate limited by {domain}"
        if details:
            message += f" - {details}"

        super().__init__("rate_limited", message, cooldown)
        # Keep the value exactly as the caller supplied it.
        self.domain = domain

    @classmethod
    def _cooldown_for(cls, domain: str) -> timedelta:
        """Return the cooldown for ``domain``, matching subdomains too.

        The host is normalized first (userinfo, port, trailing dot and
        letter case are ignored) so that values such as
        ``www.biorxiv.org`` or ``arxiv.org:443`` hit the intended table
        entry instead of silently falling back to the default.
        """
        host = (domain or "").strip().lower()
        host = host.rsplit("@", 1)[-1]  # drop "user:pass@"
        host = host.split(":", 1)[0]  # drop ":port"
        host = host.rstrip(".")  # drop the trailing dot of a FQDN

        for known_domain, cooldown in cls._DOMAIN_COOLDOWNS.items():
            if host == known_domain or host.endswith("." + known_domain):
                return cooldown
        return cls._DEFAULT_COOLDOWN
class FailureClassifier:
    """Classifies download failures into appropriate types based on error patterns"""

    # Substrings of the error details that indicate login/subscription gating.
    _PAYWALL_PATTERNS = (
        "requires login",
        "subscription",
        "paywall",
        "requires authentication",
    )

    def classify_failure(
        self,
        error_type: str,
        status_code: Optional[int] = None,
        url: str = "",
        details: str = "",
    ) -> BaseFailure:
        """
        Classify a download failure based on error information.

        HTTP status codes are checked first; if none of the handled codes
        applies, the error type and details are matched (case-insensitively)
        against known text patterns. Anything unrecognized is logged and
        treated as a temporary failure with a one-hour cooldown.

        Args:
            error_type: Error type identifier
            status_code: HTTP status code if available
            url: URL that failed
            details: Additional error details

        Returns:
            Appropriate failure classification
        """
        # HTTP Status Code classifications
        if status_code:
            if status_code == 404:
                return PermanentFailure("not_found", "Resource not found (404)")
            if status_code == 403:
                return PermanentFailure("forbidden", "Access forbidden (403)")
            if status_code == 410:
                return PermanentFailure(
                    "gone", "Resource permanently removed (410)"
                )
            if status_code == 429:
                # Use the bare host (no port/userinfo, lowercase) so the
                # domain-specific cooldown can be looked up reliably.
                domain = self._extract_domain(url)
                return RateLimitFailure(domain, details)
            if status_code == 503:
                return TemporaryFailure(
                    "server_error",
                    "Service temporarily unavailable (503)",
                    timedelta(hours=1),
                )

        # Error message pattern classifications.
        # ``or ""`` keeps a None argument from raising AttributeError here.
        error_lower = (error_type or "").lower()
        details_lower = (details or "").lower()

        # arXiv specific patterns
        if "arxiv" in error_lower or "arxiv" in details_lower:
            if "recaptcha" in details_lower or "captcha" in details_lower:
                return TemporaryFailure(
                    "recaptcha_protection",
                    "Anti-bot protection active, retry after 3 days",
                    timedelta(days=3),
                )
            if "not a pdf file" in details_lower:
                return PermanentFailure(
                    "incompatible_format", "Content is not a PDF file"
                )
            if (
                "html" in details_lower
                and "application/pdf" not in details_lower
            ):
                return PermanentFailure(
                    "incompatible_format",
                    "Content returned HTML instead of PDF",
                )

        # Login/subscription/paywall errors — these are permanent
        if any(pattern in details_lower for pattern in self._PAYWALL_PATTERNS):
            return PermanentFailure(
                "paywall_or_login",
                "Requires login or subscription",
            )

        # HTML content instead of PDF (non-arXiv) — permanent
        if "text/html" in details_lower and (
            "unexpected content type" in details_lower
            or "not a pdf" in details_lower
        ):
            return PermanentFailure(
                "incompatible_format",
                "Content returned HTML instead of PDF",
            )

        # Access denied patterns in error messages (without status code)
        if (
            "access denied" in details_lower
            or "403 forbidden" in details_lower
            or "http 403" in details_lower
        ):
            return PermanentFailure(
                "forbidden", "Access denied based on error message"
            )

        # Common timeout and network errors
        if "timeout" in error_lower or "timed out" in details_lower:
            return TemporaryFailure(
                "timeout", "Request timed out", timedelta(minutes=30)
            )
        if "connection" in error_lower or "network" in error_lower:
            return TemporaryFailure(
                "network_error",
                "Network connectivity issue",
                timedelta(minutes=5),
            )

        # Default to temporary failure with 1-hour cooldown
        logger.warning(
            f"[FAILURE_CLASSIFIER] Unclassified error: {error_type} - {details}"
        )
        return TemporaryFailure(
            "unknown_error", f"Unknown error: {error_type}", timedelta(hours=1)
        )

    def classify_from_exception(
        self, exception: Exception, url: str = ""
    ) -> BaseFailure:
        """
        Classify failure from exception object.

        The exception's class name is used as the error type and its string
        form as the details; no HTTP status code is passed on.

        Args:
            exception: Exception raised by the download attempt
            url: URL that failed

        Returns:
            Appropriate failure classification
        """
        error_type = type(exception).__name__
        details = str(exception)
        return self.classify_failure(error_type, details=details, url=url)

    @staticmethod
    def _extract_domain(url: str) -> str:
        """Return the lowercase host name of ``url``, or "unknown".

        ``urlparse(...).hostname`` is used instead of ``netloc`` so that
        ports and credentials are not mistaken for part of the domain.
        """
        if not url:
            return "unknown"
        try:
            host = urlparse(url).hostname
            if not host and "://" not in url:
                # Scheme-less input such as "arxiv.org/abs/1" is parsed as a
                # path; re-parse it as a network location.
                host = urlparse(f"//{url}").hostname
        except ValueError:
            # urlparse rejects some malformed inputs (e.g. bad IPv6
            # brackets); classification must not crash because of that.
            return "unknown"
        return host or "unknown"