Coverage for src / local_deep_research / content_fetcher / fetcher.py: 92%
129 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
1"""
2Unified Content Fetcher.
4Provides a single interface to fetch content from various sources:
5- Academic papers (arXiv, PubMed, Semantic Scholar)
6- Web pages (HTML)
7- Direct PDF links
8"""
10from typing import Any, Dict, List, Optional
11from loguru import logger
13from .url_classifier import URLClassifier, URLType
14from ..research_library.downloaders.base import ContentType
15from ..security.ssrf_validator import validate_url
16from ..utilities.resource_utils import safe_close
# Default maximum content length returned by fetch() (500KB of text).
DEFAULT_MAX_CONTENT_LENGTH = 500_000

# URL types where HTML fallback is pointless when the specialized downloader
# fails: HTML and DOI already use the generic HTML downloader, INVALID URLs
# are rejected before download, and a direct PDF has no landing page to scrape.
_NO_HTML_FALLBACK = {URLType.HTML, URLType.DOI, URLType.INVALID, URLType.PDF}
class ContentFetcher:
    """
    Unified content fetcher that routes to appropriate downloaders.

    Automatically detects URL type and uses the best downloader for
    academic papers (arXiv, PubMed/PMC, Semantic Scholar, bioRxiv/medRxiv),
    direct PDF links, and generic web pages.
    """

    def __init__(self, timeout: int = 30, language: str = "English"):
        """
        Initialize the content fetcher.

        Args:
            timeout: Request timeout in seconds
            language: Language for justext stoplist (passed to HTML downloader)
        """
        self.timeout = timeout
        self.language = language
        # Downloaders are created lazily and cached by URL type so that
        # expensive resources (e.g. a Playwright browser for HTML) are
        # shared across calls and released together in close().
        self._downloaders: Dict[URLType, Any] = {}

    def _make_html_downloader(self) -> Any:
        """Create the generic HTML downloader, or None if unavailable.

        Shared by the HTML and DOI branches of _get_downloader — DOI URLs
        typically redirect to publisher pages, so plain HTML extraction is
        the best available option for them.
        """
        try:
            from ..research_library.downloaders.playwright_html import (
                AutoHTMLDownloader as HTMLDownloader,
            )

            return HTMLDownloader(
                timeout=self.timeout, language=self.language
            )
        except ImportError:
            logger.warning("HTMLDownloader not available")
            return None

    def _get_downloader(self, url_type: URLType):
        """Get or create the appropriate downloader for a URL type.

        Imports are deferred so a missing optional dependency only disables
        that one downloader (logged as a warning) instead of breaking the
        whole module. Returns None when no downloader can be constructed.
        """
        if url_type in self._downloaders:
            return self._downloaders[url_type]

        downloader: Any = None

        if url_type == URLType.ARXIV:
            try:
                from ..research_library.downloaders.arxiv import (
                    ArxivDownloader,
                )

                downloader = ArxivDownloader(timeout=self.timeout)
            except ImportError:
                logger.warning("ArxivDownloader not available")

        elif url_type in (URLType.PUBMED, URLType.PMC):
            try:
                from ..research_library.downloaders.pubmed import (
                    PubMedDownloader,
                )

                downloader = PubMedDownloader(timeout=self.timeout)
            except ImportError:
                logger.warning("PubMedDownloader not available")

        elif url_type == URLType.SEMANTIC_SCHOLAR:
            try:
                from ..research_library.downloaders.semantic_scholar import (
                    SemanticScholarDownloader,
                )

                downloader = SemanticScholarDownloader(timeout=self.timeout)
            except ImportError:
                logger.warning("SemanticScholarDownloader not available")

        elif url_type in (URLType.BIORXIV, URLType.MEDRXIV):
            try:
                from ..research_library.downloaders.biorxiv import (
                    BioRxivDownloader,
                )

                downloader = BioRxivDownloader(timeout=self.timeout)
            except ImportError:
                logger.warning("BioRxivDownloader not available")

        elif url_type == URLType.PDF:
            try:
                from ..research_library.downloaders.direct_pdf import (
                    DirectPDFDownloader,
                )

                downloader = DirectPDFDownloader(timeout=self.timeout)
            except ImportError:
                logger.warning("DirectPDFDownloader not available")

        elif url_type in (URLType.HTML, URLType.DOI):
            # DOI URLs redirect to publisher pages, so the generic HTML
            # downloader serves both types (cached separately per type).
            downloader = self._make_html_downloader()

        # Cache the downloader so close() can release it later.
        if downloader is not None:
            self._downloaders[url_type] = downloader

        return downloader

    @staticmethod
    def _decode_content(raw: bytes) -> tuple:
        """Decode downloaded bytes into text.

        Checks the PDF magic bytes first, then tries UTF-8, and rejects
        anything that is neither.

        Returns:
            (text, None) on success, or (None, error_message) on failure.
        """
        if raw[:4] == b"%PDF":
            from ..research_library.downloaders.base import (
                BaseDownloader,
            )

            text = BaseDownloader.extract_text_from_pdf(raw)
            if not text:
                return None, "Could not extract text from PDF"
            return text, None
        try:
            return raw.decode("utf-8"), None
        except UnicodeDecodeError:
            return None, "Content is not valid UTF-8 and not a PDF"

    def fetch(
        self,
        url: str,
        max_length: Optional[int] = None,
        prefer_text: bool = True,
    ) -> Dict[str, Any]:
        """
        Fetch content from a URL.

        Automatically detects the URL type and uses the appropriate downloader.

        Args:
            url: The URL to fetch content from
            max_length: Maximum content length to return (chars). Defaults to 500KB.
            prefer_text: If True, prefer text extraction over PDF download

        Returns:
            Dict with:
                - status: "success" or "error"
                - content: Extracted text content
                - url: Original URL
                - source_type: Type of source (arxiv, pubmed, html, etc.)
                - title: Title if available
                - error: Error message if failed
        """
        # Apply default max_length if not specified
        if max_length is None:
            max_length = DEFAULT_MAX_CONTENT_LENGTH

        # Classify the URL
        url_type = URLClassifier.classify(url)
        source_name = URLClassifier.get_source_name(url_type)

        def error(message: str) -> Dict[str, Any]:
            # Every error response shares this shape; build it in one place.
            return {
                "status": "error",
                "url": url,
                "source_type": source_name,
                "error": message,
            }

        # Reject invalid/dangerous URLs
        if url_type == URLType.INVALID:
            return error(
                "Invalid or unsupported URL scheme (only http/https allowed)"
            )

        # SSRF validation: reject private/internal IPs before reaching downloaders
        if not validate_url(url):
            logger.warning(f"URL failed SSRF validation: {url}")
            return error(
                "URL failed security validation (blocked by SSRF protection)"
            )

        logger.info(f"Fetching content from {url} (detected: {source_name})")

        # Get the appropriate downloader
        downloader = self._get_downloader(url_type)

        if not downloader:
            # Fall back to generic HTML downloader. This triggers when a
            # specialized downloader (ArXiv, SemanticScholar, etc.) failed
            # to import — playwright_html may still be available.
            # Use _get_downloader so the instance is cached and cleaned up
            # by close().
            downloader = self._get_downloader(URLType.HTML)
            if not downloader:
                return error("No suitable downloader available")

        # Determine content type
        content_type = ContentType.TEXT if prefer_text else ContentType.PDF

        # Download content
        try:
            result = downloader.download_with_result(url, content_type)

            # HTML fallback: when a specialized downloader fails (e.g.
            # arXiv PDF unavailable, PubMed paywalled), try generic HTML
            # extraction — the abstract/landing page often has useful content.
            if not result.is_success and url_type not in _NO_HTML_FALLBACK:
                logger.debug(
                    f"Specialized downloader failed for {url}, "
                    "trying HTML fallback"
                )
                html_downloader = self._get_downloader(URLType.HTML)
                if html_downloader:
                    result = html_downloader.download_with_result(
                        url, content_type
                    )
                    # Use the HTML downloader for metadata too, so we
                    # don't call the failed specialized downloader's
                    # get_metadata (which would re-fetch or return wrong data).
                    downloader = html_downloader

            if not (result.is_success and result.content):
                return error(result.skip_reason or "Download failed")

            content, decode_error = self._decode_content(result.content)
            if decode_error:
                return error(decode_error)

            # Truncate if needed
            if max_length and len(content) > max_length:
                content = (
                    content[:max_length] + "\n\n[... content truncated ...]"
                )

            # Try to get metadata (best-effort; failure only logs at debug)
            metadata: Dict[str, Any] = {}
            if hasattr(downloader, "get_metadata"):
                try:
                    # `or {}` guards against get_metadata returning None,
                    # which would otherwise crash the success path below
                    # and misreport a successful download as an error.
                    metadata = downloader.get_metadata(url) or {}
                except Exception:
                    logger.debug(
                        "Failed to fetch metadata for {}",
                        url,
                        exc_info=True,
                    )

            return {
                "status": "success",
                "content": content,
                "url": url,
                "source_type": source_name,
                "title": metadata.get("title"),
                "author": metadata.get("author"),
                "published_date": metadata.get("published_date"),
            }

        except Exception as e:
            logger.exception(f"Error fetching content from {url}")
            return error(str(e))

    def fetch_text(
        self, url: str, max_length: Optional[int] = None
    ) -> Optional[str]:
        """
        Convenience method to fetch just the text content.

        Args:
            url: The URL to fetch
            max_length: Maximum content length

        Returns:
            Text content or None if failed
        """
        result = self.fetch(url, max_length=max_length, prefer_text=True)
        if result.get("status") == "success":
            return result.get("content")
        return None

    def fetch_batch(self, urls: List[str]) -> Dict[str, Optional[str]]:
        """Fetch multiple URLs, routing each to the best downloader.

        Specialized downloaders (arXiv, PubMed, etc.) are tried first;
        generic HTML extraction is used as fallback. Downloaders are
        cached by URL type, so a single Playwright browser is shared
        across all HTML URLs.

        Returns:
            Dict mapping URL → extracted text (or None if failed).
        """
        return {url: self.fetch_text(url) for url in urls}

    def get_url_info(self, url: str) -> Dict[str, Any]:
        """
        Get information about a URL without downloading.

        Args:
            url: The URL to analyze

        Returns:
            Dict with url_type, source_name, and extracted_id
        """
        url_type = URLClassifier.classify(url)
        return {
            "url": url,
            "url_type": url_type.value,
            "source_name": URLClassifier.get_source_name(url_type),
            "extracted_id": URLClassifier.extract_id(url, url_type),
        }

    def close(self):
        """Close all cached downloaders and their HTTP sessions."""
        for url_type, downloader in self._downloaders.items():
            safe_close(downloader, f"downloader-{url_type.value}")
        self._downloaders.clear()

    def __enter__(self):
        # Context-manager support: `with ContentFetcher() as f: ...`
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Always release downloaders; never suppress exceptions.
        self.close()
        return False