Coverage for src/local_deep_research/content_fetcher/fetcher.py: 97%
130 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
1"""
2Unified Content Fetcher.
4Provides a single interface to fetch content from various sources:
5- Academic papers (arXiv, PubMed, Semantic Scholar)
6- Web pages (HTML)
7- Direct PDF links
8"""
10from typing import Any, Dict, List, Optional
11from loguru import logger
13from .url_classifier import URLClassifier, URLType
14from ..research_library.downloaders.base import ContentType
15from ..security.ssrf_validator import validate_url
16from ..utilities.resource_utils import safe_close
# Cap applied to returned text when the caller gives no explicit limit
# (500,000 characters).
DEFAULT_MAX_CONTENT_LENGTH = 500_000

# URL types for which a failed download is NOT retried through the generic
# HTML downloader, because that retry would be pointless.
_NO_HTML_FALLBACK = {
    URLType.HTML,
    URLType.DOI,
    URLType.INVALID,
    URLType.PDF,
}
class ContentFetcher:
    """
    Unified content fetcher that routes to appropriate downloaders.

    Automatically detects URL type and uses the best downloader.

    Downloader instances are created lazily and cached per URL type. Call
    ``close()`` (or use the fetcher as a context manager) to release them.
    """

    def __init__(
        self,
        timeout: int = 30,
        language: str = "English",
        enable_js_rendering: bool = False,
    ):
        """
        Initialize the content fetcher.

        Args:
            timeout: Request timeout in seconds
            language: Language for justext stoplist (passed to HTML downloader)
            enable_js_rendering: When True, the HTML/DOI downloader falls back
                to a headless browser (Crawl4AI/Playwright) for pages that need
                JavaScript to render. Defaults to False because the default
                Docker production image ships without Chromium and the fallback
                otherwise wastes work on every fetch. In limited (mostly
                accidental) internal benchmark comparisons between dev
                instances that happened to have Chromium installed and routine
                Docker runs that did not, JS rendering did not measurably
                improve research quality, and most regular benchmark runs are
                on Docker without Chromium anyway — so disabling by default
                does not regress observed quality. The user-facing toggle is
                the ``web.enable_javascript_rendering`` setting.
        """
        self.timeout = timeout
        self.language = language
        self.enable_js_rendering = enable_js_rendering
        # Lazily populated cache: URL type -> downloader instance.
        self._downloaders: Dict[URLType, Any] = {}

    @staticmethod
    def _error_result(url: str, source_name: str, message: str) -> Dict[str, Any]:
        """Build the error-shaped dict returned by ``fetch``."""
        return {
            "status": "error",
            "url": url,
            "source_type": source_name,
            "error": message,
        }

    def _create_downloader(self, url_type: "URLType") -> Optional[Any]:
        """Instantiate a fresh downloader for ``url_type``.

        Downloader modules are imported lazily. If the import (or
        construction) raises ``ImportError``, a warning is logged and None
        is returned. URL types without a dedicated branch also yield None.
        """
        # Name used in the warning if the import below fails.
        label = ""
        try:
            if url_type == URLType.ARXIV:
                label = "ArxivDownloader"
                from ..research_library.downloaders.arxiv import ArxivDownloader

                return ArxivDownloader(timeout=self.timeout)

            if url_type in (URLType.PUBMED, URLType.PMC):
                label = "PubMedDownloader"
                from ..research_library.downloaders.pubmed import (
                    PubMedDownloader,
                )

                return PubMedDownloader(timeout=self.timeout)

            if url_type == URLType.SEMANTIC_SCHOLAR:
                label = "SemanticScholarDownloader"
                from ..research_library.downloaders.semantic_scholar import (
                    SemanticScholarDownloader,
                )

                return SemanticScholarDownloader(timeout=self.timeout)

            if url_type in (URLType.BIORXIV, URLType.MEDRXIV):
                label = "BioRxivDownloader"
                from ..research_library.downloaders.biorxiv import (
                    BioRxivDownloader,
                )

                return BioRxivDownloader(timeout=self.timeout)

            if url_type == URLType.PDF:
                label = "DirectPDFDownloader"
                from ..research_library.downloaders.direct_pdf import (
                    DirectPDFDownloader,
                )

                return DirectPDFDownloader(timeout=self.timeout)

            if url_type in (URLType.HTML, URLType.DOI):
                # DOI URLs typically redirect to publisher pages, so they
                # are handled by the HTML downloader as well.
                label = "HTMLDownloader"
                from ..research_library.downloaders.playwright_html import (
                    AutoHTMLDownloader as HTMLDownloader,
                )

                return HTMLDownloader(
                    timeout=self.timeout,
                    language=self.language,
                    enable_js_rendering=self.enable_js_rendering,
                )
        except ImportError:
            logger.warning(f"{label} not available")

        return None

    def _get_downloader(self, url_type: "URLType") -> Optional[Any]:
        """Get or create the appropriate downloader for a URL type.

        Returns the cached instance when one exists. Otherwise a new
        downloader is created and cached. Returns None when no downloader
        could be created; nothing is cached in that case, so a later call
        tries again.
        """
        if url_type in self._downloaders:
            return self._downloaders[url_type]

        downloader = self._create_downloader(url_type)

        # Cache the downloader
        if downloader:
            self._downloaders[url_type] = downloader

        return downloader

    def fetch(
        self,
        url: str,
        max_length: Optional[int] = None,
        prefer_text: bool = True,
    ) -> Dict[str, Any]:
        """
        Fetch content from a URL.

        Automatically detects the URL type and uses the appropriate downloader.
        Failures are reported through the returned dict; exceptions raised
        while downloading or decoding are caught and logged.

        Args:
            url: The URL to fetch content from
            max_length: Maximum content length to return (chars). Defaults to
                ``DEFAULT_MAX_CONTENT_LENGTH``. A value of 0 disables
                truncation.
            prefer_text: If True, prefer text extraction over PDF download

        Returns:
            Dict with:
            - status: "success" or "error"
            - url: Original URL
            - source_type: Type of source (arxiv, pubmed, html, etc.)
            On success additionally:
            - content: Extracted text content
            - title / author / published_date: Metadata values, or None
              when unavailable
            On error additionally:
            - error: Error message
        """
        # Apply default max_length if not specified
        if max_length is None:
            max_length = DEFAULT_MAX_CONTENT_LENGTH

        # Classify the URL
        url_type = URLClassifier.classify(url)
        source_name = URLClassifier.get_source_name(url_type)

        # Reject invalid/dangerous URLs
        if url_type == URLType.INVALID:
            return self._error_result(
                url,
                source_name,
                "Invalid or unsupported URL scheme (only http/https allowed)",
            )

        # SSRF validation: reject private/internal IPs before reaching downloaders
        if not validate_url(url):
            logger.warning(f"URL failed SSRF validation: {url}")
            return self._error_result(
                url,
                source_name,
                "URL failed security validation (blocked by SSRF protection)",
            )

        logger.info(f"Fetching content from {url} (detected: {source_name})")

        # Get the appropriate downloader
        downloader = self._get_downloader(url_type)
        if not downloader:
            # Fall back to generic HTML downloader. This triggers when a
            # specialized downloader (ArXiv, SemanticScholar, etc.) failed
            # to import — playwright_html may still be available.
            # Use _get_downloader so the instance is cached and cleaned up
            # by close().
            downloader = self._get_downloader(URLType.HTML)
            if not downloader:
                return self._error_result(
                    url, source_name, "No suitable downloader available"
                )

        # Determine content type
        content_type = ContentType.TEXT if prefer_text else ContentType.PDF

        # Download content
        try:
            result = downloader.download_with_result(url, content_type)

            # HTML fallback: when a specialized downloader fails (e.g.
            # arXiv PDF unavailable, PubMed paywalled), try generic HTML
            # extraction — the abstract/landing page often has useful content.
            if not result.is_success and url_type not in _NO_HTML_FALLBACK:
                html_downloader = self._get_downloader(URLType.HTML)
                # Skip the retry when the HTML downloader is the one that
                # just failed (it was already substituted above because the
                # specialized downloader was unavailable).
                if html_downloader and html_downloader is not downloader:
                    logger.debug(
                        f"Specialized downloader failed for {url}, "
                        "trying HTML fallback"
                    )
                    result = html_downloader.download_with_result(
                        url, content_type
                    )
                    # Use the HTML downloader for metadata too, so we
                    # don't call the failed specialized downloader's
                    # get_metadata (which would re-fetch or return wrong data).
                    downloader = html_downloader

            if not (result.is_success and result.content):
                return self._error_result(
                    url, source_name, result.skip_reason or "Download failed"
                )

            # Decode content — check PDF magic bytes first, then try
            # UTF-8, and reject anything that is neither.
            raw = result.content
            if raw[:4] == b"%PDF":
                from ..research_library.downloaders.base import (
                    BaseDownloader,
                )

                content = BaseDownloader.extract_text_from_pdf(raw)
                if not content:
                    return self._error_result(
                        url, source_name, "Could not extract text from PDF"
                    )
            else:
                try:
                    content = raw.decode("utf-8")
                except UnicodeDecodeError:
                    return self._error_result(
                        url,
                        source_name,
                        "Content is not valid UTF-8 and not a PDF",
                    )

            # Truncate if needed
            if max_length and len(content) > max_length:
                content = (
                    content[:max_length] + "\n\n[... content truncated ...]"
                )

            # Try to get metadata (best effort — never fails the fetch)
            metadata: Dict[str, Any] = {}
            if hasattr(downloader, "get_metadata"):
                try:
                    # Normalise a None/empty return so the lookups below
                    # cannot raise and discard the downloaded content.
                    metadata = downloader.get_metadata(url) or {}
                except Exception:
                    # loguru attaches the traceback via opt(exception=True);
                    # an ``exc_info`` keyword is not interpreted by loguru.
                    logger.opt(exception=True).debug(
                        "Failed to fetch metadata for {}",
                        url,
                    )

            return {
                "status": "success",
                "content": content,
                "url": url,
                "source_type": source_name,
                "title": metadata.get("title"),
                "author": metadata.get("author"),
                "published_date": metadata.get("published_date"),
            }

        except Exception as e:
            logger.exception(f"Error fetching content from {url}")
            return self._error_result(url, source_name, str(e))

    def fetch_text(
        self, url: str, max_length: Optional[int] = None
    ) -> Optional[str]:
        """
        Convenience method to fetch just the text content.

        Args:
            url: The URL to fetch
            max_length: Maximum content length

        Returns:
            Text content or None if failed
        """
        result = self.fetch(url, max_length=max_length, prefer_text=True)
        if result.get("status") == "success":
            return result.get("content")
        return None

    def fetch_batch(self, urls: List[str]) -> Dict[str, Optional[str]]:
        """Fetch multiple URLs, routing each to the best downloader.

        Each URL goes through ``fetch_text``. Downloaders are cached by URL
        type, so URLs of the same type share one downloader instance.
        Duplicate URLs are fetched only once.

        Returns:
            Dict mapping URL → extracted text (or None if failed).
        """
        # dict.fromkeys de-duplicates while keeping first-seen order.
        return {url: self.fetch_text(url) for url in dict.fromkeys(urls)}

    def get_url_info(self, url: str) -> Dict[str, Any]:
        """
        Get information about a URL without downloading.

        Args:
            url: The URL to analyze

        Returns:
            Dict with url, url_type, source_name, and extracted_id
        """
        url_type = URLClassifier.classify(url)
        return {
            "url": url,
            "url_type": url_type.value,
            "source_name": URLClassifier.get_source_name(url_type),
            "extracted_id": URLClassifier.extract_id(url, url_type),
        }

    def close(self):
        """Close all cached downloaders and empty the cache."""
        try:
            for url_type, downloader in self._downloaders.items():
                safe_close(downloader, f"downloader-{url_type.value}")
        finally:
            # Drop the references even if a close call raised.
            self._downloaders.clear()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
        # Never suppress exceptions raised inside the ``with`` body.
        return False