Coverage for src/local_deep_research/content_fetcher/fetcher.py: 97%

130 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-03 23:15 +0000

1""" 

2Unified Content Fetcher. 

3 

4Provides a single interface to fetch content from various sources: 

5- Academic papers (arXiv, PubMed, Semantic Scholar) 

6- Web pages (HTML) 

7- Direct PDF links 

8""" 

9 

10from typing import Any, Dict, List, Optional 

11from loguru import logger 

12 

13from .url_classifier import URLClassifier, URLType 

14from ..research_library.downloaders.base import ContentType 

15from ..security.ssrf_validator import validate_url 

16from ..utilities.resource_utils import safe_close 

17 

18# Default maximum content length (500KB of text) 

19DEFAULT_MAX_CONTENT_LENGTH = 500_000 

20 

21# URL types where HTML fallback is pointless when the specialized downloader fails 

22_NO_HTML_FALLBACK = {URLType.HTML, URLType.DOI, URLType.INVALID, URLType.PDF} 

23 

24 

25class ContentFetcher: 

26 """ 

27 Unified content fetcher that routes to appropriate downloaders. 

28 

29 Automatically detects URL type and uses the best downloader. 

30 """ 

31 

32 def __init__( 

33 self, 

34 timeout: int = 30, 

35 language: str = "English", 

36 enable_js_rendering: bool = False, 

37 ): 

38 """ 

39 Initialize the content fetcher. 

40 

41 Args: 

42 timeout: Request timeout in seconds 

43 language: Language for justext stoplist (passed to HTML downloader) 

44 enable_js_rendering: When True, the HTML/DOI downloader falls back 

45 to a headless browser (Crawl4AI/Playwright) for pages that need 

46 JavaScript to render. Defaults to False because the default 

47 Docker production image ships without Chromium and the fallback 

48 otherwise wastes work on every fetch. In limited (mostly 

49 accidental) internal benchmark comparisons between dev 

50 instances that happened to have Chromium installed and routine 

51 Docker runs that did not, JS rendering did not measurably 

52 improve research quality, and most regular benchmark runs are 

53 on Docker without Chromium anyway — so disabling by default 

54 does not regress observed quality. The user-facing toggle is 

55 the ``web.enable_javascript_rendering`` setting. 

56 """ 

57 self.timeout = timeout 

58 self.language = language 

59 self.enable_js_rendering = enable_js_rendering 

60 self._downloaders: Dict[URLType, Any] = {} 

61 

62 def _get_downloader(self, url_type: URLType): 

63 """Get or create the appropriate downloader for a URL type.""" 

64 if url_type in self._downloaders: 

65 return self._downloaders[url_type] 

66 

67 downloader: Any = None 

68 

69 if url_type == URLType.ARXIV: 

70 try: 

71 from ..research_library.downloaders.arxiv import ArxivDownloader 

72 

73 downloader = ArxivDownloader(timeout=self.timeout) 

74 except ImportError: 

75 logger.warning("ArxivDownloader not available") 

76 

77 elif url_type in (URLType.PUBMED, URLType.PMC): 

78 try: 

79 from ..research_library.downloaders.pubmed import ( 

80 PubMedDownloader, 

81 ) 

82 

83 downloader = PubMedDownloader(timeout=self.timeout) 

84 except ImportError: 

85 logger.warning("PubMedDownloader not available") 

86 

87 elif url_type == URLType.SEMANTIC_SCHOLAR: 

88 try: 

89 from ..research_library.downloaders.semantic_scholar import ( 

90 SemanticScholarDownloader, 

91 ) 

92 

93 downloader = SemanticScholarDownloader(timeout=self.timeout) 

94 except ImportError: 

95 logger.warning("SemanticScholarDownloader not available") 

96 

97 elif url_type in (URLType.BIORXIV, URLType.MEDRXIV): 

98 try: 

99 from ..research_library.downloaders.biorxiv import ( 

100 BioRxivDownloader, 

101 ) 

102 

103 downloader = BioRxivDownloader(timeout=self.timeout) 

104 except ImportError: 

105 logger.warning("BioRxivDownloader not available") 

106 

107 elif url_type == URLType.PDF: 

108 try: 

109 from ..research_library.downloaders.direct_pdf import ( 

110 DirectPDFDownloader, 

111 ) 

112 

113 downloader = DirectPDFDownloader(timeout=self.timeout) 

114 except ImportError: 

115 logger.warning("DirectPDFDownloader not available") 

116 

117 elif url_type == URLType.HTML: 

118 try: 

119 from ..research_library.downloaders.playwright_html import ( 

120 AutoHTMLDownloader as HTMLDownloader, 

121 ) 

122 

123 downloader = HTMLDownloader( 

124 timeout=self.timeout, 

125 language=self.language, 

126 enable_js_rendering=self.enable_js_rendering, 

127 ) 

128 except ImportError: 

129 logger.warning("HTMLDownloader not available") 

130 

131 elif url_type == URLType.DOI: 

132 # DOI URLs typically redirect to publisher pages 

133 # Use HTML downloader as fallback 

134 try: 

135 from ..research_library.downloaders.playwright_html import ( 

136 AutoHTMLDownloader as HTMLDownloader, 

137 ) 

138 

139 downloader = HTMLDownloader( 

140 timeout=self.timeout, 

141 language=self.language, 

142 enable_js_rendering=self.enable_js_rendering, 

143 ) 

144 except ImportError: 

145 logger.warning("HTMLDownloader not available") 

146 

147 # Cache the downloader 

148 if downloader: 

149 self._downloaders[url_type] = downloader 

150 

151 return downloader 

152 

153 def fetch( 

154 self, 

155 url: str, 

156 max_length: Optional[int] = None, 

157 prefer_text: bool = True, 

158 ) -> Dict[str, Any]: 

159 """ 

160 Fetch content from a URL. 

161 

162 Automatically detects the URL type and uses the appropriate downloader. 

163 

164 Args: 

165 url: The URL to fetch content from 

166 max_length: Maximum content length to return (chars). Defaults to 500KB. 

167 prefer_text: If True, prefer text extraction over PDF download 

168 

169 Returns: 

170 Dict with: 

171 - status: "success" or "error" 

172 - content: Extracted text content 

173 - url: Original URL 

174 - source_type: Type of source (arxiv, pubmed, html, etc.) 

175 - title: Title if available 

176 - error: Error message if failed 

177 """ 

178 # Apply default max_length if not specified 

179 if max_length is None: 

180 max_length = DEFAULT_MAX_CONTENT_LENGTH 

181 

182 # Classify the URL 

183 url_type = URLClassifier.classify(url) 

184 source_name = URLClassifier.get_source_name(url_type) 

185 

186 # Reject invalid/dangerous URLs 

187 if url_type == URLType.INVALID: 

188 return { 

189 "status": "error", 

190 "url": url, 

191 "source_type": source_name, 

192 "error": "Invalid or unsupported URL scheme (only http/https allowed)", 

193 } 

194 

195 # SSRF validation: reject private/internal IPs before reaching downloaders 

196 if not validate_url(url): 

197 logger.warning(f"URL failed SSRF validation: {url}") 

198 return { 

199 "status": "error", 

200 "url": url, 

201 "source_type": source_name, 

202 "error": "URL failed security validation (blocked by SSRF protection)", 

203 } 

204 

205 logger.info(f"Fetching content from {url} (detected: {source_name})") 

206 

207 # Get the appropriate downloader 

208 downloader = self._get_downloader(url_type) 

209 

210 if not downloader: 

211 # Fall back to generic HTML downloader. This triggers when a 

212 # specialized downloader (ArXiv, SemanticScholar, etc.) failed 

213 # to import — playwright_html may still be available. 

214 # Use _get_downloader so the instance is cached and cleaned up 

215 # by close(). 

216 downloader = self._get_downloader(URLType.HTML) 

217 if not downloader: 

218 return { 

219 "status": "error", 

220 "url": url, 

221 "source_type": source_name, 

222 "error": "No suitable downloader available", 

223 } 

224 

225 # Determine content type 

226 content_type = ContentType.TEXT if prefer_text else ContentType.PDF 

227 

228 # Download content 

229 try: 

230 result = downloader.download_with_result(url, content_type) 

231 

232 # HTML fallback: when a specialized downloader fails (e.g. 

233 # arXiv PDF unavailable, PubMed paywalled), try generic HTML 

234 # extraction — the abstract/landing page often has useful content. 

235 if not result.is_success and url_type not in _NO_HTML_FALLBACK: 

236 logger.debug( 

237 f"Specialized downloader failed for {url}, " 

238 "trying HTML fallback" 

239 ) 

240 html_downloader = self._get_downloader(URLType.HTML) 

241 if html_downloader: 241 ↛ 250line 241 didn't jump to line 250 because the condition on line 241 was always true

242 result = html_downloader.download_with_result( 

243 url, content_type 

244 ) 

245 # Use the HTML downloader for metadata too, so we 

246 # don't call the failed specialized downloader's 

247 # get_metadata (which would re-fetch or return wrong data). 

248 downloader = html_downloader 

249 

250 if result.is_success and result.content: 

251 # Decode content — check PDF magic bytes first, then try 

252 # UTF-8, and reject anything that is neither. 

253 if result.content[:4] == b"%PDF": 

254 from ..research_library.downloaders.base import ( 

255 BaseDownloader, 

256 ) 

257 

258 content = BaseDownloader.extract_text_from_pdf( 

259 result.content 

260 ) 

261 if not content: 

262 return { 

263 "status": "error", 

264 "url": url, 

265 "source_type": source_name, 

266 "error": "Could not extract text from PDF", 

267 } 

268 else: 

269 try: 

270 content = result.content.decode("utf-8") 

271 except UnicodeDecodeError: 

272 return { 

273 "status": "error", 

274 "url": url, 

275 "source_type": source_name, 

276 "error": "Content is not valid UTF-8 and not a PDF", 

277 } 

278 

279 # Truncate if needed 

280 if max_length and len(content) > max_length: 

281 content = ( 

282 content[:max_length] + "\n\n[... content truncated ...]" 

283 ) 

284 

285 # Try to get metadata 

286 metadata = {} 

287 if hasattr(downloader, "get_metadata"): 

288 try: 

289 metadata = downloader.get_metadata(url) 

290 except Exception: 

291 logger.debug( 

292 "Failed to fetch metadata for {}", 

293 url, 

294 exc_info=True, 

295 ) 

296 

297 return { 

298 "status": "success", 

299 "content": content, 

300 "url": url, 

301 "source_type": source_name, 

302 "title": metadata.get("title"), 

303 "author": metadata.get("author"), 

304 "published_date": metadata.get("published_date"), 

305 } 

306 

307 return { 

308 "status": "error", 

309 "url": url, 

310 "source_type": source_name, 

311 "error": result.skip_reason or "Download failed", 

312 } 

313 

314 except Exception as e: 

315 logger.exception(f"Error fetching content from {url}") 

316 return { 

317 "status": "error", 

318 "url": url, 

319 "source_type": source_name, 

320 "error": str(e), 

321 } 

322 

323 def fetch_text( 

324 self, url: str, max_length: Optional[int] = None 

325 ) -> Optional[str]: 

326 """ 

327 Convenience method to fetch just the text content. 

328 

329 Args: 

330 url: The URL to fetch 

331 max_length: Maximum content length 

332 

333 Returns: 

334 Text content or None if failed 

335 """ 

336 result = self.fetch(url, max_length=max_length, prefer_text=True) 

337 if result.get("status") == "success": 

338 return result.get("content") 

339 return None 

340 

341 def fetch_batch(self, urls: List[str]) -> Dict[str, Optional[str]]: 

342 """Fetch multiple URLs, routing each to the best downloader. 

343 

344 Specialized downloaders (arXiv, PubMed, etc.) are tried first; 

345 generic HTML extraction is used as fallback. Downloaders are 

346 cached by URL type, so a single Playwright browser is shared 

347 across all HTML URLs. 

348 

349 Returns: 

350 Dict mapping URL → extracted text (or None if failed). 

351 """ 

352 return {url: self.fetch_text(url) for url in urls} 

353 

354 def get_url_info(self, url: str) -> Dict[str, Any]: 

355 """ 

356 Get information about a URL without downloading. 

357 

358 Args: 

359 url: The URL to analyze 

360 

361 Returns: 

362 Dict with url_type, source_name, and extracted_id 

363 """ 

364 url_type = URLClassifier.classify(url) 

365 return { 

366 "url": url, 

367 "url_type": url_type.value, 

368 "source_name": URLClassifier.get_source_name(url_type), 

369 "extracted_id": URLClassifier.extract_id(url, url_type), 

370 } 

371 

372 def close(self): 

373 """Close all cached downloaders and their HTTP sessions.""" 

374 for url_type, downloader in self._downloaders.items(): 

375 safe_close(downloader, f"downloader-{url_type.value}") 

376 self._downloaders.clear() 

377 

378 def __enter__(self): 

379 return self 

380 

381 def __exit__(self, exc_type, exc_val, exc_tb): 

382 self.close() 

383 return False