Coverage for src / local_deep_research / content_fetcher / fetcher.py: 92%

129 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1""" 

2Unified Content Fetcher. 

3 

4Provides a single interface to fetch content from various sources: 

5- Academic papers (arXiv, PubMed, Semantic Scholar) 

6- Web pages (HTML) 

7- Direct PDF links 

8""" 

9 

10from typing import Any, Dict, List, Optional 

11from loguru import logger 

12 

13from .url_classifier import URLClassifier, URLType 

14from ..research_library.downloaders.base import ContentType 

15from ..security.ssrf_validator import validate_url 

16from ..utilities.resource_utils import safe_close 

17 

18# Default maximum content length (500KB of text) 

19DEFAULT_MAX_CONTENT_LENGTH = 500_000 

20 

21# URL types where HTML fallback is pointless when the specialized downloader fails 

22_NO_HTML_FALLBACK = {URLType.HTML, URLType.DOI, URLType.INVALID, URLType.PDF} 

23 

24 

class ContentFetcher:
    """
    Unified content fetcher that routes to appropriate downloaders.

    Automatically detects URL type and uses the best downloader.
    """

    def __init__(self, timeout: int = 30, language: str = "English"):
        """
        Create a fetcher with its own (initially empty) downloader cache.

        Args:
            timeout: Request timeout in seconds
            language: Language for justext stoplist (passed to HTML downloader)
        """
        self.timeout = timeout
        self.language = language
        # Lazily-populated cache: one downloader instance per URLType,
        # released by close().
        self._downloaders: Dict[URLType, Any] = {}

43 

44 def _get_downloader(self, url_type: URLType): 

45 """Get or create the appropriate downloader for a URL type.""" 

46 if url_type in self._downloaders: 

47 return self._downloaders[url_type] 

48 

49 downloader: Any = None 

50 

51 if url_type == URLType.ARXIV: 

52 try: 

53 from ..research_library.downloaders.arxiv import ArxivDownloader 

54 

55 downloader = ArxivDownloader(timeout=self.timeout) 

56 except ImportError: 

57 logger.warning("ArxivDownloader not available") 

58 

59 elif url_type in (URLType.PUBMED, URLType.PMC): 

60 try: 

61 from ..research_library.downloaders.pubmed import ( 

62 PubMedDownloader, 

63 ) 

64 

65 downloader = PubMedDownloader(timeout=self.timeout) 

66 except ImportError: 

67 logger.warning("PubMedDownloader not available") 

68 

69 elif url_type == URLType.SEMANTIC_SCHOLAR: 

70 try: 

71 from ..research_library.downloaders.semantic_scholar import ( 

72 SemanticScholarDownloader, 

73 ) 

74 

75 downloader = SemanticScholarDownloader(timeout=self.timeout) 

76 except ImportError: 

77 logger.warning("SemanticScholarDownloader not available") 

78 

79 elif url_type in (URLType.BIORXIV, URLType.MEDRXIV): 

80 try: 

81 from ..research_library.downloaders.biorxiv import ( 

82 BioRxivDownloader, 

83 ) 

84 

85 downloader = BioRxivDownloader(timeout=self.timeout) 

86 except ImportError: 

87 logger.warning("BioRxivDownloader not available") 

88 

89 elif url_type == URLType.PDF: 

90 try: 

91 from ..research_library.downloaders.direct_pdf import ( 

92 DirectPDFDownloader, 

93 ) 

94 

95 downloader = DirectPDFDownloader(timeout=self.timeout) 

96 except ImportError: 

97 logger.warning("DirectPDFDownloader not available") 

98 

99 elif url_type == URLType.HTML: 

100 try: 

101 from ..research_library.downloaders.playwright_html import ( 

102 AutoHTMLDownloader as HTMLDownloader, 

103 ) 

104 

105 downloader = HTMLDownloader( 

106 timeout=self.timeout, language=self.language 

107 ) 

108 except ImportError: 

109 logger.warning("HTMLDownloader not available") 

110 

111 elif url_type == URLType.DOI: 

112 # DOI URLs typically redirect to publisher pages 

113 # Use HTML downloader as fallback 

114 try: 

115 from ..research_library.downloaders.playwright_html import ( 

116 AutoHTMLDownloader as HTMLDownloader, 

117 ) 

118 

119 downloader = HTMLDownloader( 

120 timeout=self.timeout, language=self.language 

121 ) 

122 except ImportError: 

123 logger.warning("HTMLDownloader not available") 

124 

125 # Cache the downloader 

126 if downloader: 

127 self._downloaders[url_type] = downloader 

128 

129 return downloader 

130 

131 def fetch( 

132 self, 

133 url: str, 

134 max_length: Optional[int] = None, 

135 prefer_text: bool = True, 

136 ) -> Dict[str, Any]: 

137 """ 

138 Fetch content from a URL. 

139 

140 Automatically detects the URL type and uses the appropriate downloader. 

141 

142 Args: 

143 url: The URL to fetch content from 

144 max_length: Maximum content length to return (chars). Defaults to 500KB. 

145 prefer_text: If True, prefer text extraction over PDF download 

146 

147 Returns: 

148 Dict with: 

149 - status: "success" or "error" 

150 - content: Extracted text content 

151 - url: Original URL 

152 - source_type: Type of source (arxiv, pubmed, html, etc.) 

153 - title: Title if available 

154 - error: Error message if failed 

155 """ 

156 # Apply default max_length if not specified 

157 if max_length is None: 

158 max_length = DEFAULT_MAX_CONTENT_LENGTH 

159 

160 # Classify the URL 

161 url_type = URLClassifier.classify(url) 

162 source_name = URLClassifier.get_source_name(url_type) 

163 

164 # Reject invalid/dangerous URLs 

165 if url_type == URLType.INVALID: 

166 return { 

167 "status": "error", 

168 "url": url, 

169 "source_type": source_name, 

170 "error": "Invalid or unsupported URL scheme (only http/https allowed)", 

171 } 

172 

173 # SSRF validation: reject private/internal IPs before reaching downloaders 

174 if not validate_url(url): 

175 logger.warning(f"URL failed SSRF validation: {url}") 

176 return { 

177 "status": "error", 

178 "url": url, 

179 "source_type": source_name, 

180 "error": "URL failed security validation (blocked by SSRF protection)", 

181 } 

182 

183 logger.info(f"Fetching content from {url} (detected: {source_name})") 

184 

185 # Get the appropriate downloader 

186 downloader = self._get_downloader(url_type) 

187 

188 if not downloader: 

189 # Fall back to generic HTML downloader. This triggers when a 

190 # specialized downloader (ArXiv, SemanticScholar, etc.) failed 

191 # to import — playwright_html may still be available. 

192 # Use _get_downloader so the instance is cached and cleaned up 

193 # by close(). 

194 downloader = self._get_downloader(URLType.HTML) 

195 if not downloader: 

196 return { 

197 "status": "error", 

198 "url": url, 

199 "source_type": source_name, 

200 "error": "No suitable downloader available", 

201 } 

202 

203 # Determine content type 

204 content_type = ContentType.TEXT if prefer_text else ContentType.PDF 

205 

206 # Download content 

207 try: 

208 result = downloader.download_with_result(url, content_type) 

209 

210 # HTML fallback: when a specialized downloader fails (e.g. 

211 # arXiv PDF unavailable, PubMed paywalled), try generic HTML 

212 # extraction — the abstract/landing page often has useful content. 

213 if not result.is_success and url_type not in _NO_HTML_FALLBACK: 

214 logger.debug( 

215 f"Specialized downloader failed for {url}, " 

216 "trying HTML fallback" 

217 ) 

218 html_downloader = self._get_downloader(URLType.HTML) 

219 if html_downloader: 219 ↛ 228line 219 didn't jump to line 228 because the condition on line 219 was always true

220 result = html_downloader.download_with_result( 

221 url, content_type 

222 ) 

223 # Use the HTML downloader for metadata too, so we 

224 # don't call the failed specialized downloader's 

225 # get_metadata (which would re-fetch or return wrong data). 

226 downloader = html_downloader 

227 

228 if result.is_success and result.content: 

229 # Decode content — check PDF magic bytes first, then try 

230 # UTF-8, and reject anything that is neither. 

231 if result.content[:4] == b"%PDF": 

232 from ..research_library.downloaders.base import ( 

233 BaseDownloader, 

234 ) 

235 

236 content = BaseDownloader.extract_text_from_pdf( 

237 result.content 

238 ) 

239 if not content: 

240 return { 

241 "status": "error", 

242 "url": url, 

243 "source_type": source_name, 

244 "error": "Could not extract text from PDF", 

245 } 

246 else: 

247 try: 

248 content = result.content.decode("utf-8") 

249 except UnicodeDecodeError: 

250 return { 

251 "status": "error", 

252 "url": url, 

253 "source_type": source_name, 

254 "error": "Content is not valid UTF-8 and not a PDF", 

255 } 

256 

257 # Truncate if needed 

258 if max_length and len(content) > max_length: 

259 content = ( 

260 content[:max_length] + "\n\n[... content truncated ...]" 

261 ) 

262 

263 # Try to get metadata 

264 metadata = {} 

265 if hasattr(downloader, "get_metadata"): 

266 try: 

267 metadata = downloader.get_metadata(url) 

268 except Exception: 

269 logger.debug( 

270 "Failed to fetch metadata for {}", 

271 url, 

272 exc_info=True, 

273 ) 

274 

275 return { 

276 "status": "success", 

277 "content": content, 

278 "url": url, 

279 "source_type": source_name, 

280 "title": metadata.get("title"), 

281 "author": metadata.get("author"), 

282 "published_date": metadata.get("published_date"), 

283 } 

284 

285 return { 

286 "status": "error", 

287 "url": url, 

288 "source_type": source_name, 

289 "error": result.skip_reason or "Download failed", 

290 } 

291 

292 except Exception as e: 

293 logger.exception(f"Error fetching content from {url}") 

294 return { 

295 "status": "error", 

296 "url": url, 

297 "source_type": source_name, 

298 "error": str(e), 

299 } 

300 

301 def fetch_text( 

302 self, url: str, max_length: Optional[int] = None 

303 ) -> Optional[str]: 

304 """ 

305 Convenience method to fetch just the text content. 

306 

307 Args: 

308 url: The URL to fetch 

309 max_length: Maximum content length 

310 

311 Returns: 

312 Text content or None if failed 

313 """ 

314 result = self.fetch(url, max_length=max_length, prefer_text=True) 

315 if result.get("status") == "success": 

316 return result.get("content") 

317 return None 

318 

319 def fetch_batch(self, urls: List[str]) -> Dict[str, Optional[str]]: 

320 """Fetch multiple URLs, routing each to the best downloader. 

321 

322 Specialized downloaders (arXiv, PubMed, etc.) are tried first; 

323 generic HTML extraction is used as fallback. Downloaders are 

324 cached by URL type, so a single Playwright browser is shared 

325 across all HTML URLs. 

326 

327 Returns: 

328 Dict mapping URL → extracted text (or None if failed). 

329 """ 

330 return {url: self.fetch_text(url) for url in urls} 

331 

332 def get_url_info(self, url: str) -> Dict[str, Any]: 

333 """ 

334 Get information about a URL without downloading. 

335 

336 Args: 

337 url: The URL to analyze 

338 

339 Returns: 

340 Dict with url_type, source_name, and extracted_id 

341 """ 

342 url_type = URLClassifier.classify(url) 

343 return { 

344 "url": url, 

345 "url_type": url_type.value, 

346 "source_name": URLClassifier.get_source_name(url_type), 

347 "extracted_id": URLClassifier.extract_id(url, url_type), 

348 } 

349 

350 def close(self): 

351 """Close all cached downloaders and their HTTP sessions.""" 

352 for url_type, downloader in self._downloaders.items(): 

353 safe_close(downloader, f"downloader-{url_type.value}") 

354 self._downloaders.clear() 

355 

356 def __enter__(self): 

357 return self 

358 

359 def __exit__(self, exc_type, exc_val, exc_tb): 

360 self.close() 

361 return False