Coverage for src/local_deep_research/content

"""
Unified Content Fetcher.

Provides a single interface to fetch content from various sources:
- Academic papers (arXiv, PubMed, Semantic Scholar)
- Web pages (HTML)
- Direct PDF links
"""

10from typing import Any, Dict, List, Optional

11from loguru import logger

13from .url_classifier import URLClassifier, URLType

14from ..research_library.downloaders.base import ContentType

15from ..security.egress.fetch import policy_aware_validate_url

16from ..utilities.resource_utils import safe_close

# Default maximum content length, in characters of extracted text
# (500,000 characters; compared against len() of the decoded string).
DEFAULT_MAX_CONTENT_LENGTH = 500_000

# URL types where HTML fallback is pointless when the specialized downloader
# fails. A frozenset, because this is a shared module-level constant that is
# only ever used for membership tests and must not be mutated at runtime.
_NO_HTML_FALLBACK = frozenset(
    {URLType.HTML, URLType.DOI, URLType.INVALID, URLType.PDF}
)

class ContentFetcher:
    """
    Unified content fetcher that routes to appropriate downloaders.

    Automatically detects URL type and uses the best downloader.
    Downloaders are created lazily and cached per URL type; call
    ``close()`` (or use the instance as a context manager) to release them.
    """

    def __init__(
        self,
        timeout: int = 30,
        language: str = "English",
        enable_js_rendering: bool = False,
        egress_context: Any = None,
    ):
        """
        Initialize the content fetcher.

        Args:
            timeout: Request timeout in seconds
            language: Language for justext stoplist (passed to HTML downloader)
            enable_js_rendering: When True, the HTML/DOI downloader falls back
                to a headless browser (Crawl4AI/Playwright) for pages that need
                JavaScript to render. Defaults to False because the default
                Docker production image ships without Chromium and the fallback
                otherwise wastes work on every fetch. In limited (mostly
                accidental) internal benchmark comparisons between dev
                instances that happened to have Chromium installed and routine
                Docker runs that did not, JS rendering did not measurably
                improve research quality, and most regular benchmark runs are
                on Docker without Chromium anyway — so disabling by default
                does not regress observed quality. The user-facing toggle is
                the ``web.enable_javascript_rendering`` setting.
            egress_context: Optional ``EgressContext`` from the active
                policy. When present and the run's scope is
                ``PRIVATE_ONLY``, SSRF validation permits private IPs so
                local lab deployments (Ollama at 127.0.0.1, SearXNG on
                192.168.x) can actually be reached without forcing the
                operator to set SSRF_ALLOW_PRIVATE_IPS=1 globally.
        """
        self.timeout = timeout
        self.language = language
        self.enable_js_rendering = enable_js_rendering
        self.egress_context = egress_context
        # Lazily populated cache: one downloader instance per URL type.
        self._downloaders: Dict[URLType, Any] = {}

    @staticmethod
    def _error_result(
        url: str, source_name: str, message: str
    ) -> Dict[str, Any]:
        """Build the dict-shaped error payload returned by ``fetch``.

        Args:
            url: The URL the caller asked for (echoed back unchanged).
            source_name: Source name derived from the URL classification.
            message: Human-readable failure reason.

        Returns:
            Dict with ``status="error"``, ``url``, ``source_type`` and
            ``error`` keys.
        """
        return {
            "status": "error",
            "url": url,
            "source_type": source_name,
            "error": message,
        }

    def _get_downloader(self, url_type: URLType) -> Any:
        """Get or create the appropriate downloader for a URL type.

        Downloader modules are imported lazily so a missing optional
        dependency only disables that one source. Successfully created
        downloaders get the egress policy applied and are cached under
        ``url_type``; failures are not cached, so a later call retries.

        Args:
            url_type: Classification of the URL to be fetched.

        Returns:
            The downloader instance, or None when no downloader exists for
            ``url_type`` or its import raised ImportError.
        """
        if url_type in self._downloaders:
            return self._downloaders[url_type]

        downloader: Any = None

        if url_type == URLType.ARXIV:
            try:
                from ..research_library.downloaders.arxiv import ArxivDownloader

                downloader = ArxivDownloader(timeout=self.timeout)
            except ImportError:
                logger.warning("ArxivDownloader not available")

        elif url_type in (URLType.PUBMED, URLType.PMC):
            try:
                from ..research_library.downloaders.pubmed import (
                    PubMedDownloader,
                )

                downloader = PubMedDownloader(timeout=self.timeout)
            except ImportError:
                logger.warning("PubMedDownloader not available")

        elif url_type == URLType.SEMANTIC_SCHOLAR:
            try:
                from ..research_library.downloaders.semantic_scholar import (
                    SemanticScholarDownloader,
                )

                downloader = SemanticScholarDownloader(timeout=self.timeout)
            except ImportError:
                logger.warning("SemanticScholarDownloader not available")

        elif url_type in (URLType.BIORXIV, URLType.MEDRXIV):
            try:
                from ..research_library.downloaders.biorxiv import (
                    BioRxivDownloader,
                )

                downloader = BioRxivDownloader(timeout=self.timeout)
            except ImportError:
                logger.warning("BioRxivDownloader not available")

        elif url_type == URLType.PDF:
            try:
                from ..research_library.downloaders.direct_pdf import (
                    DirectPDFDownloader,
                )

                downloader = DirectPDFDownloader(timeout=self.timeout)
            except ImportError:
                logger.warning("DirectPDFDownloader not available")

        elif url_type in (URLType.HTML, URLType.DOI):
            # DOI URLs typically redirect to publisher pages, so they are
            # served by the HTML downloader as well. Each URL type still gets
            # its own cached instance (the cache key is url_type).
            try:
                from ..research_library.downloaders.playwright_html import (
                    AutoHTMLDownloader as HTMLDownloader,
                )

                downloader = HTMLDownloader(
                    timeout=self.timeout,
                    language=self.language,
                    enable_js_rendering=self.enable_js_rendering,
                )
            except ImportError:
                logger.warning("HTMLDownloader not available")

        # Cache the downloader
        if downloader:
            self._apply_egress_policy_to_downloader(downloader)
            self._downloaders[url_type] = downloader

        return downloader

    def _apply_egress_policy_to_downloader(self, downloader: Any) -> None:
        """Relax a downloader's SafeSession to allow private IPs when the
        active scope is PRIVATE_ONLY, mirroring ``policy_aware_validate_url``.

        Without this, a private/lab URL that ContentFetcher already approved
        (policy_aware_validate_url + evaluate_url both allow private hosts
        under PRIVATE_ONLY) is then rejected by the downloader's OWN strict
        SafeSession SSRF re-validation (allow_private_ips defaults to False),
        breaking PRIVATE_ONLY's documented "reach your local services" use
        case. ``SafeSession.request`` reads ``allow_private_ips`` per-request,
        so setting it post-construction takes effect. Cloud-metadata IPs stay
        blocked regardless (``is_ip_blocked`` always rejects them).

        Args:
            downloader: Downloader whose ``session`` attribute (if any) is
                adjusted in place. None is accepted and ignored.
        """
        if downloader is None or self.egress_context is None:
            return
        # Narrow except (not bare Exception): the only things that can throw
        # here are the policy import and the ctx.scope attribute access. The
        # failure direction is fail-SAFE — if we don't set allow_private_ips it
        # stays False (strict SSRF), i.e. over-restrictive, never a bypass — but
        # catching narrowly avoids masking an unrelated bug (e.g. a refactor
        # that breaks the import) while still never breaking a fetch on the
        # expected misconfiguration cases.
        try:
            from ..security.egress.policy import EgressScope

            scope = self.egress_context.scope
        except (ImportError, AttributeError):  # pragma: no cover - defensive
            # loguru ignores the stdlib-style ``exc_info=True`` keyword;
            # ``opt(exception=True)`` is what attaches the traceback.
            logger.opt(exception=True).debug(
                "could not resolve egress scope for downloader session"
            )
            return
        if scope == EgressScope.PRIVATE_ONLY:
            session = getattr(downloader, "session", None)
            if session is not None and hasattr(session, "allow_private_ips"):
                session.allow_private_ips = True

    def fetch(
        self,
        url: str,
        max_length: Optional[int] = None,
        prefer_text: bool = True,
    ) -> Dict[str, Any]:
        """
        Fetch content from a URL.

        Automatically detects the URL type and uses the appropriate downloader.
        Failures are reported through the returned dict; exceptions raised
        while downloading are caught, logged and converted to an error result.

        Args:
            url: The URL to fetch content from
            max_length: Maximum content length to return (chars). Defaults to
                ``DEFAULT_MAX_CONTENT_LENGTH``. A value of 0 disables
                truncation.
            prefer_text: If True, prefer text extraction over PDF download

        Returns:
            Dict with:
                - status: "success" or "error"
                - content: Extracted text content (success only)
                - url: Original URL
                - source_type: Type of source (arxiv, pubmed, html, etc.)
                - title: Title if available (success only)
                - author: Author if available (success only)
                - published_date: Publication date if available (success only)
                - error: Error message if failed (error only)
        """
        # Apply default max_length if not specified
        if max_length is None:
            max_length = DEFAULT_MAX_CONTENT_LENGTH

        # Classify the URL
        url_type = URLClassifier.classify(url)
        source_name = URLClassifier.get_source_name(url_type)

        # Reject invalid/dangerous URLs
        if url_type == URLType.INVALID:
            return self._error_result(
                url,
                source_name,
                "Invalid or unsupported URL scheme (only http/https allowed)",
            )

        # SSRF validation: reject private/internal IPs before reaching downloaders.
        # Policy-aware so PRIVATE_ONLY egress scope can actually reach
        # private hosts (lab deployments) without disabling SSRF globally.
        if not policy_aware_validate_url(url, self.egress_context):
            from ..security.ssrf_validator import redact_url_for_log

            # Redact, as the egress-denial log below does: a rejected URL may
            # carry userinfo creds / API-key query params.
            logger.warning(
                f"URL failed SSRF validation: {redact_url_for_log(url)}"
            )
            return self._error_result(
                url,
                source_name,
                "URL failed security validation (blocked by SSRF protection)",
            )

        # Egress policy: reject URLs that are SSRF-OK but scope-incompatible.
        # SSRF only checks the IP class; scope enforcement is a separate axis
        # (PRIVATE_ONLY blocks public hosts, PUBLIC_ONLY blocks private hosts).
        # Centralized here so all callers (LangGraph fetch tool, MCP
        # download_content, future tools) get uniform enforcement instead of
        # each remembering to wrap the call site.
        if self.egress_context is not None:
            from ..security.egress.policy import evaluate_url
            from ..security.ssrf_validator import redact_url_for_log

            url_decision = evaluate_url(url, self.egress_context)
            if not url_decision.allowed:
                logger.bind(policy_audit=True).warning(
                    "fetch URL denied by egress policy",
                    # Redact: a denied URL may carry userinfo creds / API-key
                    # query params; log only scheme://host:port.
                    url=redact_url_for_log(url),
                    scope=self.egress_context.scope.value,
                    reason=url_decision.reason,
                )
                # Return a structured error rather than raise — callers in
                # the fetch path already handle dict-shaped errors.
                return self._error_result(
                    url,
                    source_name,
                    f"URL refused by egress policy ({url_decision.reason})",
                )

        logger.info(f"Fetching content from {url} (detected: {source_name})")

        # Get the appropriate downloader
        downloader = self._get_downloader(url_type)

        if not downloader:
            # Fall back to generic HTML downloader. This triggers when a
            # specialized downloader (ArXiv, SemanticScholar, etc.) failed
            # to import — playwright_html may still be available.
            # Use _get_downloader so the instance is cached and cleaned up
            # by close().
            downloader = self._get_downloader(URLType.HTML)
            if not downloader:
                return self._error_result(
                    url, source_name, "No suitable downloader available"
                )

        # Determine content type
        content_type = ContentType.TEXT if prefer_text else ContentType.PDF

        # Download content
        try:
            result = downloader.download_with_result(url, content_type)

            # HTML fallback: when a specialized downloader fails (e.g.
            # arXiv PDF unavailable, PubMed paywalled), try generic HTML
            # extraction — the abstract/landing page often has useful content.
            if not result.is_success and url_type not in _NO_HTML_FALLBACK:
                html_downloader = self._get_downloader(URLType.HTML)
                # Skip when the downloader that just failed already IS the
                # HTML downloader (import-failure fallback above): repeating
                # the identical request would only double the wait.
                if html_downloader and html_downloader is not downloader:
                    logger.debug(
                        f"Specialized downloader failed for {url}, "
                        "trying HTML fallback"
                    )
                    result = html_downloader.download_with_result(
                        url, content_type
                    )
                    # Use the HTML downloader for metadata too, so we
                    # don't call the failed specialized downloader's
                    # get_metadata (which would re-fetch or return wrong data).
                    downloader = html_downloader

            if result.is_success and result.content:
                # Decode content — check PDF magic bytes first, then try
                # UTF-8, and reject anything that is neither.
                if result.content[:4] == b"%PDF":
                    from ..research_library.downloaders.base import (
                        BaseDownloader,
                    )

                    content = BaseDownloader.extract_text_from_pdf(
                        result.content
                    )
                    if not content:
                        return self._error_result(
                            url, source_name, "Could not extract text from PDF"
                        )
                else:
                    try:
                        content = result.content.decode("utf-8")
                    except UnicodeDecodeError:
                        return self._error_result(
                            url,
                            source_name,
                            "Content is not valid UTF-8 and not a PDF",
                        )

                # Truncate if needed
                if max_length and len(content) > max_length:
                    content = (
                        content[:max_length] + "\n\n[... content truncated ...]"
                    )

                # Try to get metadata (best effort: content is already in
                # hand, so a metadata failure must not fail the fetch).
                metadata: Dict[str, Any] = {}
                if hasattr(downloader, "get_metadata"):
                    try:
                        # ``or {}`` guards against a downloader returning
                        # None, which would otherwise blow up on .get() below
                        # and turn a successful download into an error.
                        metadata = downloader.get_metadata(url) or {}
                    except Exception:
                        # loguru needs opt(exception=True) to log the
                        # traceback; ``exc_info=True`` is silently ignored.
                        logger.opt(exception=True).debug(
                            "Failed to fetch metadata for {}",
                            url,
                        )

                return {
                    "status": "success",
                    "content": content,
                    "url": url,
                    "source_type": source_name,
                    "title": metadata.get("title"),
                    "author": metadata.get("author"),
                    "published_date": metadata.get("published_date"),
                }

            return self._error_result(
                url, source_name, result.skip_reason or "Download failed"
            )

        except Exception as e:
            # Top-level boundary: callers expect a dict, never an exception.
            logger.exception(f"Error fetching content from {url}")
            return self._error_result(url, source_name, str(e))

    def fetch_text(
        self, url: str, max_length: Optional[int] = None
    ) -> Optional[str]:
        """
        Convenience method to fetch just the text content.

        Args:
            url: The URL to fetch
            max_length: Maximum content length

        Returns:
            Text content or None if failed
        """
        result = self.fetch(url, max_length=max_length, prefer_text=True)
        if result.get("status") == "success":
            return result.get("content")
        return None

    def fetch_batch(self, urls: List[str]) -> Dict[str, Optional[str]]:
        """Fetch multiple URLs, routing each to the best downloader.

        Specialized downloaders (arXiv, PubMed, etc.) are tried first;
        generic HTML extraction is used as fallback. Downloaders are
        cached by URL type, so a single Playwright browser is shared
        across all HTML URLs.

        Args:
            urls: URLs to fetch, processed sequentially in the given order.

        Returns:
            Dict mapping URL → extracted text (or None if failed).
        """
        return {url: self.fetch_text(url) for url in urls}

    def get_url_info(self, url: str) -> Dict[str, Any]:
        """
        Get information about a URL without downloading.

        Args:
            url: The URL to analyze

        Returns:
            Dict with url, url_type, source_name, and extracted_id
        """
        url_type = URLClassifier.classify(url)
        return {
            "url": url,
            "url_type": url_type.value,
            "source_name": URLClassifier.get_source_name(url_type),
            "extracted_id": URLClassifier.extract_id(url, url_type),
        }

    def close(self) -> None:
        """Close all cached downloaders and their HTTP sessions."""
        for url_type, downloader in self._downloaders.items():
            safe_close(downloader, f"downloader-{url_type.value}")
        self._downloaders.clear()

    def __enter__(self):
        """Enter the context manager; returns this fetcher."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close all downloaders on exit; never suppresses exceptions."""
        self.close()
        return False

Coverage for src/local_deep_research/content_fetcher/fetcher.py: 98%

149 statements