Coverage for src / local_deep_research / research_library / downloaders / extraction / newspaper_extractor.py: 68%
23 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
1"""
2newspaper4k-based content extractor.
4Uses newspaper4k's article parser which combines structural analysis
5with NLP heuristics. Particularly strong on news front pages and
6multi-answer forum threads where it extracts more content than
7trafilatura. Run in parallel with the primary extractor and the
8longer result wins.
9"""
11from typing import Optional
13from loguru import logger
15from .base import BaseExtractor
class NewspaperExtractor(BaseExtractor):
    """Extract content using newspaper4k's article parser."""

    def extract(self, html: str, url: str = "") -> Optional[str]:
        """Return the article text parsed from ``html``, or ``None``.

        Args:
            html: Raw HTML of the page; passed to ``Article.download`` as
                ``input_html``.
            url: URL associated with the page. When empty, the placeholder
                ``https://example.com`` is passed to ``Article`` instead.

        Returns:
            The extracted text, or ``None`` when ``html`` is empty or
            whitespace-only, when newspaper4k cannot be imported, when
            parsing raises, or when the extracted text is empty or
            whitespace-only.
        """
        if not html or not html.strip():
            return None

        # Imported lazily so a missing newspaper4k install only disables
        # this extractor instead of failing at module import time.
        try:
            from newspaper import Article, Config
        except ImportError:
            logger.debug(
                "newspaper4k not installed — skipping newspaper extraction"
            )
            return None

        try:
            # Disable image fetching to prevent SSRF: parse() calls
            # fetch_images() by default, which makes outbound requests
            # for every <img src="..."> in the HTML — attacker-controlled
            # content could point these at internal services.
            cfg = Config()
            cfg.fetch_images = False
            article = Article(url or "https://example.com", config=cfg)
            article.download(input_html=html)
            article.parse()
            text = article.text
            return text if text and text.strip() else None
        except Exception:
            # Best-effort extractor: any parser failure means "no result".
            # loguru does not honour the stdlib ``exc_info=True`` keyword;
            # the traceback must be requested via ``opt(exception=True)``.
            logger.opt(exception=True).debug("newspaper4k extraction failed")
            return None