Coverage for src/local_deep_research/research_library/downloaders/extraction/newspaper

"""
newspaper4k-based content extractor.

Uses newspaper4k's article parser which combines structural analysis
with NLP heuristics. Particularly strong on news front pages and
multi-answer forum threads where it extracts more content than
trafilatura. Run in parallel with the primary extractor and the
longer result wins.
"""

11from typing import Optional

13from loguru import logger

15from .base import BaseExtractor

class NewspaperExtractor(BaseExtractor):
    """Extract content using newspaper4k's article parser."""

    def extract(self, html: str, url: str = "") -> Optional[str]:
        """Return the article text newspaper4k extracts from *html*.

        Args:
            html: Raw HTML of the page. Empty or whitespace-only input
                short-circuits to ``None``.
            url: URL the HTML came from. When empty, a placeholder URL is
                handed to ``Article`` instead.

        Returns:
            The extracted text, or ``None`` when the input is blank,
            newspaper4k is not installed, extraction raises, or the
            extracted text is empty/whitespace-only.
        """
        if not html or not html.strip():
            return None

        # Imported lazily so a missing newspaper4k install only disables
        # this extractor rather than breaking module import.
        try:
            from newspaper import Article, Config
        except ImportError:
            logger.debug(
                "newspaper4k not installed — skipping newspaper extraction"
            )
            return None

        try:
            # Disable image fetching to prevent SSRF: parse() calls
            # fetch_images() by default, which makes outbound requests
            # for every <img src="..."> in the HTML — attacker-controlled
            # content could point these at internal services.
            cfg = Config()
            cfg.fetch_images = False
            article = Article(url or "https://example.com", config=cfg)
            # input_html supplies the markup directly, so the already
            # downloaded page is parsed as-is.
            article.download(input_html=html)
            article.parse()
            text = article.text
            return text if text and text.strip() else None
        except Exception:
            # Best-effort extractor: any failure yields None. loguru does
            # not honour the stdlib ``exc_info=True`` keyword, so the
            # traceback is attached via ``opt(exception=True)``.
            logger.opt(exception=True).debug("newspaper4k extraction failed")
            return None

Coverage for src/local_deep_research/research_library/downloaders/extraction/newspaper_extractor.py: 100%