Coverage for src / local_deep_research / research_library / downloaders / extraction / newspaper_extractor.py: 68%

23 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1""" 

2newspaper4k-based content extractor. 

3 

4Uses newspaper4k's article parser which combines structural analysis 

5with NLP heuristics. Particularly strong on news front pages and 

6multi-answer forum threads where it extracts more content than 

7trafilatura. Run in parallel with the primary extractor and the 

8longer result wins. 

9""" 

10 

11from typing import Optional 

12 

13from loguru import logger 

14 

15from .base import BaseExtractor 

16 

17 

class NewspaperExtractor(BaseExtractor):
    """Extract content using newspaper4k's article parser."""

    def extract(self, html: str, url: str = "") -> Optional[str]:
        """Return the article text newspaper4k extracts from *html*.

        Args:
            html: Raw HTML of the page. It is supplied to the parser via
                ``Article.download(input_html=...)``.
            url: Source URL passed to ``Article``. When empty, the
                placeholder ``https://example.com`` is used instead.

        Returns:
            The extracted text, or ``None`` when *html* is empty or
            whitespace-only, newspaper4k cannot be imported, extraction
            raises, or the extracted text is empty or whitespace-only.
        """
        if not html or not html.strip():
            return None

        # Imported lazily so that a missing newspaper4k install only
        # disables this extractor instead of breaking module import.
        try:
            from newspaper import Article, Config
        except ImportError:
            logger.debug(
                "newspaper4k not installed — skipping newspaper extraction"
            )
            return None

        try:
            # Disable image fetching to prevent SSRF: parse() calls
            # fetch_images() by default, which makes outbound requests
            # for every <img src="..."> in the HTML — attacker-controlled
            # content could point these at internal services.
            cfg = Config()
            cfg.fetch_images = False
            article = Article(url or "https://example.com", config=cfg)
            article.download(input_html=html)
            article.parse()
            text = article.text
            return text if text and text.strip() else None
        except Exception:
            # loguru does not honor the stdlib-style ``exc_info=True``
            # keyword (it is treated as a plain format/extra argument, so
            # the traceback is lost). ``opt(exception=True)`` is the loguru
            # way to attach the active exception to the log record.
            logger.opt(exception=True).debug("newspaper4k extraction failed")
            return None