Coverage for src / local_deep_research / research_library / downloaders / extraction / readability_extractor.py: 66%

23 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1""" 

2Mozilla Readability-based content extractor. 

3 

4Uses readabilipy (Python wrapper around Readability.js) to extract the 

5main article content from a page, stripping navigation, sidebars, and 

6other non-article elements at the DOM level. 

7 

8Uses Node.js for full Readability.js support when available; 

9readabilipy falls back to pure-Python mode automatically. 

10""" 

11 

12from typing import Optional 

13 

14from loguru import logger 

15 

16from .base import BaseExtractor 

17 

18 

class ReadabilityExtractor(BaseExtractor):
    """Readability.js-backed extractor built on the readabilipy package.

    The result is cleaned article HTML rather than plain text, which
    keeps headings and structure available to later extractors such
    as justext.
    """

    def extract(self, html: str) -> Optional[str]:
        """Return the main-article HTML found in *html*, or ``None``.

        ``None`` is returned when the input is empty or whitespace-only,
        when readabilipy cannot be imported, when readabilipy raises, or
        when its result holds no non-blank string under ``"content"``.
        """
        if not (html and html.strip()):
            return None

        # Imported at call time: a missing readabilipy is logged as a
        # warning and treated as "no result" instead of raising.
        try:
            from readabilipy import simple_json_from_html_string
        except ImportError:
            logger.warning(
                "readabilipy not installed — skipping Readability extraction"
            )
            return None

        # readabilipy decides internally whether Node.js is usable and
        # otherwise switches to its pure-Python mode on its own.
        try:
            parsed = simple_json_from_html_string(html, use_readability=True)
        except Exception:
            logger.exception("readabilipy extraction failed")
            return None

        if not parsed:
            return None

        # Only the HTML payload is handed back. Plain-text fallbacks are
        # deliberately not attempted: justext expects HTML, and the
        # pipeline already has its own last-resort get_text() path.
        body = parsed.get("content")
        if not body or not isinstance(body, str) or not body.strip():
            return None
        return body