Coverage for src/local_deep_research/research_library/downloaders/extraction/readability

1"""

Mozilla Readability-based content extractor.

Uses readabilipy (Python wrapper around Readability.js) to extract the
main article content from a page, stripping navigation, sidebars, and
other non-article elements at the DOM level.

Uses Node.js for full Readability.js support when available;
readabilipy falls back to pure-Python mode automatically.

10"""

12from typing import Optional

14from loguru import logger

16from .base import BaseExtractor

class ReadabilityExtractor(BaseExtractor):
    """Article extractor backed by Mozilla Readability.js (via readabilipy).

    The result is cleaned HTML rather than plain text, so that later
    extraction stages (e.g. justext) can still see headings and other
    structural markup.
    """

    def extract(self, html: str) -> Optional[str]:
        """Return the main-article HTML found in *html*, or ``None``.

        Args:
            html: Raw HTML markup of the page.

        Returns:
            The ``"content"`` field of the readabilipy result when it is a
            non-blank string. ``None`` when the input is empty/blank, when
            readabilipy is not installed, when extraction raises, or when
            no usable content is produced.
        """
        # Nothing to do for missing or whitespace-only input.
        if not (html and html.strip()):
            return None

        # Imported lazily so that a missing optional dependency only
        # disables this extractor instead of breaking module import.
        try:
            from readabilipy import simple_json_from_html_string
        except ImportError:
            logger.warning(
                "readabilipy not installed — skipping Readability extraction"
            )
            return None

        try:
            # readabilipy itself probes for Node.js and silently switches
            # to its pure-Python mode when Node.js is unavailable.
            parsed = simple_json_from_html_string(html, use_readability=True)
        except Exception:
            # Best-effort boundary: log the traceback and report "no result".
            logger.exception("readabilipy extraction failed")
            return None

        if not parsed:
            return None

        # Only the HTML payload is returned. Plain-text fallbacks are
        # deliberately not used: downstream justext expects HTML, and the
        # pipeline has its own last-resort get_text() path.
        article_html = parsed.get("content")
        if (
            not article_html
            or not isinstance(article_html, str)
            or not article_html.strip()
        ):
            return None

        return article_html

Coverage for src/local_deep_research/research_library/downloaders/extraction/readability_extractor.py: 100%