Coverage for src/local_deep_research/research_library/downloaders/extraction/readability_extractor.py: 66%
23 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
1"""
2Mozilla Readability-based content extractor.
4Uses readabilipy (Python wrapper around Readability.js) to extract the
5main article content from a page, stripping navigation, sidebars, and
6other non-article elements at the DOM level.
8Uses Node.js for full Readability.js support when available;
9readabilipy falls back to pure-Python mode automatically.
10"""
12from typing import Optional
14from loguru import logger
16from .base import BaseExtractor
class ReadabilityExtractor(BaseExtractor):
    """Readability.js-backed extractor built on the readabilipy wrapper.

    The result is cleaned HTML rather than plain text, so that later
    extractors such as justext can still see headings and structure.
    """

    def extract(self, html: str) -> Optional[str]:
        """Return the main-article HTML found in *html*, or ``None``.

        ``None`` is returned when the input is empty or whitespace-only,
        when readabilipy cannot be imported, when the extraction call
        raises, or when the result carries no non-blank string under
        the ``"content"`` key.
        """
        if not (html and html.strip()):
            return None

        # Imported lazily: a missing readabilipy only disables this
        # extractor (warning + None) instead of failing at import time.
        try:
            from readabilipy import simple_json_from_html_string
        except ImportError:
            logger.warning(
                "readabilipy not installed — skipping Readability extraction"
            )
            return None

        try:
            # Node.js detection and the pure-Python fallback are left to
            # readabilipy itself; nothing is probed here.
            parsed = simple_json_from_html_string(html, use_readability=True)
        except Exception:
            logger.exception("readabilipy extraction failed")
            return None

        if not parsed:
            return None

        # Only the HTML payload is handed back. Plain-text fallbacks are
        # deliberately not attempted: justext downstream expects HTML,
        # and the pipeline has its own last-resort get_text() path.
        body_html = parsed.get("content")
        usable = (
            bool(body_html)
            and isinstance(body_html, str)
            and bool(body_html.strip())
        )
        return body_html if usable else None