Coverage for src / local_deep_research / research_library / downloaders / extraction / justext_extractor.py: 58%
32 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
1"""
2justext-based boilerplate removal extractor.
4Uses statistical NLP (paragraph length, stopword density, link density)
5to classify paragraphs as content vs. boilerplate.
6"""
8from typing import Optional
10from loguru import logger
12from .base import BaseExtractor
class JustextExtractor(BaseExtractor):
    """Boilerplate-stripping extractor backed by the ``justext`` library.

    Paragraphs that justext does not flag as boilerplate are kept and
    joined with blank lines; paragraphs justext marks as headings are
    emitted as ``## `` lines.
    """

    def __init__(self, language: str = "English"):
        """Store the stoplist language used for later extractions.

        Args:
            language: Name handed to ``justext.get_stoplist``.
        """
        self.language = language

    def extract(self, html: str) -> Optional[str]:
        """Return the non-boilerplate text of *html*.

        Args:
            html: Raw HTML markup to process.

        Returns:
            The kept paragraphs joined by blank lines, or ``None`` when
            the input is empty/whitespace, ``justext`` cannot be imported,
            ``justext.justext`` raises, or nothing but whitespace remains.
        """
        if not (html and html.strip()):
            return None

        # Imported lazily: a missing justext install only yields a warning
        # and a None result instead of breaking module import.
        try:
            import justext
        except ImportError:
            logger.warning(
                "justext not installed — skipping justext extraction"
            )
            return None

        # An unknown language raises ValueError; retry with English.
        try:
            stopwords = justext.get_stoplist(self.language)
        except ValueError:
            logger.warning(
                f"justext stoplist not found for '{self.language}', "
                "falling back to English"
            )
            stopwords = justext.get_stoplist("English")

        # Any failure inside justext is logged and reported as "no content".
        try:
            classified = justext.justext(html, stopwords)
        except Exception:
            logger.exception("justext extraction failed")
            return None

        pieces = []
        for paragraph in classified:
            if paragraph.is_boilerplate:
                continue
            body = paragraph.text.strip()
            if not body:
                continue
            if paragraph.is_heading:
                pieces.append(f"\n## {body}\n")
            else:
                pieces.append(body)

        joined = "\n\n".join(pieces)
        if joined.strip():
            return joined
        return None