Coverage for src/local_deep_research/research_library/downloaders/extraction/justext_extractor.py: 100%
32 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
1"""
2justext-based boilerplate removal extractor.
4Uses statistical NLP (paragraph length, stopword density, link density)
5to classify paragraphs as content vs. boilerplate.
6"""
8from typing import Optional
10from loguru import logger
12from .base import BaseExtractor
class JustextExtractor(BaseExtractor):
    """Boilerplate-stripping text extractor backed by the justext library.

    Paragraphs that justext does not flag as boilerplate are kept; kept
    headings are rendered with a leading ``## `` marker.
    """

    def __init__(self, language: str = "English"):
        """Store the stoplist language used for extraction.

        Args:
            language: Name handed to ``justext.get_stoplist`` when
                ``extract`` runs.
        """
        self.language = language

    def extract(self, html: str) -> Optional[str]:
        """Return the non-boilerplate text of *html*, or ``None``.

        Args:
            html: Raw HTML markup to process.

        Returns:
            The kept paragraphs joined by blank lines, or ``None`` when the
            input is empty/whitespace-only, justext cannot be imported,
            justext raises while processing, or nothing but whitespace
            survives the filtering.
        """
        if not (html and html.strip()):
            return None

        # Imported here rather than at module level so that a missing
        # justext package only results in a warning and a None result.
        try:
            import justext
        except ImportError:
            logger.warning(
                "justext not installed — skipping justext extraction"
            )
            return None

        # A ValueError from the stoplist lookup triggers the English fallback.
        try:
            stoplist = justext.get_stoplist(self.language)
        except ValueError:
            logger.warning(
                f"justext stoplist not found for '{self.language}', "
                "falling back to English"
            )
            stoplist = justext.get_stoplist("English")

        # Any failure inside justext is logged with its traceback and
        # reported to the caller as "no content".
        try:
            paragraphs = justext.justext(html, stoplist)
        except Exception:
            logger.exception("justext extraction failed")
            return None

        kept = []
        for para in paragraphs:
            if para.is_boilerplate:
                continue
            body = para.text.strip()
            if not body:
                continue
            # Headings get a "## " prefix and are padded with newlines.
            kept.append(f"\n## {body}\n" if para.is_heading else body)

        joined = "\n\n".join(kept)
        if not joined.strip():
            return None
        return joined