Coverage for src/local_deep_research/research_library/downloaders/extraction/justext_extractor.py

"""
justext-based boilerplate removal extractor.

Uses statistical NLP (paragraph length, stopword density, link density)
to classify paragraphs as content vs. boilerplate.
"""

8from typing import Optional

10from loguru import logger

12from .base import BaseExtractor

class JustextExtractor(BaseExtractor):
    """Extractor that keeps only the paragraphs justext marks as content.

    The stoplist language is chosen at construction time; if justext has
    no stoplist under that name, the English stoplist is used instead.
    """

    def __init__(self, language: str = "English"):
        """Remember the stoplist language name to hand to justext."""
        self.language = language

    def extract(self, html: str) -> Optional[str]:
        """Return the non-boilerplate text of ``html``.

        Kept paragraphs are joined with blank lines; paragraphs flagged
        as headings are emitted as ``## <text>`` lines.

        Returns:
            The joined text, or ``None`` when ``html`` is empty or only
            whitespace, when justext cannot be imported, when justext
            raises while processing the document, or when no paragraph
            with text survives boilerplate removal.
        """
        if not (html and html.strip()):
            return None

        # justext is imported lazily so a missing package only disables
        # this extractor instead of failing at module import time.
        try:
            import justext
        except ImportError:
            logger.warning(
                "justext not installed — skipping justext extraction"
            )
            return None

        try:
            stopwords = justext.get_stoplist(self.language)
        except ValueError:
            logger.warning(
                f"justext stoplist not found for '{self.language}', "
                "falling back to English"
            )
            stopwords = justext.get_stoplist("English")

        try:
            classified = justext.justext(html, stopwords)
        except Exception:
            logger.exception("justext extraction failed")
            return None

        pieces = []
        for paragraph in classified:
            if paragraph.is_boilerplate:
                continue
            body = paragraph.text.strip()
            if not body:
                continue
            pieces.append(f"\n## {body}\n" if paragraph.is_heading else body)

        joined = "\n\n".join(pieces)
        if not joined.strip():
            return None
        return joined

Coverage for src/local_deep_research/research_library/downloaders/extraction/justext_extractor.py: 100%