Coverage for src / local_deep_research / research_library / downloaders / extraction / justext_extractor.py: 58%

32 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1""" 

2justext-based boilerplate removal extractor. 

3 

4Uses statistical NLP (paragraph length, stopword density, link density) 

5to classify paragraphs as content vs. boilerplate. 

6""" 

7 

8from typing import Optional 

9 

10from loguru import logger 

11 

12from .base import BaseExtractor 

13 

14 

class JustextExtractor(BaseExtractor):
    """Boilerplate-stripping extractor backed by the ``justext`` library.

    Paragraphs that justext classifies as boilerplate are dropped; the
    remaining paragraphs are joined into a single text blob, with headings
    rendered as ``## `` lines.
    """

    def __init__(self, language: str = "English"):
        """Remember which justext stoplist language to use.

        Args:
            language: Name of the justext stoplist to load at extraction
                time. Defaults to ``"English"``.
        """
        self.language = language

    def extract(self, html: str) -> Optional[str]:
        """Return the non-boilerplate text found in *html*.

        Args:
            html: Raw HTML markup to process.

        Returns:
            The kept paragraphs joined by blank lines, or ``None`` when the
            input is empty/whitespace, ``justext`` cannot be imported,
            justext raises while processing, or nothing but boilerplate
            (or empty text) is left.
        """
        if not (html and html.strip()):
            return None

        # Imported lazily: a missing package only disables this extractor
        # (warning + None) instead of failing at module import time.
        try:
            import justext
        except ImportError:
            logger.warning(
                "justext not installed — skipping justext extraction"
            )
            return None

        # An unknown language is not fatal; the English stoplist is used.
        try:
            stopwords = justext.get_stoplist(self.language)
        except ValueError:
            logger.warning(
                f"justext stoplist not found for '{self.language}', "
                "falling back to English"
            )
            stopwords = justext.get_stoplist("English")

        # Best-effort boundary: any failure inside justext is logged with
        # its traceback and reported to the caller as "no content".
        try:
            classified = justext.justext(html, stopwords)
        except Exception:
            logger.exception("justext extraction failed")
            return None

        kept = []
        for para in classified:
            if para.is_boilerplate:
                continue
            body = para.text.strip()
            if not body:
                continue
            # Headings get a "## " prefix and are padded with newlines.
            kept.append(f"\n## {body}\n" if para.is_heading else body)

        joined = "\n\n".join(kept)
        return joined if joined.strip() else None