Coverage for src / local_deep_research / research_library / downloaders / extraction / trafilatura_extractor.py: 69%

24 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1""" 

2Trafilatura-based content extractor. 

3 

4Uses statistical + rule-based heuristics for boilerplate removal with 

5built-in language detection and optional markdown output. Trafilatura 

6internally falls back to readability and justext when its primary 

7heuristic fails, making it a strong standalone or first-pass extractor. 

8""" 

9 

10from typing import Optional 

11 

12from loguru import logger 

13 

14from .base import BaseExtractor 

15 

16 

class TrafilaturaExtractor(BaseExtractor):
    """Content extractor that delegates to ``trafilatura.extract``.

    The constructor only records the extraction options; ``trafilatura``
    itself is imported lazily inside :meth:`extract`, so the class can be
    instantiated even when the package is not installed.
    """

    def __init__(
        self,
        output_format: str = "markdown",
        include_tables: bool = True,
        include_links: bool = False,
        include_comments: bool = False,
        include_formatting: bool = True,
    ):
        """Remember the options handed to ``trafilatura.extract``.

        Each argument is stored on the instance under its own name and is
        later forwarded as the same-named keyword argument.

        Args:
            output_format: Forwarded as ``output_format``.
            include_tables: Forwarded as ``include_tables``.
            include_links: Forwarded as ``include_links``.
            include_comments: Forwarded as ``include_comments``.
            include_formatting: Forwarded as ``include_formatting``.
        """
        self.output_format = output_format
        # Boolean switches, kept verbatim for the extract() call.
        self.include_tables = include_tables
        self.include_links = include_links
        self.include_comments = include_comments
        self.include_formatting = include_formatting

    def extract(self, html: str) -> Optional[str]:
        """Run trafilatura over *html* and return the extracted content.

        Args:
            html: Markup to extract content from.

        Returns:
            The extracted string, or ``None`` when *html* is empty or
            whitespace-only, when ``trafilatura`` cannot be imported, when
            the extraction raises, or when the result is empty or
            whitespace-only.
        """
        # Nothing to do for missing or blank input.
        if not (html and html.strip()):
            return None

        # Imported here so a missing package degrades to a warning.
        try:
            import trafilatura
        except ImportError:
            logger.warning("trafilatura not installed — skipping extraction")
            return None

        try:
            # Built inside the try so any failure while reading the options
            # is logged and swallowed together with extraction errors.
            options = {
                "output_format": self.output_format,
                "include_tables": self.include_tables,
                "include_links": self.include_links,
                "include_comments": self.include_comments,
                "include_formatting": self.include_formatting,
            }
            extracted = trafilatura.extract(html, **options)
            if extracted and extracted.strip():
                return extracted
        except Exception:
            # Best-effort extractor: record the traceback, report "no content".
            logger.exception("trafilatura extraction failed")

        # Reached on an empty/blank result or after a logged failure.
        return None