Coverage for src/local_deep_research/research_library/downloaders/extraction/trafilatura_extractor.py

1"""

2Trafilatura-based content extractor.

4Uses statistical + rule-based heuristics for boilerplate removal with

5built-in language detection and optional markdown output. Trafilatura

6internally falls back to readability and justext when its primary

7heuristic fails, making it a strong standalone or first-pass extractor.

8"""

10from typing import Optional

12from loguru import logger

14from .base import BaseExtractor

class TrafilaturaExtractor(BaseExtractor):
    """Extract content using trafilatura.

    The constructor arguments are stored on the instance and forwarded
    unchanged, as same-named keyword arguments, to ``trafilatura.extract``
    each time :meth:`extract` is called.
    """

    def __init__(
        self,
        output_format: str = "markdown",
        include_tables: bool = True,
        include_links: bool = False,
        include_comments: bool = False,
        include_formatting: bool = True,
    ):
        """Store the options later passed to ``trafilatura.extract``.

        Args:
            output_format: Value forwarded as ``output_format``.
            include_tables: Value forwarded as ``include_tables``.
            include_links: Value forwarded as ``include_links``.
            include_comments: Value forwarded as ``include_comments``.
            include_formatting: Value forwarded as ``include_formatting``.
        """
        self.output_format = output_format
        self.include_tables = include_tables
        self.include_links = include_links
        self.include_comments = include_comments
        self.include_formatting = include_formatting

    def extract(self, html: str) -> Optional[str]:
        """Run trafilatura on *html* and return the extracted text.

        Args:
            html: Raw HTML document to extract content from.

        Returns:
            The string produced by ``trafilatura.extract`` (not stripped),
            or ``None`` when *html* is empty or whitespace-only, when
            trafilatura cannot be imported, when extraction raises, or when
            the result is empty or whitespace-only.
        """
        if not html or not html.strip():
            return None

        # Imported lazily so that a missing package yields a warning and a
        # ``None`` result instead of an ImportError when this module loads.
        try:
            import trafilatura
        except ImportError:
            logger.warning("trafilatura not installed — skipping extraction")
            return None

        # Best-effort boundary: any failure inside trafilatura is logged with
        # its traceback and reported to the caller as "no content".
        try:
            result = trafilatura.extract(
                html,
                output_format=self.output_format,
                include_tables=self.include_tables,
                include_links=self.include_links,
                include_comments=self.include_comments,
                include_formatting=self.include_formatting,
            )
            return result if result and result.strip() else None
        except Exception:
            logger.exception("trafilatura extraction failed")
            return None

Coverage for src/local_deep_research/research_library/downloaders/extraction/trafilatura_extractor.py: 100%