Coverage for src / local_deep_research / research_library / downloaders / extraction / trafilatura_extractor.py: 69%
24 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
1"""
2Trafilatura-based content extractor.
4Uses statistical + rule-based heuristics for boilerplate removal with
5built-in language detection and optional markdown output. Trafilatura
6internally falls back to readability and justext when its primary
7heuristic fails, making it a strong standalone or first-pass extractor.
8"""
10from typing import Optional
12from loguru import logger
14from .base import BaseExtractor
class TrafilaturaExtractor(BaseExtractor):
    """Content extractor that delegates to ``trafilatura.extract``.

    The constructor arguments are stored on the instance and forwarded,
    under the same keyword names, on every call to :meth:`extract`.
    """

    def __init__(
        self,
        output_format: str = "markdown",
        include_tables: bool = True,
        include_links: bool = False,
        include_comments: bool = False,
        include_formatting: bool = True,
    ):
        """Store the options later handed to ``trafilatura.extract``.

        Args:
            output_format: Passed as ``output_format``.
            include_tables: Passed as ``include_tables``.
            include_links: Passed as ``include_links``.
            include_comments: Passed as ``include_comments``.
            include_formatting: Passed as ``include_formatting``.
        """
        self.output_format = output_format
        self.include_tables = include_tables
        self.include_links = include_links
        self.include_comments = include_comments
        self.include_formatting = include_formatting

    def extract(self, html: str) -> Optional[str]:
        """Run trafilatura over ``html`` and return the extracted text.

        Args:
            html: Markup to extract content from.

        Returns:
            The extracted text, or ``None`` when ``html`` is empty or
            whitespace-only, trafilatura cannot be imported, the
            extraction raises, or the result is empty or whitespace-only.
        """
        # Nothing to do for missing or blank markup.
        if not (html and html.strip()):
            return None

        # Imported lazily so a missing package degrades to a warning
        # instead of failing when this module is imported.
        try:
            import trafilatura
        except ImportError:
            logger.warning("trafilatura not installed — skipping extraction")
            return None

        try:
            options = {
                "output_format": self.output_format,
                "include_tables": self.include_tables,
                "include_links": self.include_links,
                "include_comments": self.include_comments,
                "include_formatting": self.include_formatting,
            }
            extracted = trafilatura.extract(html, **options)
            # Normalise blank results to None so callers only need to
            # check for a single "no content" value.
            if extracted and extracted.strip():
                return extracted
            return None
        except Exception:
            # Best-effort boundary: record the traceback and report
            # "no content" rather than propagating the failure.
            logger.exception("trafilatura extraction failed")
            return None