Coverage for src / local_deep_research / research_library / downloaders / extraction / __init__.py: 100%

8 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1""" 

2Pluggable HTML content extraction strategies. 

3 

4Extractors can be composed in a pipeline: e.g. Readability first 

5(structural DOM scoping), then justext (statistical boilerplate removal). 

6""" 

7 

8from .base import BaseExtractor 

9from .justext_extractor import JustextExtractor 

10from .metadata_extractor import extract_metadata, metadata_to_text 

11from .newspaper_extractor import NewspaperExtractor 

12from .pipeline import ( 

13 batch_fetch_and_extract, 

14 extract_content, 

15 extract_content_with_metadata, 

16 fetch_and_extract, 

17) 

18from .readability_extractor import ReadabilityExtractor 

19from .trafilatura_extractor import TrafilaturaExtractor 

20 

# Public surface of this package: the names bound by a star-import.
# Every name imported above appears here exactly once; keep the two in sync
# when an extractor or pipeline helper is added or removed.
__all__ = [
    # Re-exported from the per-strategy submodules (.base, .justext_extractor,
    # .newspaper_extractor, .readability_extractor, .trafilatura_extractor).
    "BaseExtractor",
    "JustextExtractor",
    "NewspaperExtractor",
    "ReadabilityExtractor",
    "TrafilaturaExtractor",
    # Re-exported from .pipeline and .metadata_extractor.
    "batch_fetch_and_extract",
    "extract_content",
    "extract_content_with_metadata",
    "extract_metadata",
    "fetch_and_extract",
    "metadata_to_text",
]