Coverage for src / local_deep_research / document_loaders / loader_registry.py: 76%
94 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-25 01:07 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-25 01:07 +0000
1"""
2Document loader registry - maps file extensions to LangChain loaders.
4This module provides a centralized registry for document loaders that can be used
5by both collection uploads (bytes) and local search (file paths).
6"""
8from pathlib import Path
9from typing import Optional
11from langchain_community.document_loaders import (
12 CSVLoader,
13 EverNoteLoader,
14 MHTMLLoader,
15 NotebookLoader,
16 PyPDFLoader,
17 TextLoader,
18 TomlLoader,
19 UnstructuredEmailLoader,
20 UnstructuredExcelLoader,
21 UnstructuredHTMLLoader,
22 UnstructuredMarkdownLoader,
23 UnstructuredPowerPointLoader,
24 UnstructuredWordDocumentLoader,
25 UnstructuredXMLLoader,
26)
27from langchain_core.document_loaders import BaseLoader
28from loguru import logger
# Optional loaders. Per the original notes these depend on external system
# tools (pandoc, tesseract, etc.). Each import is guarded individually so that
# an ImportError only disables that one format; the matching HAS_* flag records
# the outcome and is checked later when the registry is populated.
# NOTE: only import-time failures are caught here.
try:
    from langchain_community.document_loaders import UnstructuredODTLoader
except ImportError:
    HAS_ODT_LOADER = False
    logger.debug("UnstructuredODTLoader not available - ODT support disabled")
else:
    HAS_ODT_LOADER = True

try:
    from langchain_community.document_loaders import UnstructuredEPubLoader
except ImportError:
    HAS_EPUB_LOADER = False
    logger.debug("UnstructuredEPubLoader not available - EPUB support disabled")
else:
    HAS_EPUB_LOADER = True

try:
    from langchain_community.document_loaders import UnstructuredRTFLoader
except ImportError:
    HAS_RTF_LOADER = False
    logger.debug("UnstructuredRTFLoader not available - RTF support disabled")
else:
    HAS_RTF_LOADER = True

try:
    from langchain_community.document_loaders import UnstructuredRSTLoader
except ImportError:
    HAS_RST_LOADER = False
    logger.debug("UnstructuredRSTLoader not available - RST support disabled")
else:
    HAS_RST_LOADER = True

try:
    from langchain_community.document_loaders import UnstructuredOrgModeLoader
except ImportError:
    HAS_ORG_LOADER = False
    logger.debug(
        "UnstructuredOrgModeLoader not available - Org support disabled"
    )
else:
    HAS_ORG_LOADER = True

try:
    from langchain_community.document_loaders import UnstructuredImageLoader
except ImportError:
    HAS_IMAGE_LOADER = False
    logger.debug(
        "UnstructuredImageLoader not available - Image/OCR support disabled"
    )
else:
    HAS_IMAGE_LOADER = True
84# Import our custom loaders
85from .json_loader import SimpleJSONLoader
86from .yaml_loader import YAMLLoader
# Base registry: lower-case, dot-prefixed extension -> loader description.
# Every entry is a dict with:
#   - "loader_class":  the LangChain loader class to instantiate
#   - "loader_kwargs": extra keyword arguments passed to that class
# NOTE(review): the original schema comment also listed "requires_path"
# (whether the loader needs a file path rather than bytes), but no entry in
# this module sets that key - confirm whether it is still part of the schema.
#
# Insertion order is kept deliberately: get_supported_extensions() returns the
# keys in this order. Each entry owns its own "loader_kwargs" dict.
LOADER_REGISTRY: dict = {
    # PDF
    ".pdf": {"loader_class": PyPDFLoader, "loader_kwargs": {}},
    # Plain text
    ".txt": {
        "loader_class": TextLoader,
        "loader_kwargs": {"encoding": "utf-8", "autodetect_encoding": True},
    },
    # Markdown
    **{
        ext: {"loader_class": UnstructuredMarkdownLoader, "loader_kwargs": {}}
        for ext in (".md", ".markdown")
    },
    # Word documents
    **{
        ext: {
            "loader_class": UnstructuredWordDocumentLoader,
            "loader_kwargs": {},
        }
        for ext in (".docx", ".doc")
    },
    # Spreadsheets
    ".csv": {"loader_class": CSVLoader, "loader_kwargs": {}},
    **{
        ext: {"loader_class": UnstructuredExcelLoader, "loader_kwargs": {}}
        for ext in (".xlsx", ".xls")
    },
    # HTML
    **{
        ext: {"loader_class": UnstructuredHTMLLoader, "loader_kwargs": {}}
        for ext in (".html", ".htm")
    },
}
# The statements below extend LOADER_REGISTRY in a fixed order. Loaders whose
# import was optional are only registered when their HAS_* flag is set.

# ODT (requires pypandoc)
if HAS_ODT_LOADER:
    LOADER_REGISTRY[".odt"] = {
        "loader_class": UnstructuredODTLoader,
        "loader_kwargs": {},
    }

# PowerPoint presentations
LOADER_REGISTRY.update(
    {
        ext: {
            "loader_class": UnstructuredPowerPointLoader,
            "loader_kwargs": {},
        }
        for ext in (".ppt", ".pptx")
    }
)

# EPUB (ebooks, technical manuals) - requires pandoc
if HAS_EPUB_LOADER:
    LOADER_REGISTRY[".epub"] = {
        "loader_class": UnstructuredEPubLoader,
        "loader_kwargs": {},
    }

# RTF (Rich Text Format) - requires pandoc
if HAS_RTF_LOADER:
    LOADER_REGISTRY[".rtf"] = {
        "loader_class": UnstructuredRTFLoader,
        "loader_kwargs": {},
    }

# XML (important for USPTO patent data)
LOADER_REGISTRY[".xml"] = {
    "loader_class": UnstructuredXMLLoader,
    "loader_kwargs": {},
}

# RST (reStructuredText) - requires pandoc
if HAS_RST_LOADER:
    LOADER_REGISTRY[".rst"] = {
        "loader_class": UnstructuredRSTLoader,
        "loader_kwargs": {},
    }

# Org-mode files - requires pandoc
if HAS_ORG_LOADER:
    LOADER_REGISTRY[".org"] = {
        "loader_class": UnstructuredOrgModeLoader,
        "loader_kwargs": {},
    }

LOADER_REGISTRY.update(
    {
        # Email files
        ".eml": {"loader_class": UnstructuredEmailLoader, "loader_kwargs": {}},
        # TSV (Tab-Separated Values) - CSVLoader with a tab delimiter
        ".tsv": {
            "loader_class": CSVLoader,
            "loader_kwargs": {"csv_args": {"delimiter": "\t"}},
        },
        # JSON via our custom SimpleJSONLoader (no jq dependency)
        ".json": {"loader_class": SimpleJSONLoader, "loader_kwargs": {}},
        # YAML via our custom YAMLLoader
        ".yaml": {"loader_class": YAMLLoader, "loader_kwargs": {}},
        ".yml": {"loader_class": YAMLLoader, "loader_kwargs": {}},
    }
)

# Images with OCR support (requires tesseract)
if HAS_IMAGE_LOADER:
    for ext in (".png", ".jpg", ".jpeg", ".tiff", ".tif", ".bmp", ".heic"):
        LOADER_REGISTRY[ext] = {
            "loader_class": UnstructuredImageLoader,
            "loader_kwargs": {},
        }

LOADER_REGISTRY.update(
    {
        # Jupyter notebooks (research code, data analysis)
        ".ipynb": {
            "loader_class": NotebookLoader,
            "loader_kwargs": {"include_outputs": True, "remove_newline": True},
        },
        # Evernote exports
        ".enex": {
            "loader_class": EverNoteLoader,
            "loader_kwargs": {"load_single_document": False},
        },
        # TOML config files
        ".toml": {"loader_class": TomlLoader, "loader_kwargs": {}},
        # MHTML web archives (saved web pages)
        ".mhtml": {"loader_class": MHTMLLoader, "loader_kwargs": {}},
        ".mht": {"loader_class": MHTMLLoader, "loader_kwargs": {}},
    }
)
def get_supported_extensions() -> list[str]:
    """Return the extensions currently registered in ``LOADER_REGISTRY``.

    Returns:
        A new list of the registry keys, in registration order.
    """
    # Iterating a dict yields its keys, so no explicit .keys() call is needed.
    return list(LOADER_REGISTRY)
def is_extension_supported(extension: str) -> bool:
    """Report whether a loader is registered for ``extension``.

    Args:
        extension: File extension, with or without the leading dot; matching
            is case-insensitive.

    Returns:
        True if the normalized extension is a key of ``LOADER_REGISTRY``.
    """
    normalized = extension.lower()
    # Registry keys are dot-prefixed, so add the dot when the caller omitted it.
    if not normalized.startswith("."):
        normalized = f".{normalized}"
    return normalized in LOADER_REGISTRY
def get_loader_for_path(file_path: str | Path) -> Optional[BaseLoader]:
    """
    Build a document loader for a file, chosen by the file's extension.

    Args:
        file_path: Location of the file to load, as a string or ``Path``

    Returns:
        A loader instance for the file, or None when the extension is not
        registered or the loader could not be constructed (both cases are
        logged rather than raised)
    """
    file_path = Path(file_path)
    extension = file_path.suffix.lower()

    resolved = get_loader_class_for_extension(extension)
    if resolved is None:
        logger.warning(
            f"Unsupported file extension: {extension} for {file_path}"
        )
        return None

    loader_cls, init_kwargs = resolved

    # Best-effort construction: any failure is logged with its traceback and
    # reported to the caller as None instead of propagating.
    try:
        loader = loader_cls(str(file_path), **init_kwargs)
    except Exception:
        logger.exception(f"Error creating loader for {file_path}")
        return None
    else:
        return loader
def get_loader_class_for_extension(
    extension: str,
) -> Optional[tuple[type, dict]]:
    """
    Get the loader class and kwargs for an extension.

    Args:
        extension: File extension (with or without leading dot); matching is
            case-insensitive

    Returns:
        Tuple of (loader_class, loader_kwargs) or None if not supported.
        ``loader_kwargs`` is a fresh shallow copy, so callers may add or
        override top-level keys without altering ``LOADER_REGISTRY``. Nested
        values (such as a ``csv_args`` dict) are still shared and must not be
        mutated in place.
    """
    ext = extension.lower()
    # Registry keys are dot-prefixed, so add the dot when the caller omitted it.
    if not ext.startswith("."):
        ext = f".{ext}"

    entry = LOADER_REGISTRY.get(ext)
    if entry is None:
        return None

    # Copy the kwargs instead of handing out the registry's own dict: the
    # registry is module-level shared state and a caller tweaking the returned
    # kwargs would otherwise change the defaults for every later lookup.
    # A missing or None "loader_kwargs" is treated as "no extra arguments".
    loader_kwargs = dict(entry.get("loader_kwargs") or {})
    return entry["loader_class"], loader_kwargs