Coverage for src/local_deep_research/document_loaders/loader_registry.py

1"""

2Document loader registry - maps file extensions to LangChain loaders.

4This module provides a centralized registry for document loaders that can be used

5by both collection uploads (bytes) and local search (file paths).

6"""

8import importlib.util

9import shutil

10from pathlib import Path

11from typing import Optional

13from langchain_community.document_loaders import (

14 CSVLoader,

15 EverNoteLoader,

16 MHTMLLoader,

17 NotebookLoader,

18 PyPDFLoader,

19 TextLoader,

20 TomlLoader,

21 UnstructuredEmailLoader,

22 UnstructuredExcelLoader,

23 UnstructuredHTMLLoader,

24 UnstructuredMarkdownLoader,

25 UnstructuredPowerPointLoader,

26 UnstructuredWordDocumentLoader,

27 UnstructuredXMLLoader,

28)

29from langchain_core.document_loaders import BaseLoader

30from loguru import logger

32# Import loaders that require external system tools (pandoc, tesseract, etc.)

33# These may fail at runtime if the tools aren't installed

34try:

35 from langchain_community.document_loaders import UnstructuredODTLoader

37 HAS_ODT_LOADER = True

38except ImportError:

39 HAS_ODT_LOADER = False

40 logger.debug("UnstructuredODTLoader not available - ODT support disabled")

42try:

43 from langchain_community.document_loaders import UnstructuredEPubLoader

45 HAS_EPUB_LOADER = True

46except ImportError:

47 HAS_EPUB_LOADER = False

48 logger.debug("UnstructuredEPubLoader not available - EPUB support disabled")

50try:

51 from langchain_community.document_loaders import UnstructuredRTFLoader

53 HAS_RTF_LOADER = True

54except ImportError:

55 HAS_RTF_LOADER = False

56 logger.debug("UnstructuredRTFLoader not available - RTF support disabled")

58try:

59 from langchain_community.document_loaders import UnstructuredRSTLoader

61 HAS_RST_LOADER = True

62except ImportError:

63 HAS_RST_LOADER = False

64 logger.debug("UnstructuredRSTLoader not available - RST support disabled")

66try:

67 from langchain_community.document_loaders import UnstructuredOrgModeLoader

69 HAS_ORG_LOADER = True

70except ImportError:

71 HAS_ORG_LOADER = False

72 logger.debug(

73 "UnstructuredOrgModeLoader not available - Org support disabled"

74 )

76try:

77 from langchain_community.document_loaders import UnstructuredImageLoader

79 HAS_IMAGE_LOADER = True

80except ImportError:

81 HAS_IMAGE_LOADER = False

82 logger.debug(

83 "UnstructuredImageLoader not available - Image/OCR support disabled"

84 )

86# Import our custom loaders

87from .json_loader import SimpleJSONLoader

88from .xls_loader import XLSLoader

89from .yaml_loader import YAMLLoader

92def _module_available(module_name: str) -> bool:

93 """Return True if *module_name* can be imported in this environment.

95 Uses ``importlib.util.find_spec`` so the (potentially heavy) module is

96 located but not actually imported.

97 """

98 try:

99 return importlib.util.find_spec(module_name) is not None

100 except (ImportError, ValueError, ModuleNotFoundError):

101 # A parent package may itself be missing, which surfaces as

102 # ModuleNotFoundError/ImportError from find_spec.

103 return False

104

105

# Whether a loader *class* can be imported is NOT sufficient to know whether a
# format actually works: the ``unstructured`` loaders import their real parser
# (and its third-party dependency) lazily inside ``.load()``. So a format like
# ODT can be advertised as supported, accept an upload, and only then fail with
# ``ModuleNotFoundError: No module named 'docx'`` (issue #4414).
#
# These flags probe the actual runtime dependency so the registry only
# advertises formats that will really extract text. Missing deps mean the
# format is omitted and the upload path returns a clear "Unsupported format"
# instead of a swallowed extraction failure.
HAS_DOCX_DEP = _module_available("docx")  # python-docx: .docx/.odt
HAS_PPTX_DEP = _module_available("pptx")  # python-pptx: .pptx
# Legacy OLE binary office formats (.doc/.ppt) are NOT read by python-docx/
# python-pptx; unstructured converts them to the modern format by shelling out
# to LibreOffice (``soffice``). Without that binary they fail at runtime with
# "soffice command was not found", so only advertise them when it is present.
HAS_LIBREOFFICE = bool(
    shutil.which("soffice") or shutil.which("libreoffice")
)  # .doc/.ppt conversion
# unstructured.partition.xlsx imports msoffcrypto at module load, so openpyxl
# alone is not enough to extract a modern .xlsx spreadsheet.
HAS_XLSX_DEP = _module_available("openpyxl") and _module_available(
    "msoffcrypto"
)  # openpyxl + msoffcrypto: .xlsx
# Legacy .xls is read by our own XLSLoader (pandas + xlrd), which avoids the
# msoffcrypto pre-check that crashes on some .xls files in the unstructured path.
HAS_XLS_DEP = _module_available("xlrd")  # xlrd: .xls
HAS_PANDOC_DEP = _module_available(
    "pypandoc"
)  # pandoc bridge: epub/rtf/rst/org
# Tesseract OCR bindings for image formats (the tesseract binary is also
# needed at runtime; it is not probed here). The ``unstructured.pytesseract``
# distribution is imported as ``unstructured_pytesseract``; probing the dotted
# name would instead search for a ``pytesseract`` submodule inside the
# ``unstructured`` package and import that package as a side effect.
HAS_OCR_DEP = _module_available("pytesseract") or _module_available(
    "unstructured_pytesseract"
)

139

140

# Extension -> loader mapping.
# Keys are lowercase file extensions including the leading dot. Each value is
# a dict with:
#   - loader_class: the loader class to instantiate for that extension
#   - loader_kwargs: extra keyword arguments passed to the loader constructor
# Optional formats are only added when their HAS_* flags are set, so the keys
# reflect what this environment can load. Insertion order is the order
# reported by get_supported_extensions().


def _entry(loader_class: type, **loader_kwargs) -> dict:
    """Return a registry record for *loader_class* and its constructor kwargs.

    ``loader_kwargs`` is a fresh dict on every call, so records never share
    a kwargs object.
    """
    return {"loader_class": loader_class, "loader_kwargs": loader_kwargs}


LOADER_REGISTRY: dict = {
    # PDF
    ".pdf": _entry(PyPDFLoader),
    # Plain text
    ".txt": _entry(TextLoader, encoding="utf-8", autodetect_encoding=True),
    # Markdown
    ".md": _entry(UnstructuredMarkdownLoader),
    ".markdown": _entry(UnstructuredMarkdownLoader),
    # CSV (Word/Excel are registered below, gated on their runtime deps)
    ".csv": _entry(CSVLoader),
    # HTML
    ".html": _entry(UnstructuredHTMLLoader),
    ".htm": _entry(UnstructuredHTMLLoader),
}

# Word documents - .docx needs python-docx; legacy .doc additionally needs
# LibreOffice to convert the OLE binary to .docx first.
if HAS_DOCX_DEP:
    LOADER_REGISTRY[".docx"] = _entry(UnstructuredWordDocumentLoader)
    if HAS_LIBREOFFICE:
        LOADER_REGISTRY[".doc"] = _entry(UnstructuredWordDocumentLoader)

# Modern spreadsheets - require openpyxl + msoffcrypto (unstructured path)
if HAS_XLSX_DEP:
    LOADER_REGISTRY[".xlsx"] = _entry(UnstructuredExcelLoader)

# Legacy spreadsheets - read directly via our XLSLoader (pandas + xlrd)
if HAS_XLS_DEP:
    LOADER_REGISTRY[".xls"] = _entry(XLSLoader)

# ODT - the unstructured ODT partitioner imports python-docx internally
if HAS_ODT_LOADER and HAS_DOCX_DEP:
    LOADER_REGISTRY[".odt"] = _entry(UnstructuredODTLoader)

# PowerPoint presentations - .pptx needs python-pptx; legacy .ppt additionally
# needs LibreOffice to convert the OLE binary to .pptx first.
if HAS_PPTX_DEP:
    LOADER_REGISTRY[".pptx"] = _entry(UnstructuredPowerPointLoader)
    if HAS_LIBREOFFICE:
        LOADER_REGISTRY[".ppt"] = _entry(UnstructuredPowerPointLoader)

# EPUB (ebooks, technical manuals) - requires pandoc
if HAS_EPUB_LOADER and HAS_PANDOC_DEP:
    LOADER_REGISTRY[".epub"] = _entry(UnstructuredEPubLoader)

# RTF (Rich Text Format) - requires pandoc
if HAS_RTF_LOADER and HAS_PANDOC_DEP:
    LOADER_REGISTRY[".rtf"] = _entry(UnstructuredRTFLoader)

# XML (important for USPTO patent data)
LOADER_REGISTRY[".xml"] = _entry(UnstructuredXMLLoader)

# RST (reStructuredText) - requires pandoc
if HAS_RST_LOADER and HAS_PANDOC_DEP:
    LOADER_REGISTRY[".rst"] = _entry(UnstructuredRSTLoader)

# Org-mode files - requires pandoc
if HAS_ORG_LOADER and HAS_PANDOC_DEP:
    LOADER_REGISTRY[".org"] = _entry(UnstructuredOrgModeLoader)

# Email files
LOADER_REGISTRY[".eml"] = _entry(UnstructuredEmailLoader)

# TSV (Tab-Separated Values) - use CSVLoader with tab delimiter
LOADER_REGISTRY[".tsv"] = _entry(CSVLoader, csv_args={"delimiter": "\t"})

# JSON loader using our custom SimpleJSONLoader (no jq dependency)
LOADER_REGISTRY[".json"] = _entry(SimpleJSONLoader)

# YAML loader using our custom YAMLLoader
LOADER_REGISTRY[".yaml"] = _entry(YAMLLoader)
LOADER_REGISTRY[".yml"] = _entry(YAMLLoader)

# Images with OCR support - require pytesseract (and the tesseract binary)
if HAS_IMAGE_LOADER and HAS_OCR_DEP:
    for ext in (".png", ".jpg", ".jpeg", ".tiff", ".tif", ".bmp", ".heic"):
        LOADER_REGISTRY[ext] = _entry(UnstructuredImageLoader)

# Jupyter notebooks (research code, data analysis)
LOADER_REGISTRY[".ipynb"] = _entry(
    NotebookLoader,
    include_outputs=True,
    remove_newline=True,
)

# Evernote exports
LOADER_REGISTRY[".enex"] = _entry(EverNoteLoader, load_single_document=False)

# TOML config files
LOADER_REGISTRY[".toml"] = _entry(TomlLoader)

# MHTML web archives (saved web pages)
LOADER_REGISTRY[".mhtml"] = _entry(MHTMLLoader)
LOADER_REGISTRY[".mht"] = _entry(MHTMLLoader)

329

330

def get_supported_extensions() -> list[str]:
    """Return every registered extension (with leading dot) as a new list.

    The order follows the registry's insertion order.
    """
    return [*LOADER_REGISTRY]

334

335

def is_extension_supported(extension: str) -> bool:
    """Report whether *extension* has a registered loader.

    Args:
        extension: File extension, with or without the leading dot; matching
            is case-insensitive.

    Returns:
        True if the normalized extension is a key of the loader registry.
    """
    lowered = extension.lower()
    # Normalize to the registry's key form: lowercase with one leading dot
    # prepended when the caller omitted it.
    key = lowered if extension.startswith(".") else "." + lowered
    return key in LOADER_REGISTRY

344

345

def get_loader_for_path(file_path: str | Path) -> Optional[BaseLoader]:
    """
    Build a document loader for a file, chosen by the file's extension.

    Args:
        file_path: Path to the file to load

    Returns:
        A loader instance constructed with the file path and the registered
        kwargs, or None if the extension is not registered or the loader
        constructor raises (the failure is logged in both cases).
    """
    file_path = Path(file_path)
    extension = file_path.suffix.lower()

    resolved = get_loader_class_for_extension(extension)
    if resolved is not None:
        factory, factory_kwargs = resolved
        try:
            return factory(str(file_path), **factory_kwargs)  # type: ignore[no-any-return]
        except Exception:
            # Best-effort boundary: log the traceback and report "no loader"
            # instead of propagating constructor errors to the caller.
            logger.exception(f"Error creating loader for {file_path}")
            return None

    logger.warning(
        f"Unsupported file extension: {extension} for {file_path}"
    )
    return None

373

374

def get_loader_class_for_extension(
    extension: str,
) -> Optional[tuple[type, dict]]:
    """
    Look up the loader class and constructor kwargs for an extension.

    Args:
        extension: File extension (with or without leading dot); matching is
            case-insensitive

    Returns:
        Tuple of (loader_class, loader_kwargs) or None if not supported
    """
    # Strip at most one leading dot and put it back, so "txt" and ".txt"
    # both become ".txt" while any other input keeps its shape.
    key = "." + extension.lower().removeprefix(".")

    try:
        record = LOADER_REGISTRY[key]
    except KeyError:
        return None

    return record["loader_class"], record.get("loader_kwargs", {})

Coverage for src/local_deep_research/document_loaders/loader_registry.py: 91%

119 statements