Coverage for src / local_deep_research / document_loaders / loader_registry.py: 76%

94 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-02-25 01:07 +0000

1""" 

2Document loader registry - maps file extensions to LangChain loaders. 

3 

4This module provides a centralized registry for document loaders that can be used 

5by both collection uploads (bytes) and local search (file paths). 

6""" 

7 

8from pathlib import Path 

9from typing import Optional 

10 

11from langchain_community.document_loaders import ( 

12 CSVLoader, 

13 EverNoteLoader, 

14 MHTMLLoader, 

15 NotebookLoader, 

16 PyPDFLoader, 

17 TextLoader, 

18 TomlLoader, 

19 UnstructuredEmailLoader, 

20 UnstructuredExcelLoader, 

21 UnstructuredHTMLLoader, 

22 UnstructuredMarkdownLoader, 

23 UnstructuredPowerPointLoader, 

24 UnstructuredWordDocumentLoader, 

25 UnstructuredXMLLoader, 

26) 

27from langchain_core.document_loaders import BaseLoader 

28from loguru import logger 

29 

30# Import loaders that require external system tools (pandoc, tesseract, etc.) 

31# These may fail at runtime if the tools aren't installed 

32try: 

33 from langchain_community.document_loaders import UnstructuredODTLoader 

34 

35 HAS_ODT_LOADER = True 

36except ImportError: 

37 HAS_ODT_LOADER = False 

38 logger.debug("UnstructuredODTLoader not available - ODT support disabled") 

39 

40try: 

41 from langchain_community.document_loaders import UnstructuredEPubLoader 

42 

43 HAS_EPUB_LOADER = True 

44except ImportError: 

45 HAS_EPUB_LOADER = False 

46 logger.debug("UnstructuredEPubLoader not available - EPUB support disabled") 

47 

48try: 

49 from langchain_community.document_loaders import UnstructuredRTFLoader 

50 

51 HAS_RTF_LOADER = True 

52except ImportError: 

53 HAS_RTF_LOADER = False 

54 logger.debug("UnstructuredRTFLoader not available - RTF support disabled") 

55 

56try: 

57 from langchain_community.document_loaders import UnstructuredRSTLoader 

58 

59 HAS_RST_LOADER = True 

60except ImportError: 

61 HAS_RST_LOADER = False 

62 logger.debug("UnstructuredRSTLoader not available - RST support disabled") 

63 

64try: 

65 from langchain_community.document_loaders import UnstructuredOrgModeLoader 

66 

67 HAS_ORG_LOADER = True 

68except ImportError: 

69 HAS_ORG_LOADER = False 

70 logger.debug( 

71 "UnstructuredOrgModeLoader not available - Org support disabled" 

72 ) 

73 

74try: 

75 from langchain_community.document_loaders import UnstructuredImageLoader 

76 

77 HAS_IMAGE_LOADER = True 

78except ImportError: 

79 HAS_IMAGE_LOADER = False 

80 logger.debug( 

81 "UnstructuredImageLoader not available - Image/OCR support disabled" 

82 ) 

83 

84# Import our custom loaders 

85from .json_loader import SimpleJSONLoader 

86from .yaml_loader import YAMLLoader 

87 

88 

# Registry mapping a lowercase, dot-prefixed file extension to its loader.
# Every value is a dict with two keys:
#   - loader_class: the loader class to instantiate for that extension
#   - loader_kwargs: extra keyword arguments passed to the loader constructor
# Registration order matters: get_supported_extensions() returns the keys in
# the order they are inserted here.
LOADER_REGISTRY: dict = {
    # PDF
    ".pdf": {"loader_class": PyPDFLoader, "loader_kwargs": {}},
    # Plain text
    ".txt": {
        "loader_class": TextLoader,
        "loader_kwargs": {"encoding": "utf-8", "autodetect_encoding": True},
    },
    # Markdown
    ".md": {"loader_class": UnstructuredMarkdownLoader, "loader_kwargs": {}},
    ".markdown": {
        "loader_class": UnstructuredMarkdownLoader,
        "loader_kwargs": {},
    },
    # Word documents
    ".docx": {
        "loader_class": UnstructuredWordDocumentLoader,
        "loader_kwargs": {},
    },
    ".doc": {
        "loader_class": UnstructuredWordDocumentLoader,
        "loader_kwargs": {},
    },
    # Spreadsheets
    ".csv": {"loader_class": CSVLoader, "loader_kwargs": {}},
    ".xlsx": {"loader_class": UnstructuredExcelLoader, "loader_kwargs": {}},
    ".xls": {"loader_class": UnstructuredExcelLoader, "loader_kwargs": {}},
    # HTML
    ".html": {"loader_class": UnstructuredHTMLLoader, "loader_kwargs": {}},
    ".htm": {"loader_class": UnstructuredHTMLLoader, "loader_kwargs": {}},
}

# ODT (requires pypandoc) - registered only when the loader import succeeded
if HAS_ODT_LOADER:
    LOADER_REGISTRY[".odt"] = {
        "loader_class": UnstructuredODTLoader,
        "loader_kwargs": {},
    }

# PowerPoint presentations (legacy and Open XML formats share one loader)
LOADER_REGISTRY.update(
    {
        ".ppt": {
            "loader_class": UnstructuredPowerPointLoader,
            "loader_kwargs": {},
        },
        ".pptx": {
            "loader_class": UnstructuredPowerPointLoader,
            "loader_kwargs": {},
        },
    }
)

# EPUB (ebooks, technical manuals) - requires pandoc
if HAS_EPUB_LOADER:
    LOADER_REGISTRY[".epub"] = {
        "loader_class": UnstructuredEPubLoader,
        "loader_kwargs": {},
    }

# RTF (Rich Text Format) - requires pandoc
if HAS_RTF_LOADER:
    LOADER_REGISTRY[".rtf"] = {
        "loader_class": UnstructuredRTFLoader,
        "loader_kwargs": {},
    }

# XML (important for USPTO patent data)
LOADER_REGISTRY[".xml"] = {
    "loader_class": UnstructuredXMLLoader,
    "loader_kwargs": {},
}

# RST (reStructuredText) - requires pandoc
if HAS_RST_LOADER:
    LOADER_REGISTRY[".rst"] = {
        "loader_class": UnstructuredRSTLoader,
        "loader_kwargs": {},
    }

# Org-mode files - requires pandoc
if HAS_ORG_LOADER:
    LOADER_REGISTRY[".org"] = {
        "loader_class": UnstructuredOrgModeLoader,
        "loader_kwargs": {},
    }

# Email files
LOADER_REGISTRY[".eml"] = {
    "loader_class": UnstructuredEmailLoader,
    "loader_kwargs": {},
}

# TSV (Tab-Separated Values) - CSVLoader configured with a tab delimiter
LOADER_REGISTRY[".tsv"] = {
    "loader_class": CSVLoader,
    "loader_kwargs": {"csv_args": {"delimiter": "\t"}},
}

# JSON via the project's SimpleJSONLoader (no jq dependency)
LOADER_REGISTRY[".json"] = {
    "loader_class": SimpleJSONLoader,
    "loader_kwargs": {},
}

# YAML via the project's YAMLLoader (both common spellings)
LOADER_REGISTRY.update(
    {
        ".yaml": {"loader_class": YAMLLoader, "loader_kwargs": {}},
        ".yml": {"loader_class": YAMLLoader, "loader_kwargs": {}},
    }
)

# Images with OCR support (requires tesseract)
if HAS_IMAGE_LOADER:
    for ext in (".png", ".jpg", ".jpeg", ".tiff", ".tif", ".bmp", ".heic"):
        # A fresh entry (and kwargs dict) per extension, so no two
        # extensions share mutable state.
        LOADER_REGISTRY[ext] = {
            "loader_class": UnstructuredImageLoader,
            "loader_kwargs": {},
        }

# Jupyter notebooks (research code, data analysis)
LOADER_REGISTRY[".ipynb"] = {
    "loader_class": NotebookLoader,
    "loader_kwargs": {"include_outputs": True, "remove_newline": True},
}

# Evernote exports
LOADER_REGISTRY[".enex"] = {
    "loader_class": EverNoteLoader,
    "loader_kwargs": {"load_single_document": False},
}

# TOML config files
LOADER_REGISTRY[".toml"] = {
    "loader_class": TomlLoader,
    "loader_kwargs": {},
}

# MHTML web archives (saved web pages), both common spellings
LOADER_REGISTRY.update(
    {
        ".mhtml": {"loader_class": MHTMLLoader, "loader_kwargs": {}},
        ".mht": {"loader_class": MHTMLLoader, "loader_kwargs": {}},
    }
)

264 

265 

def get_supported_extensions() -> list[str]:
    """Return every registered extension, in registration order."""
    # Iterating a dict yields its keys, so no explicit .keys() call is needed.
    return [*LOADER_REGISTRY]

269 

270 

def is_extension_supported(extension: str) -> bool:
    """Report whether a loader is registered for the given extension.

    Matching is case-insensitive and the leading dot is optional.
    """
    lowered = extension.lower()
    # Registry keys are dot-prefixed; add the dot when the caller omitted it.
    key = lowered if lowered.startswith(".") else f".{lowered}"
    return key in LOADER_REGISTRY

279 

280 

def get_loader_for_path(file_path: str | Path) -> Optional[BaseLoader]:
    """
    Build a document loader for a file, selected by the file's extension.

    Args:
        file_path: Location of the file to load

    Returns:
        A loader instance constructed with the path (as a string) and the
        registered kwargs, or None when the extension has no registered
        loader or the loader constructor raised (both cases are logged).
    """
    file_path = Path(file_path)
    extension = file_path.suffix.lower()

    resolved = get_loader_class_for_extension(extension)
    if resolved is not None:
        loader_cls, kwargs = resolved
        try:
            return loader_cls(str(file_path), **kwargs)
        except Exception:
            # Best-effort: record the traceback and signal failure with None
            # rather than propagating the constructor error.
            logger.exception(f"Error creating loader for {file_path}")
            return None

    logger.warning(f"Unsupported file extension: {extension} for {file_path}")
    return None

308 

309 

def get_loader_class_for_extension(
    extension: str,
) -> Optional[tuple[type, dict]]:
    """
    Get the loader class and kwargs for an extension.

    The extension is matched case-insensitively, and a leading dot is added
    when missing, so "PDF", "pdf" and ".pdf" all resolve to the same entry.

    Args:
        extension: File extension (with or without leading dot)

    Returns:
        Tuple of (loader_class, loader_kwargs) or None if not supported.
        loader_kwargs is an independent copy of the registered kwargs, so
        callers may modify it without altering the registry.
    """
    from copy import deepcopy

    ext = extension.lower()
    if not ext.startswith("."):
        ext = f".{ext}"

    entry = LOADER_REGISTRY.get(ext)
    if entry is None:
        return None

    # Deep-copy so that neither the kwargs dict nor nested values such as the
    # ".tsv" entry's "csv_args" are shared with the module-level registry.
    return entry["loader_class"], deepcopy(entry.get("loader_kwargs", {}))