Coverage for src / local_deep_research / document_loaders / bytes_loader.py: 94%
52 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
1"""
2Load documents from bytes content.
4This module provides functions to load documents from in-memory bytes,
5which is useful for handling file uploads via HTTP.
6"""
8import tempfile
9from pathlib import Path
10from typing import Optional
12from langchain_core.documents import Document
13from loguru import logger
15from local_deep_research.security.filename_sanitizer import sanitize_filename
17from .loader_registry import (
18 get_loader_class_for_extension,
19 is_extension_supported,
20)
def load_from_bytes(
    content: bytes,
    extension: str,
    filename: str = "upload",
    source_url: Optional[str] = None,
) -> list[Document]:
    """
    Load documents from bytes content.

    This function writes the bytes to a temporary file, uses the appropriate
    LangChain loader from the registry, then cleans up the temp file.

    Args:
        content: File content as bytes.
        extension: File extension (with or without leading dot).
        filename: Original filename, stored in document metadata and logs.
        source_url: Optional source URL, stored in document metadata.

    Returns:
        List of Document objects with extracted content.

    Raises:
        ValueError: If the extension is not supported or no loader is
            registered for it.
    """
    # Defense in depth: re-sanitize filename even though callers
    # should have sanitized already.
    try:
        filename = sanitize_filename(filename)
    except Exception:
        filename = "upload"

    # Normalize extension to a lowercase, dot-prefixed form.
    ext = (
        extension.lower()
        if extension.startswith(".")
        else f".{extension.lower()}"
    )

    # Check if extension is supported.
    if not is_extension_supported(ext):
        raise ValueError(f"Unsupported file extension: {ext}")

    # Get the loader class for this extension.
    loader_info = get_loader_class_for_extension(ext)
    if loader_info is None:
        raise ValueError(f"No loader found for extension: {ext}")

    loader_class, loader_kwargs = loader_info

    # Write the bytes to a temp file because LangChain loaders take a path.
    # delete=False so the loader can reopen it; cleanup happens in finally.
    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(
            suffix=ext, delete=False, prefix="ldr_upload_"
        ) as tmp:
            tmp.write(content)
            tmp_path = tmp.name

        # Create loader and load documents.
        loader = loader_class(tmp_path, **loader_kwargs)
        documents = loader.load()

        # Attach upload metadata to all extracted documents.
        for doc in documents:
            doc.metadata["original_filename"] = filename
            if source_url:
                doc.metadata["source_url"] = source_url

        # Log the (sanitized) filename so failures/successes are traceable.
        logger.info(
            f"Loaded {len(documents)} document(s) from {filename} ({ext})"
        )
        return documents  # type: ignore[no-any-return]

    except Exception:
        logger.exception(f"Error loading {filename} ({ext})")
        raise

    finally:
        # Best-effort cleanup of the temp file; never mask the real error.
        if tmp_path:
            try:
                Path(tmp_path).unlink(missing_ok=True)
            except Exception:
                logger.warning(f"Failed to clean up temp file: {tmp_path}")
def extract_text_from_bytes(
    content: bytes,
    extension: str,
    filename: str = "upload",
) -> Optional[str]:
    """
    Extract text from bytes content.

    This is a convenience function that loads documents via
    load_from_bytes() and joins their page content with blank lines.

    Args:
        content: File content as bytes.
        extension: File extension (with or without leading dot).
        filename: Original filename, used for logging.

    Returns:
        Extracted text as a string, or None if extraction failed or
        yielded no documents.
    """
    try:
        documents = load_from_bytes(content, extension, filename)
        if not documents:
            return None
        # Skip documents with empty page_content so we don't join blanks.
        return "\n\n".join(
            doc.page_content for doc in documents if doc.page_content
        )
    except ValueError:
        # Unsupported/unregistered extension — expected case, warn only.
        logger.warning("Unsupported format")
        return None
    except Exception:
        # Interpolate the filename (previously a placeholder-free f-string)
        # so the failing upload is identifiable in logs.
        logger.exception(f"Error extracting text from {filename}")
        return None