Coverage for src/local_deep_research/document_loaders/bytes_loader.py: 89%
47 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-25 01:07 +0000
1"""
2Load documents from bytes content.
4This module provides functions to load documents from in-memory bytes,
5which is useful for handling file uploads via HTTP.
6"""
8import tempfile
9from pathlib import Path
10from typing import Optional
12from langchain_core.documents import Document
13from loguru import logger
15from .loader_registry import (
16 get_loader_class_for_extension,
17 is_extension_supported,
18)
def load_from_bytes(
    content: bytes,
    extension: str,
    filename: str = "upload",
    source_url: Optional[str] = None,
) -> list[Document]:
    """
    Load documents from bytes content.

    This function writes the bytes to a temporary file, uses the appropriate
    LangChain loader, then cleans up the temp file.

    Args:
        content: File content as bytes
        extension: File extension (with or without leading dot)
        filename: Original filename for metadata and log messages
        source_url: Optional source URL for metadata

    Returns:
        List of Document objects with extracted content

    Raises:
        ValueError: If the extension is not supported
    """
    # Normalize extension: lowercase, with exactly one leading dot.
    ext = (
        extension.lower()
        if extension.startswith(".")
        else f".{extension.lower()}"
    )

    # Reject unsupported extensions up front with a clear error.
    if not is_extension_supported(ext):
        raise ValueError(f"Unsupported file extension: {ext}")

    # Resolve the loader; defensive check in case the registry and
    # is_extension_supported() ever disagree.
    loader_info = get_loader_class_for_extension(ext)
    if loader_info is None:
        raise ValueError(f"No loader found for extension: {ext}")

    loader_class, loader_kwargs = loader_info

    # Write the bytes to a temp file because LangChain loaders expect a
    # filesystem path. delete=False so the file survives past the `with`
    # block for the loader to read; we remove it ourselves in `finally`.
    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(
            suffix=ext, delete=False, prefix="ldr_upload_"
        ) as tmp:
            tmp.write(content)
            tmp_path = tmp.name

        # Create loader and load documents
        loader = loader_class(tmp_path, **loader_kwargs)
        documents = loader.load()

        # Attach upload provenance to every extracted document.
        for doc in documents:
            doc.metadata["original_filename"] = filename
            if source_url:
                doc.metadata["source_url"] = source_url

        # Log the original filename so the upload is identifiable in logs.
        logger.info(
            f"Loaded {len(documents)} document(s) from {filename} ({ext})"
        )
        return documents

    except Exception:
        logger.exception(f"Error loading {filename} ({ext})")
        raise

    finally:
        # Clean up temp file. missing_ok=True covers the case where the
        # file was never created or was already removed.
        if tmp_path:
            try:
                Path(tmp_path).unlink(missing_ok=True)
            except Exception:
                logger.warning(f"Failed to clean up temp file: {tmp_path}")
def extract_text_from_bytes(
    content: bytes,
    extension: str,
    filename: str = "upload",
) -> Optional[str]:
    """
    Extract text from bytes content.

    This is a convenience function that loads documents and joins their content.

    Args:
        content: File content as bytes
        extension: File extension (with or without leading dot)
        filename: Original filename for logging

    Returns:
        Extracted text as string, or None if extraction failed
    """
    try:
        documents = load_from_bytes(content, extension, filename)
        if documents:
            # Join non-empty page contents, separated by blank lines.
            return "\n\n".join(
                doc.page_content for doc in documents if doc.page_content
            )
        return None
    except ValueError as e:
        # Unsupported extension is an expected condition: warn, don't raise.
        logger.warning(f"Unsupported format: {e}")
        return None
    except Exception:
        # Include the filename so failures are traceable; the original
        # f-string had no placeholder at all.
        logger.exception(f"Error extracting text from {filename}")
        return None