Coverage for src / local_deep_research / document_loaders / bytes_loader.py: 94%

52 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1""" 

2Load documents from bytes content. 

3 

4This module provides functions to load documents from in-memory bytes, 

5which is useful for handling file uploads via HTTP. 

6""" 

7 

8import tempfile 

9from pathlib import Path 

10from typing import Optional 

11 

12from langchain_core.documents import Document 

13from loguru import logger 

14 

15from local_deep_research.security.filename_sanitizer import sanitize_filename 

16 

17from .loader_registry import ( 

18 get_loader_class_for_extension, 

19 is_extension_supported, 

20) 

21 

22 

def load_from_bytes(
    content: bytes,
    extension: str,
    filename: str = "upload",
    source_url: Optional[str] = None,
) -> list[Document]:
    """
    Load documents from bytes content.

    This function writes the bytes to a temporary file, uses the appropriate
    LangChain loader, then cleans up the temp file.

    Args:
        content: File content as bytes
        extension: File extension (with or without leading dot)
        filename: Original filename for metadata and logging
        source_url: Optional source URL for metadata

    Returns:
        List of Document objects with extracted content

    Raises:
        ValueError: If the extension is not supported or has no loader
    """
    # Defense in depth: re-sanitize filename even though callers
    # should have sanitized already
    try:
        filename = sanitize_filename(filename)
    except Exception:
        filename = "upload"

    # Normalize extension to lowercase with a leading dot
    ext = (
        extension.lower()
        if extension.startswith(".")
        else f".{extension.lower()}"
    )

    # Check if extension is supported
    if not is_extension_supported(ext):
        raise ValueError(f"Unsupported file extension: {ext}")

    # Get the loader class for this extension
    loader_info = get_loader_class_for_extension(ext)
    if loader_info is None:
        raise ValueError(f"No loader found for extension: {ext}")

    loader_class, loader_kwargs = loader_info

    # Create temp file with the content; delete=False because the loader
    # reopens the file by path after the write handle is closed.
    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(
            suffix=ext, delete=False, prefix="ldr_upload_"
        ) as tmp:
            tmp.write(content)
            tmp_path = tmp.name

        # Create loader and load documents
        loader = loader_class(tmp_path, **loader_kwargs)
        documents = loader.load()

        # Add metadata to all documents
        for doc in documents:
            doc.metadata["original_filename"] = filename
            if source_url:
                doc.metadata["source_url"] = source_url

        # Bug fix: these log messages previously emitted the literal
        # text "(unknown)" instead of interpolating the filename.
        logger.info(
            f"Loaded {len(documents)} document(s) from {filename} ({ext})"
        )
        return documents  # type: ignore[no-any-return]

    except Exception:
        logger.exception(f"Error loading {filename} ({ext})")
        raise

    finally:
        # Clean up temp file (best-effort; never mask the original error)
        if tmp_path:
            try:
                Path(tmp_path).unlink(missing_ok=True)
            except Exception:
                logger.warning(f"Failed to clean up temp file: {tmp_path}")

107 

108 

def extract_text_from_bytes(
    content: bytes,
    extension: str,
    filename: str = "upload",
) -> Optional[str]:
    """
    Extract text from bytes content.

    This is a convenience function that loads documents via
    ``load_from_bytes`` and joins their non-empty page contents with
    blank lines.

    Args:
        content: File content as bytes
        extension: File extension (with or without leading dot)
        filename: Original filename for logging

    Returns:
        Extracted text as string, or None if extraction failed
    """
    try:
        documents = load_from_bytes(content, extension, filename)
        if documents:
            return "\n\n".join(
                doc.page_content for doc in documents if doc.page_content
            )
        return None
    except ValueError:
        # Raised by load_from_bytes for unsupported/unknown extensions;
        # include context so the warning is actionable.
        logger.warning(f"Unsupported format for {filename}: {extension}")
        return None
    except Exception:
        # Bug fix: the previous f-string had no placeholder and logged
        # the literal "(unknown)" instead of the actual filename.
        logger.exception(f"Error extracting text from {filename}")
        return None