Coverage for src / local_deep_research / document_loaders / bytes_loader.py: 89%

47 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-02-25 01:07 +0000

1""" 

2Load documents from bytes content. 

3 

4This module provides functions to load documents from in-memory bytes, 

5which is useful for handling file uploads via HTTP. 

6""" 

7 

8import tempfile 

9from pathlib import Path 

10from typing import Optional 

11 

12from langchain_core.documents import Document 

13from loguru import logger 

14 

15from .loader_registry import ( 

16 get_loader_class_for_extension, 

17 is_extension_supported, 

18) 

19 

20 

def load_from_bytes(
    content: bytes,
    extension: str,
    filename: str = "upload",
    source_url: Optional[str] = None,
) -> list[Document]:
    """
    Load documents from bytes content.

    This function writes the bytes to a temporary file, uses the appropriate
    LangChain loader, then cleans up the temp file.

    Args:
        content: File content as bytes
        extension: File extension (with or without leading dot)
        filename: Original filename for metadata and log messages
        source_url: Optional source URL for metadata

    Returns:
        List of Document objects with extracted content

    Raises:
        ValueError: If the extension is not supported
    """
    # Normalize extension to a lowercase ".ext" form.
    ext = extension.lower()
    if not ext.startswith("."):
        ext = f".{ext}"

    # Check if extension is supported
    if not is_extension_supported(ext):
        raise ValueError(f"Unsupported file extension: {ext}")

    # Get the loader class for this extension
    loader_info = get_loader_class_for_extension(ext)
    if loader_info is None:
        raise ValueError(f"No loader found for extension: {ext}")

    loader_class, loader_kwargs = loader_info

    # Write content to a temp file because LangChain loaders expect a
    # file path, not in-memory bytes. delete=False so the file survives
    # the `with` block for the loader to read; cleanup happens in finally.
    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(
            suffix=ext, delete=False, prefix="ldr_upload_"
        ) as tmp:
            tmp.write(content)
            tmp_path = tmp.name

        # Create loader and load documents
        loader = loader_class(tmp_path, **loader_kwargs)
        documents = loader.load()

        # Attach upload provenance to every document's metadata.
        for doc in documents:
            doc.metadata["original_filename"] = filename
            if source_url:
                doc.metadata["source_url"] = source_url

        logger.info(
            f"Loaded {len(documents)} document(s) from {filename} ({ext})"
        )
        return documents

    except Exception:
        logger.exception(f"Error loading {filename} ({ext})")
        raise

    finally:
        # Clean up temp file; best-effort — a leftover temp file is not
        # worth masking the real result/exception.
        if tmp_path:
            try:
                Path(tmp_path).unlink(missing_ok=True)
            except Exception:
                logger.warning(f"Failed to clean up temp file: {tmp_path}")

98 

99 

def extract_text_from_bytes(
    content: bytes,
    extension: str,
    filename: str = "upload",
) -> Optional[str]:
    """
    Extract text from bytes content.

    This is a convenience function that loads documents and joins their
    content with blank lines.

    Args:
        content: File content as bytes
        extension: File extension (with or without leading dot)
        filename: Original filename for logging

    Returns:
        Extracted text as string, or None if extraction failed
    """
    try:
        documents = load_from_bytes(content, extension, filename)
        if documents:
            # Skip documents with empty page_content so we don't emit
            # stray blank separators.
            return "\n\n".join(
                doc.page_content for doc in documents if doc.page_content
            )
        return None
    except ValueError as e:
        # Unsupported extension is an expected condition — warn, not error.
        logger.warning(f"Unsupported format: {e}")
        return None
    except Exception:
        # Include the filename so failures are traceable in logs.
        logger.exception(f"Error extracting text from {filename}")
        return None