Coverage for src / local_deep_research / document_loaders / bytes_loader.py: 94%

52 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1""" 

2Load documents from bytes content. 

3 

4This module provides functions to load documents from in-memory bytes, 

5which is useful for handling file uploads via HTTP. 

6""" 

7 

8import tempfile 

9from pathlib import Path 

10from typing import Optional 

11 

12from langchain_core.documents import Document 

13from loguru import logger 

14 

15from local_deep_research.security.filename_sanitizer import sanitize_filename 

16 

17from .loader_registry import ( 

18 get_loader_class_for_extension, 

19 is_extension_supported, 

20) 

21 

22 

def load_from_bytes(
    content: bytes,
    extension: str,
    filename: str = "upload",
    source_url: Optional[str] = None,
) -> list[Document]:
    """
    Load documents from bytes content.

    This function writes the bytes to a temporary file, uses the appropriate
    LangChain loader, then cleans up the temp file.

    Args:
        content: File content as bytes
        extension: File extension (with or without leading dot)
        filename: Original filename for metadata and logging
        source_url: Optional source URL for metadata

    Returns:
        List of Document objects with extracted content

    Raises:
        ValueError: If the extension is not supported or has no loader
    """
    # Defense in depth: re-sanitize filename even though callers
    # should have sanitized already
    try:
        filename = sanitize_filename(filename)
    except Exception:
        filename = "upload"

    # Normalize extension to lowercase with a leading dot
    ext = (
        extension.lower()
        if extension.startswith(".")
        else f".{extension.lower()}"
    )

    # Check if extension is supported
    if not is_extension_supported(ext):
        raise ValueError(f"Unsupported file extension: {ext}")

    # Get the loader class for this extension
    loader_info = get_loader_class_for_extension(ext)
    if loader_info is None:
        raise ValueError(f"No loader found for extension: {ext}")

    loader_class, loader_kwargs = loader_info

    # Create temp file with the content; delete=False because the loader
    # reopens the file by path after the write handle is closed.
    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(
            suffix=ext, delete=False, prefix="ldr_upload_"
        ) as tmp:
            tmp.write(content)
            tmp_path = tmp.name

        # Create loader and load documents
        loader = loader_class(tmp_path, **loader_kwargs)
        documents = loader.load()

        # Add metadata to all documents
        for doc in documents:
            doc.metadata["original_filename"] = filename
            if source_url:
                doc.metadata["source_url"] = source_url

        # Bug fix: these log messages previously emitted the literal
        # text "(unknown)" instead of interpolating the filename.
        logger.info(
            f"Loaded {len(documents)} document(s) from {filename} ({ext})"
        )
        return documents  # type: ignore[no-any-return]

    except Exception:
        logger.exception(f"Error loading {filename} ({ext})")
        raise

    finally:
        # Clean up temp file (best-effort; never mask the original error)
        if tmp_path:
            try:
                Path(tmp_path).unlink(missing_ok=True)
            except Exception:
                logger.warning(f"Failed to clean up temp file: {tmp_path}")

107 

108 

def extract_text_from_bytes(
    content: bytes,
    extension: str,
    filename: str = "upload",
) -> Optional[str]:
    """
    Extract text from bytes content.

    This is a convenience function that loads documents via
    ``load_from_bytes`` and joins their non-empty page contents with
    blank lines.

    Args:
        content: File content as bytes
        extension: File extension (with or without leading dot)
        filename: Original filename for logging

    Returns:
        Extracted text as string, or None if extraction failed
    """
    try:
        documents = load_from_bytes(content, extension, filename)
        if documents:
            return "\n\n".join(
                doc.page_content for doc in documents if doc.page_content
            )
        return None
    except ValueError:
        # Raised by load_from_bytes for unsupported/unknown extensions;
        # include context so the warning is actionable.
        logger.warning(f"Unsupported format for {filename}: {extension}")
        return None
    except Exception:
        # Bug fix: the previous f-string had no placeholder and logged
        # the literal "(unknown)" instead of the actual filename.
        logger.exception(f"Error extracting text from {filename}")
        return None