Coverage for src / local_deep_research / document_loaders / json_loader.py: 100%

59 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-02-25 01:07 +0000

1""" 

2Custom JSON document loader. 

3 

4This loader reads JSON files and extracts all string values for text search. 

5It doesn't require the jq package like LangChain's JSONLoader. 

6""" 

7 

8import json 

9from pathlib import Path 

10from typing import Any, Iterator 

11 

12from langchain_core.document_loaders import BaseLoader 

13from langchain_core.documents import Document 

14from loguru import logger 

15 

16 

def extract_strings_from_json(
    data: Any, path: str = ""
) -> list[tuple[str, str]]:
    """
    Collect every string value found in decoded JSON data.

    The structure is walked depth-first, in document order, using an explicit
    stack instead of recursion. Nesting depth is therefore not bounded by the
    interpreter's recursion limit, and no intermediate result lists are built
    and copied at each level.

    Args:
        data: Decoded JSON value (dict, list, str, number, bool or None).
        path: Path prefix for ``data`` itself; empty for the document root.

    Returns:
        List of (path, value) tuples. Dict members are addressed as
        ``parent.key`` (just ``key`` when the parent path is empty) and list
        items as ``parent[index]``.
    """
    results: list[tuple[str, str]] = []
    # Work list of (node, path-of-node) pairs still to be visited.
    pending: list[tuple[Any, str]] = [(data, path)]

    while pending:
        node, node_path = pending.pop()

        if isinstance(node, str):
            results.append((node_path, node))
        elif isinstance(node, dict):
            children = [
                (value, f"{node_path}.{key}" if node_path else key)
                for key, value in node.items()
            ]
            # Pushed in reverse so the first member is popped (and emitted)
            # first, preserving document order in the output.
            pending.extend(reversed(children))
        elif isinstance(node, list):
            children = [
                (item, f"{node_path}[{index}]")
                for index, item in enumerate(node)
            ]
            pending.extend(reversed(children))
        # Numbers, booleans and None are not useful for text search; skip.

    return results

40 

41 

class SimpleJSONLoader(BaseLoader):
    """
    Load a JSON file as a single Document of searchable text.

    Every string value in the JSON structure is extracted recursively;
    numbers, booleans and nulls are skipped. Non-empty strings are joined
    one per line to form the document text.
    """

    def __init__(
        self,
        file_path: str | Path,
        encoding: str = "utf-8",
        include_paths: bool = True,
    ):
        """
        Initialize the JSON loader.

        Args:
            file_path: Path to the JSON file
            encoding: File encoding (default: utf-8)
            include_paths: If True, each line is "path: value"; otherwise
                only the value is emitted (default: True)
        """
        self.file_path = Path(file_path)
        self.encoding = encoding
        self.include_paths = include_paths

    def lazy_load(self) -> Iterator[Document]:
        """
        Lazily load the JSON file, yielding exactly one Document.

        Yields:
            On success, a Document whose text is the extracted strings and
            whose metadata holds ``source``, ``file_type`` and
            ``string_count``. ``string_count`` counts every extracted string,
            including empty ones that are omitted from the text.
            If the file is not valid JSON, a Document holding the raw file
            content with ``parse_error`` set in its metadata.

        Raises:
            Exception: Any error other than a JSON decode error (e.g. the
                file cannot be opened or decoded) is logged and re-raised.
        """
        try:
            with open(self.file_path, encoding=self.encoding) as f:
                content = f.read()

            try:
                # json.loads rejects text that begins with a byte order mark,
                # which the plain "utf-8" codec leaves in place. Strip it for
                # parsing only, so BOM-prefixed files are not misreported as
                # invalid JSON.
                data = json.loads(content.removeprefix("\ufeff"))
            except json.JSONDecodeError:
                logger.exception(f"Invalid JSON in {self.file_path}")
                # Still yield the raw content if parsing fails
                yield Document(
                    page_content=content,
                    metadata={
                        "source": str(self.file_path),
                        "parse_error": True,
                    },
                )
                return

            # Extract all string values
            string_pairs = extract_strings_from_json(data)

            if self.include_paths:
                # Format as "path: value" for each non-empty string
                lines = [
                    f"{path}: {value}" for path, value in string_pairs if value
                ]
            else:
                # Just the non-empty values
                lines = [value for _, value in string_pairs if value]

            text = "\n".join(lines)

            yield Document(
                page_content=text,
                metadata={
                    "source": str(self.file_path),
                    "file_type": "json",
                    "string_count": len(string_pairs),
                },
            )

        except Exception:
            logger.exception(f"Error loading JSON file: {self.file_path}")
            raise

    def load(self) -> list[Document]:
        """Load all documents from the JSON file as a list."""
        return list(self.lazy_load())

123 

124 

def extract_text_from_json(content: bytes, encoding: str = "utf-8") -> str:
    """
    Extract text from JSON bytes content.

    This is a convenience function for use with bytes (e.g., from file uploads).

    Args:
        content: JSON content as bytes
        encoding: Text encoding (default: utf-8)

    Returns:
        One "path: value" line per non-empty string in the JSON. If the
        content is not valid JSON, the decoded text is returned unchanged.
    """
    try:
        text = content.decode(encoding)
    except UnicodeDecodeError:
        # Best effort: drop undecodable bytes rather than fail.
        text = content.decode(encoding, errors="ignore")
        logger.warning(
            "JSON content had encoding errors, some characters may be lost"
        )

    try:
        # json.loads rejects text that begins with a byte order mark, which
        # the plain "utf-8" codec leaves in place. Strip it for parsing only,
        # so BOM-prefixed content is not treated as invalid JSON.
        data = json.loads(text.removeprefix("\ufeff"))
    except json.JSONDecodeError:
        logger.exception("Invalid JSON content")
        return text

    string_pairs = extract_strings_from_json(data)
    return "\n".join(
        f"{path}: {value}" for path, value in string_pairs if value
    )