Coverage for src/local_deep_research/document_loaders/json_loader.py

1"""

2Custom JSON document loader.

4This loader reads JSON files and extracts all string values for text search.

5It doesn't require the jq package like LangChain's JSONLoader.

6"""

8import json

9from pathlib import Path

10from typing import Any, Iterator

12from langchain_core.document_loaders import BaseLoader

13from langchain_core.documents import Document

14from loguru import logger

def extract_strings_from_json(
    data: Any, path: str = ""
) -> list[tuple[str, str]]:
    """
    Collect every string value found in JSON-like data.

    The structure is walked depth-first in document order using an explicit
    stack instead of recursion, so nesting depth is not bounded by the
    interpreter's recursion limit and partial result lists are never copied
    from level to level.

    Args:
        data: Parsed JSON value (dict, list, str, number, bool or None).
            The structure is assumed to be acyclic, as produced by
            ``json.loads``; a container that contains itself would never
            finish being walked.
        path: JSON path prefix for ``data`` (default: empty, i.e. the root)

    Returns:
        List of (path, value) tuples where path shows the JSON path of each
        string, e.g. ``("items[2].name", "foo")``.
    """
    results: list[tuple[str, str]] = []
    pending: list[tuple[Any, str]] = [(data, path)]

    while pending:
        node, node_path = pending.pop()

        if isinstance(node, str):
            results.append((node_path, node))
        elif isinstance(node, dict):
            children = [
                (value, f"{node_path}.{key}" if node_path else key)
                for key, value in node.items()
            ]
            # Push in reverse so the first child is popped (visited) first
            pending.extend(reversed(children))
        elif isinstance(node, list):
            children = [
                (item, f"{node_path}[{i}]") for i, item in enumerate(node)
            ]
            pending.extend(reversed(children))
        # Skip numbers, booleans, None - they're not useful for text search

    return results

class SimpleJSONLoader(BaseLoader):
    """
    Document loader that flattens a JSON file into searchable text.

    Every string value found anywhere in the parsed JSON structure is
    collected; numbers, booleans and nulls are left out.
    """

    def __init__(
        self,
        file_path: str | Path,
        encoding: str = "utf-8",
        include_paths: bool = True,
    ):
        """
        Set up the loader.

        Args:
            file_path: Location of the JSON file to read
            encoding: Encoding used to open the file (default: utf-8)
            include_paths: When True, each value is prefixed with its JSON
                path; when False only the values are emitted (default: True)
        """
        self.file_path = Path(file_path)
        self.encoding = encoding
        self.include_paths = include_paths

    def lazy_load(self) -> Iterator[Document]:
        """
        Read the JSON file and produce its text content on demand.

        Yields:
            One Document. For valid JSON its content is the extracted
            strings joined by newlines; for invalid JSON it is the raw file
            text, with ``parse_error`` set in the metadata.

        Raises:
            Exception: Any error raised while reading or processing the
                file is logged and then re-raised unchanged.
        """
        try:
            with open(self.file_path, encoding=self.encoding) as handle:
                raw_text = handle.read()

            try:
                parsed = json.loads(raw_text)
            except json.JSONDecodeError:
                logger.exception(f"Invalid JSON in {self.file_path}")
                # Parsing failed: hand back the unparsed file text instead
                fallback_metadata = {
                    "source": str(self.file_path),
                    "parse_error": True,
                }
                yield Document(
                    page_content=raw_text, metadata=fallback_metadata
                )
                return

            pairs = extract_strings_from_json(parsed)

            # Empty strings are left out of the text, but "string_count"
            # below still counts every extracted pair.
            if self.include_paths:
                rendered = "\n".join(
                    f"{json_path}: {string}"
                    for json_path, string in pairs
                    if string
                )
            else:
                rendered = "\n".join(string for _, string in pairs if string)

            document_metadata = {
                "source": str(self.file_path),
                "file_type": "json",
                "string_count": len(pairs),
            }
            yield Document(page_content=rendered, metadata=document_metadata)

        except Exception:
            logger.exception(f"Error loading JSON file: {self.file_path}")
            raise

    def load(self) -> list[Document]:
        """Eagerly collect everything ``lazy_load`` yields into a list."""
        return [*self.lazy_load()]

123

124

def extract_text_from_json(content: bytes, encoding: str = "utf-8") -> str:
    """
    Turn raw JSON bytes into newline-separated "path: value" text.

    Convenience wrapper for callers that hold bytes rather than a file
    path (e.g., from file uploads).

    Args:
        content: JSON content as bytes
        encoding: Text encoding used to decode the bytes (default: utf-8)

    Returns:
        One "path: value" line per non-empty string in the JSON. If the
        content is not valid JSON, the decoded text is returned as-is.
    """
    try:
        decoded = content.decode(encoding)
    except UnicodeDecodeError:
        # Retry leniently: undecodable bytes are dropped rather than failing
        decoded = content.decode(encoding, errors="ignore")
        logger.warning(
            "JSON content had encoding errors, some characters may be lost"
        )

    try:
        parsed = json.loads(decoded)
    except json.JSONDecodeError:
        logger.exception("Invalid JSON content")
        return decoded

    return "\n".join(
        f"{json_path}: {string}"
        for json_path, string in extract_strings_from_json(parsed)
        if string
    )

Coverage for src/local_deep_research/document_loaders/json_loader.py: 100%

59 statements