Coverage for src / local_deep_research / document_loaders / json_loader.py: 100%
59 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-25 01:07 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-25 01:07 +0000
1"""
2Custom JSON document loader.
4This loader reads JSON files and extracts all string values for text search.
5It doesn't require the jq package like LangChain's JSONLoader.
6"""
8import json
9from pathlib import Path
10from typing import Any, Iterator
12from langchain_core.document_loaders import BaseLoader
13from langchain_core.documents import Document
14from loguru import logger
def extract_strings_from_json(
    data: Any, path: str = ""
) -> list[tuple[str, str]]:
    """
    Extract every string value from decoded JSON data, depth-first.

    The walk is iterative (explicit stack) rather than recursive, so deeply
    nested documents do not hit Python's recursion limit, and all results are
    appended to a single list instead of being re-copied at each level.

    Args:
        data: Decoded JSON value (dict, list, str, number, bool or None).
        path: Path prefix for ``data`` itself (default: empty, i.e. the root).

    Returns:
        List of (path, value) tuples in document order. Dict keys are joined
        with ``.`` and list indices are written as ``[i]``, e.g.
        ``("items[1].name", "foo")``.
    """
    results: list[tuple[str, str]] = []
    # Each entry is (node, path of that node); the top of the stack is the
    # next node to visit.
    pending: list[tuple[Any, str]] = [(data, path)]

    while pending:
        node, node_path = pending.pop()

        if isinstance(node, str):
            results.append((node_path, node))
        elif isinstance(node, dict):
            children = [
                # str(key) keeps the path a string even when a caller passes
                # a plain Python dict with non-string keys.
                (value, f"{node_path}.{key}" if node_path else str(key))
                for key, value in node.items()
            ]
            # Push in reverse so the first child is popped first, which
            # preserves document order in the output.
            pending.extend(reversed(children))
        elif isinstance(node, list):
            children = [
                (item, f"{node_path}[{i}]") for i, item in enumerate(node)
            ]
            pending.extend(reversed(children))
        # Skip numbers, booleans, None - they're not useful for text search

    return results
class SimpleJSONLoader(BaseLoader):
    """
    Load a JSON file and extract all of its string values for text search.

    The JSON structure is walked with ``extract_strings_from_json`` and every
    non-empty string value becomes one line of the resulting document.
    Numbers, booleans and nulls are skipped.
    """

    def __init__(
        self,
        file_path: str | Path,
        encoding: str = "utf-8",
        include_paths: bool = True,
    ):
        """
        Initialize the JSON loader.

        Args:
            file_path: Path to the JSON file
            encoding: File encoding (default: utf-8)
            include_paths: If True, include JSON paths with values (default: True)
        """
        self.file_path = Path(file_path)
        self.encoding = encoding
        self.include_paths = include_paths

    def lazy_load(self) -> Iterator[Document]:
        """
        Lazily load documents from the JSON file.

        Yields:
            A single Document. For valid JSON its content is the extracted
            strings, one per line (prefixed with ``path: `` when
            ``include_paths`` is True). For invalid JSON its content is the
            raw file text and the metadata carries ``parse_error: True``.

        Raises:
            Exception: Any error other than a JSON decode error (for example
                a missing file or a decoding failure) is logged and re-raised.
        """
        try:
            with open(self.file_path, encoding=self.encoding) as f:
                content = f.read()

            try:
                # json.loads rejects text that starts with a byte-order mark,
                # which is what a BOM-prefixed file read as plain "utf-8"
                # produces. Strip it for parsing only; the raw fallback below
                # still yields the content exactly as read.
                data = json.loads(content.removeprefix("\ufeff"))
            except json.JSONDecodeError:
                logger.exception(f"Invalid JSON in {self.file_path}")
                # Still yield the raw content if parsing fails
                yield Document(
                    page_content=content,
                    metadata={
                        "source": str(self.file_path),
                        "parse_error": True,
                    },
                )
                return

            # Extract all string values
            string_pairs = extract_strings_from_json(data)

            if self.include_paths:
                # Format as "path: value" for each non-empty string
                lines = [
                    f"{path}: {value}" for path, value in string_pairs if value
                ]
            else:
                # Just the non-empty values
                lines = [value for _, value in string_pairs if value]

            text = "\n".join(lines)

            yield Document(
                page_content=text,
                metadata={
                    "source": str(self.file_path),
                    "file_type": "json",
                    # Counts every extracted string, including empty ones
                    # that are left out of the text above.
                    "string_count": len(string_pairs),
                },
            )

        except Exception:
            logger.exception(f"Error loading JSON file: {self.file_path}")
            raise

    def load(self) -> list[Document]:
        """Load all documents from the JSON file."""
        return list(self.lazy_load())
def extract_text_from_json(content: bytes, encoding: str = "utf-8") -> str:
    """
    Extract text from JSON bytes content.

    This is a convenience function for use with bytes (e.g., from file uploads).

    Args:
        content: JSON content as bytes
        encoding: Text encoding (default: utf-8)

    Returns:
        One ``path: value`` line per non-empty string in the JSON. If the
        content is not valid JSON, the decoded text is returned unchanged.
    """
    try:
        text = content.decode(encoding)
    except UnicodeDecodeError:
        # Best effort: drop the undecodable bytes rather than fail the upload.
        text = content.decode(encoding, errors="ignore")
        logger.warning(
            "JSON content had encoding errors, some characters may be lost"
        )

    try:
        # json.loads rejects text that starts with a byte-order mark, which
        # is what BOM-prefixed bytes decoded as plain "utf-8" produce. Strip
        # it for parsing only; the fallback still returns the text as decoded.
        data = json.loads(text.removeprefix("\ufeff"))
    except json.JSONDecodeError:
        logger.exception("Invalid JSON content")
        return text

    string_pairs = extract_strings_from_json(data)
    lines = [f"{path}: {value}" for path, value in string_pairs if value]
    return "\n".join(lines)