Coverage for src / local_deep_research / utilities / json_utils.py: 97%
81 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-25 01:07 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-25 01:07 +0000
1"""
2Centralized utilities for extracting and parsing JSON from LLM responses.
4Provides two public functions:
5- get_llm_response_text: Extract text from LLM response objects
6- extract_json: Parse JSON from LLM-generated text with robust cleaning
7"""
9import json
10import re
11from typing import Optional, Type, Union
13from loguru import logger
def get_llm_response_text(response) -> str:
    """Return the textual payload of an LLM response with think tags stripped.

    The payload is looked up in this order: the ``content`` attribute, then
    the ``text`` attribute, then ``str(response)``. An attribute that is
    missing or ``None`` is skipped. Non-string payloads are converted with
    ``str()`` before cleaning.

    Args:
        response: LLM response object, string, or None.

    Returns:
        The extracted text with ``<think>`` tags removed, or an empty
        string when ``response`` is None.
    """
    if response is None:
        return ""

    # First non-None attribute wins; fall back to the object's str() form.
    for attribute in ("content", "text"):
        payload = getattr(response, attribute, None)
        if payload is not None:
            break
    else:
        payload = str(response)

    as_text = payload if isinstance(payload, str) else str(payload)
    return _remove_think_tags(as_text)
def extract_json(
    text: str,
    expected_type: Optional[Type] = None,
) -> Optional[Union[dict, list]]:
    """Pull a JSON object or array out of LLM-generated text.

    The text is stripped, unwrapped from markdown code fences and cleared
    of think tags. It is then parsed as a whole; if that fails (or yields
    the wrong type) the outermost bracketed substring is tried, first
    verbatim and then after cleaning common LLM artifacts.

    Args:
        text: Raw text potentially containing JSON.
        expected_type: Expected JSON type (dict or list). When given,
            the matching bracket style is tried first and results of
            any other type are rejected. None accepts either type.

    Returns:
        The parsed dict or list, or None if no valid JSON was found.
    """

    def _parse(candidate: str) -> Optional[Union[dict, list]]:
        # Parse one candidate; None means "not usable" (invalid JSON,
        # a scalar, or a container of the wrong type).
        try:
            parsed = json.loads(candidate)
        except (json.JSONDecodeError, ValueError):
            return None
        if not isinstance(parsed, (dict, list)):
            return None
        if expected_type is not None and not isinstance(parsed, expected_type):
            return None
        return parsed

    if not text or not text.strip():
        return None

    prepared = _remove_think_tags(_strip_code_fences(text.strip()))

    # Step 1: the whole text may already be valid JSON.
    whole = _parse(prepared)
    if whole is not None:
        return whole

    # Step 2: bracket extraction, trying the expected container first.
    delimiters = [("{", "}"), ("[", "]")]
    if expected_type is list:
        delimiters.reverse()

    for opener, closer in delimiters:
        snippet = _extract_by_brackets(prepared, opener, closer)
        if snippet is None:
            continue

        found = _parse(snippet)
        if found is not None:
            return found

        # Retry only if cleaning actually changed something.
        repaired = _clean_llm_json_artifacts(snippet)
        if repaired != snippet:
            found = _parse(repaired)
            if found is not None:
                return found

    logger.debug("No valid JSON found in text")
    return None
115# ---------------------------------------------------------------------------
116# Private helpers
117# ---------------------------------------------------------------------------
def _remove_think_tags(text: str) -> str:
    """Strip ``<think>`` reasoning markup from text.

    Complete ``<think>...</think>`` blocks are removed together with their
    content (across newlines). Any unpaired closing or opening tag left
    behind is then removed on its own, and the result is trimmed of
    surrounding whitespace.

    Duplicated from search_utilities to keep this module free of
    internal package dependencies (follows type_utils.py convention).
    """
    remaining = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
    # Orphaned tags: closers first, then openers.
    for stray_tag in ("</think>", "<think>"):
        remaining = remaining.replace(stray_tag, "")
    return remaining.strip()
def _strip_code_fences(text: str) -> str:
    """Return the content of the first markdown code fence, if any.

    A ```` ```json ```` fence takes priority: everything after its first
    occurrence up to the next ```` ``` ```` (or end of text) is returned.
    Otherwise a generic fence is unwrapped only when both an opening and
    a closing ```` ``` ```` are present. Text without a usable fence is
    returned unchanged. Fences may appear mid-text with prose around them.
    """
    _, json_fence, after_json = text.partition("```json")
    if json_fence:
        fenced, _, _ = after_json.partition("```")
        return fenced.strip()

    _, opening, after_open = text.partition("```")
    if opening:
        fenced, closing, _ = after_open.partition("```")
        if closing:
            return fenced.strip()

    return text
def _extract_by_brackets(
    text: str, open_char: str, close_char: str
) -> Optional[str]:
    """Return the span from the first opener to the last closer, inclusive.

    The first occurrence of ``open_char`` and the last occurrence of
    ``close_char`` delimit the result; nesting is not analysed. None is
    returned when the opener is absent or no closer follows it.
    """
    first_open = text.find(open_char)
    if first_open < 0:
        return None

    last_close = text.rfind(close_char)
    if last_close <= first_open:
        return None

    return text[first_open : last_close + 1]
# A complete JSON string literal, honouring backslash escapes.
_JSON_STRING_LITERAL = r'"(?:\\.|[^"\\])*"'

# String literal (group 1, preserved) or a // line comment (dropped).
_LINE_COMMENT_RE = re.compile("(" + _JSON_STRING_LITERAL + r")|//[^\n]*")

# Alternatives are tried in order at each position:
#   keep        - "..." used as an object value or key: real data, preserved
#   placeholder - a bare ... or a standalone "..." entry, with the commas
#                 on either side captured so one separator can be kept
#   literal     - any other string literal, preserved verbatim
_ELLIPSIS_RE = re.compile(
    r'(?P<keep>:\s*"\.{3,}"|"\.{3,}"\s*:)'
    r'|(?P<lead>,)?\s*(?P<placeholder>"\.{3,}"(?!\s*:)|\.{3,})\s*(?P<trail>,)?'
    r"|(?P<literal>" + _JSON_STRING_LITERAL + ")"
)

# String literal (group 1, preserved) or a comma directly before a closer.
_TRAILING_COMMA_RE = re.compile(
    "(" + _JSON_STRING_LITERAL + r")|,\s*([}\]])"
)


def _drop_ellipsis(match: "re.Match") -> str:
    """Replacement callback for ``_ELLIPSIS_RE``."""
    if match.group("placeholder") is None:
        # String literal or "..." key/value: leave untouched.
        return match.group(0)
    # A placeholder between two entries must leave one comma behind,
    # otherwise "[1, ..., 2]" would collapse to the invalid "[1 2]".
    if match.group("lead") and match.group("trail"):
        return ","
    return ""


def _clean_llm_json_artifacts(text: str) -> str:
    """Clean common LLM JSON artifacts from malformed JSON text.

    Intended as a best-effort repair for text that ``json.loads`` has
    already rejected. Every pass skips over complete string literals, so
    values such as URLs (``"http://..."``) or prose ending in ``...`` are
    not damaged.

    Handles, in this order:
    - Inline ``//`` comments (removed first so that a comma followed by
      a comment is still recognised as a trailing comma)
    - Ellipsis entries (``...`` or ``"..."``), keeping a single comma
      when the ellipsis sat between two real entries
    - Trailing commas before ``]`` or ``}``

    Args:
        text: Malformed JSON text to repair.

    Returns:
        The cleaned text; identical to the input when nothing matched.
    """
    text = _LINE_COMMENT_RE.sub(lambda m: m.group(1) or "", text)
    text = _ELLIPSIS_RE.sub(_drop_ellipsis, text)
    text = _TRAILING_COMMA_RE.sub(lambda m: m.group(1) or m.group(2), text)
    return text