Coverage for src/local_deep_research/utilities/json

1"""

Centralized utilities for extracting and parsing JSON from LLM responses.

Provides two public functions:
- get_llm_response_text: Extract text from LLM response objects
- extract_json: Parse JSON from LLM-generated text with robust cleaning

7"""

9import json

10import re

11from typing import Optional, Type, Union

13from loguru import logger

def get_llm_response_text(response) -> str:
    """Return the textual payload of an LLM response with think tags removed.

    The payload is looked up in this order: a non-None ``.content``
    attribute, then a non-None ``.text`` attribute, and finally
    ``str(response)``. A payload that is not a string is converted with
    ``str()`` before the think tags are stripped.

    Args:
        response: LLM response object, string, or None.

    Returns:
        The extracted text with think tags removed, or an empty string
        when ``response`` is None.
    """
    if response is None:
        return ""

    payload = getattr(response, "content", None)
    if payload is None:
        payload = getattr(response, "text", None)
    if payload is None:
        # Neither attribute is usable: fall back to the object's string form.
        payload = str(response)

    as_text = payload if isinstance(payload, str) else str(payload)
    return _remove_think_tags(as_text)

def extract_json(
    text: str,
    expected_type: Optional[Type] = None,
) -> Optional[Union[dict, list]]:
    """Extract and parse JSON from LLM-generated text.

    Applies a cleaning pipeline to handle common LLM output patterns:
    think tags, code fences, prose surrounding JSON, and minor artifacts.

    The pipeline is:
      1. Remove think tags, then strip markdown code fences.
      2. Try to parse the remaining text directly.
      3. Otherwise extract the outermost bracketed substring (the bracket
         kind matching ``expected_type`` is tried first), parse it, and if
         that fails retry after cleaning common LLM artifacts.

    Args:
        text: Raw text potentially containing JSON.
        expected_type: Expected JSON type (dict or list). If specified,
            bracket extraction is ordered to prefer the matching type and
            results of the other type are rejected. None accepts either.

    Returns:
        Parsed dict or list, or None if no valid JSON found.
    """
    if not text or not text.strip():
        return None

    # Think tags are removed *before* code fences are stripped: a fenced
    # snippet inside a <think> block is scratch work and must not be picked
    # over the fenced (or bare) JSON in the actual answer.
    text = _remove_think_tags(text)
    text = _strip_code_fences(text)

    def _parse(candidate: str) -> Optional[Union[dict, list]]:
        """Parse ``candidate``; return it only if it is an accepted container."""
        try:
            parsed = json.loads(candidate)
        except (json.JSONDecodeError, ValueError):
            return None
        if not isinstance(parsed, (dict, list)):
            return None
        if expected_type is not None and not isinstance(parsed, expected_type):
            return None
        return parsed

    # Step 1: Try direct parse. Compare against None explicitly because an
    # empty dict/list is a valid (falsy) result.
    result = _parse(text)
    if result is not None:
        return result

    # Step 2: Bracket extraction ordered by expected_type. Objects are tried
    # first unless a list was explicitly requested.
    if expected_type is list:
        bracket_pairs = (("[", "]"), ("{", "}"))
    else:
        bracket_pairs = (("{", "}"), ("[", "]"))

    for open_char, close_char in bracket_pairs:
        extracted = _extract_by_brackets(text, open_char, close_char)
        if extracted is None:
            continue

        result = _parse(extracted)
        if result is not None:
            return result

        # Retry after cleaning LLM artifacts, but only if cleaning actually
        # changed something (otherwise the parse would fail identically).
        cleaned = _clean_llm_json_artifacts(extracted)
        if cleaned != extracted:
            result = _parse(cleaned)
            if result is not None:
                return result

    logger.debug("No valid JSON found in text")
    return None

113

114

# ---------------------------------------------------------------------------
# Private helpers
# ---------------------------------------------------------------------------

118

119

def _remove_think_tags(text: str) -> str:
    """Strip ``<think>`` reasoning markup from ``text``.

    Complete ``<think>...</think>`` blocks are removed together with their
    content (across newlines). Any unpaired closing or opening tag that is
    left over is then removed on its own, and the result is whitespace-
    trimmed.

    Duplicated from search_utilities to keep this module free of
    internal package dependencies (follows type_utils.py convention).
    """
    stripped = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
    # Orphan tags: closing tags first, then opening tags.
    for orphan_tag in ("</think>", "<think>"):
        stripped = stripped.replace(orphan_tag, "")
    return stripped.strip()

130

131

def _strip_code_fences(text: str) -> str:
    """Return the content of the first markdown code fence in ``text``.

    A ```` ```json ```` fence takes priority: everything after its first
    occurrence up to the next ```` ``` ```` (or the end of the text) is
    returned, trimmed. Otherwise, if the text contains at least two plain
    ```` ``` ```` markers, the trimmed text between the first two is
    returned. With no usable fence the text is returned unchanged.

    Splitting (rather than a startswith check) lets fences be found in the
    middle of surrounding prose.
    """
    json_marker = "```json"
    if json_marker in text:
        after_marker = text.split(json_marker)[1]
        return after_marker.split("```")[0].strip()

    segments = text.split("```")
    if len(segments) >= 3:
        return segments[1].strip()
    return text

147

148

def _extract_by_brackets(
    text: str, open_char: str, close_char: str
) -> Optional[str]:
    """Return the span from the first ``open_char`` to the last ``close_char``.

    The span includes both delimiters. None is returned when ``open_char``
    does not occur, or when the last ``close_char`` is missing or does not
    come after the first ``open_char``.
    """
    first_open = text.find(open_char)
    if first_open < 0:
        return None

    last_close = text.rfind(close_char)
    if last_close <= first_open:
        # Covers both "no closing delimiter" (-1) and "closes before it opens".
        return None

    return text[first_open : last_close + 1]

162

163

def _clean_llm_json_artifacts(text: str) -> str:
    """Clean common LLM JSON artifacts from malformed JSON text.

    Intended to be called after json.loads has already failed on ``text``.
    The cleaning is string-aware: ordinary JSON string literals are copied
    through untouched, so URLs such as ``"http://x"`` and strings such as
    ``"Loading..."`` are never mistaken for comments or ellipsis entries.

    Handles, outside of string literals:
    - Inline // comments (removed first, so a comment sitting between a
      trailing comma and its closing bracket does not hide the comma)
    - Ellipsis entries (... or "...") between, before, or after items
    - Trailing commas before ] or }

    A quoted ``"..."`` that is directly adjacent to a colon (an object key
    or an object value) is treated as real data and kept.

    Args:
        text: Malformed JSON text.

    Returns:
        The cleaned text; it is not guaranteed to be valid JSON.
    """

    def _clean_code(code: str) -> str:
        """Clean one stretch of text that contains no string literals."""
        # Ellipsis between two items: keep a single separating comma.
        code = re.sub(r",\s*\.{3,}\s*,", ",", code)
        # Leading ellipsis: drop it together with the comma that follows.
        code = re.sub(r"([\[{])\s*\.{3,}\s*,", r"\1", code)
        # Trailing or lone ellipsis: drop it with any preceding comma.
        code = re.sub(r",?\s*\.{3,}\s*", "", code)
        # Trailing commas go last so commas exposed by the steps above
        # are removed as well.
        return re.sub(r",\s*([}\]])", r"\1", code)

    # With a capturing group, re.split returns an odd-length list
    # [code, token, code, token, ..., code]; every odd index is a complete
    # string literal or a // comment. Because matching is leftmost-first, a
    # "//" inside a string belongs to the string, and a quote inside a
    # comment belongs to the comment.
    pieces = re.split(r'("(?:\\.|[^"\\])*"|//[^\n]*)', text)

    output = []
    pending = ""  # non-string text accumulated since the last kept string
    for index, piece in enumerate(pieces):
        if index % 2 == 0:
            pending += piece
        elif piece.startswith("//"):
            # Comment: drop it; the surrounding code text is joined.
            continue
        elif (
            re.fullmatch(r'"\.{3,}"', piece)
            and not pending.rstrip().endswith(":")
            and not pieces[index + 1].lstrip().startswith(":")
        ):
            # Quoted ellipsis placeholder in item position: treat it like a
            # bare ellipsis so the comma handling above applies to it.
            pending += "..."
        else:
            output.append(_clean_code(pending))
            output.append(piece)
            pending = ""
    output.append(_clean_code(pending))
    return "".join(output)

Coverage for src/local_deep_research/utilities/json_utils.py: 97%

81 statements