Coverage for src / local_deep_research / utilities / json_utils.py: 97%

81 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-02-25 01:07 +0000

1""" 

2Centralized utilities for extracting and parsing JSON from LLM responses. 

3 

4Provides two public functions: 

5- get_llm_response_text: Extract text from LLM response objects 

6- extract_json: Parse JSON from LLM-generated text with robust cleaning 

7""" 

8 

9import json 

10import re 

11from typing import Optional, Type, Union 

12 

13from loguru import logger 

14 

15 

def get_llm_response_text(response) -> str:
    """Return the textual payload of an LLM response, think tags stripped.

    The payload is looked up in this order:

    1. a non-None ``content`` attribute (e.g. a LangChain AIMessage),
    2. a non-None ``text`` attribute,
    3. ``str(response)`` as the fallback for anything else.

    A payload that is not already a string is converted with ``str()``
    before the think tags are removed.

    Args:
        response: LLM response object, string, or None.

    Returns:
        The payload after ``_remove_think_tags``. Empty string when
        ``response`` is None.
    """
    if response is None:
        return ""

    # Probe the known payload attributes in priority order; the for/else
    # falls back to str() only when neither attribute yields a value.
    for attr_name in ("content", "text"):
        payload = getattr(response, attr_name, None)
        if payload is not None:
            break
    else:
        payload = str(response)

    as_text = payload if isinstance(payload, str) else str(payload)
    return _remove_think_tags(as_text)

39 

40 

def extract_json(
    text: str,
    expected_type: Optional[Type] = None,
) -> Optional[Union[dict, list]]:
    """Extract and parse JSON from LLM-generated text.

    Pipeline:

    1. Remove think tags, then strip markdown code fences. Think tags go
       first on purpose: a reasoning block may contain a *draft* fenced
       snippet, and stripping fences first would return that draft
       instead of the real answer that follows the reasoning.
    2. Try to parse the whole remaining text.
    3. Otherwise take the span from the first opening bracket to the last
       closing bracket (``{}`` and ``[]``, ordered by ``expected_type``)
       and parse it, retrying once after cleaning common LLM artifacts.

    Args:
        text: Raw text potentially containing JSON.
        expected_type: Expected JSON type (dict or list). If specified,
            only a value of that type is returned, and list brackets are
            tried first when it is ``list``. None accepts either type.

    Returns:
        Parsed dict or list, or None if no valid JSON found.
    """
    if not text or not text.strip():
        return None

    # Order matters here, see step 1 of the docstring.
    text = _remove_think_tags(text)
    text = _strip_code_fences(text)

    # Step 1: try a direct parse of the whole text. A type mismatch falls
    # through to bracket extraction, which may find a nested match.
    result = _parse_json_container(text, expected_type)
    if result is not None:
        return result

    # Step 2: bracket extraction. Objects are tried first unless the
    # caller explicitly asked for a list.
    if expected_type is list:
        bracket_pairs = (("[", "]"), ("{", "}"))
    else:
        bracket_pairs = (("{", "}"), ("[", "]"))

    for open_char, close_char in bracket_pairs:
        extracted = _extract_by_brackets(text, open_char, close_char)
        if extracted is None:
            continue

        result = _parse_json_container(extracted, expected_type)
        if result is not None:
            return result

        # Retry only when cleaning actually changed something; parsing
        # the identical string again could not succeed.
        cleaned = _clean_llm_json_artifacts(extracted)
        if cleaned != extracted:
            result = _parse_json_container(cleaned, expected_type)
            if result is not None:
                return result

    logger.debug("No valid JSON found in text")
    return None


def _parse_json_container(
    candidate: str, expected_type: Optional[Type]
) -> Optional[Union[dict, list]]:
    """Parse ``candidate`` and return it only if it is an acceptable container.

    Returns None when the text is not valid JSON, when the parsed value is
    not a dict or list (e.g. a bare number or string), or when
    ``expected_type`` is given and the parsed value is not an instance of it.
    """
    try:
        parsed = json.loads(candidate)
    except (json.JSONDecodeError, ValueError):
        return None
    if not isinstance(parsed, (dict, list)):
        return None
    if expected_type is not None and not isinstance(parsed, expected_type):
        return None
    return parsed

113 

114 

115# --------------------------------------------------------------------------- 

116# Private helpers 

117# --------------------------------------------------------------------------- 

118 

119 

def _remove_think_tags(text: str) -> str:
    """Strip ``<think>`` reasoning markup from ``text``.

    Complete ``<think>...</think>`` blocks are deleted together with their
    content (across newlines). Any unpaired closing or opening tag left
    afterwards is deleted on its own, keeping the surrounding text. The
    result is whitespace-stripped.

    Duplicated from search_utilities to keep this module free of
    internal package dependencies (follows type_utils.py convention).
    """
    paired_block = re.compile(r"<think>.*?</think>", flags=re.DOTALL)
    remainder = paired_block.sub("", text)

    # Orphan tags: closing tag first, then opening tag.
    for orphan_tag in ("</think>", "<think>"):
        remainder = remainder.replace(orphan_tag, "")

    return remainder.strip()

130 

131 

def _strip_code_fences(text: str) -> str:
    """Return the content of the first markdown code fence in ``text``.

    A fence opened with three backticks followed by ``json`` takes
    priority: everything after its first occurrence, up to the next
    triple backtick (or the end of the text if the fence is never
    closed), is returned stripped. Otherwise, if the text contains at
    least two triple-backtick markers, the stripped text between the
    first two is returned. Text without a usable fence is returned
    unchanged.

    Fences are located anywhere in the text (not only at the start), so
    prose surrounding the fenced block is tolerated.
    """
    fence = "```"

    _, json_marker, after_marker = text.partition(fence + "json")
    if json_marker:
        return after_marker.partition(fence)[0].strip()

    # maxsplit=2 yields three pieces exactly when two or more fence
    # markers are present, i.e. when a generic fence is properly closed.
    pieces = text.split(fence, 2)
    if len(pieces) == 3:
        return pieces[1].strip()

    return text

147 

148 

def _extract_by_brackets(
    text: str, open_char: str, close_char: str
) -> Optional[str]:
    """Return the widest bracketed span of ``text``, or None.

    The span runs from the first occurrence of ``open_char`` to the last
    occurrence of ``close_char``, both included. None is returned when
    ``open_char`` is absent or when no ``close_char`` occurs after it.
    No nesting or balance check is performed.
    """
    first_open = text.find(open_char)
    if first_open < 0:
        return None

    last_close = text.rfind(close_char)
    if last_close <= first_open:
        # Covers both "no closing bracket" (-1) and "closes before it opens".
        return None

    return text[first_open : last_close + 1]

162 

163 

def _clean_llm_json_artifacts(text: str) -> str:
    """Clean common LLM artifacts from malformed JSON text.

    Intended to be called only after ``json.loads`` has already failed.
    Every pass skips over complete double-quoted string literals, so
    string *values* are never edited: a URL such as ``"https://x.y"`` is
    not mistaken for a ``//`` comment, and a sentence ending in ``...``
    is not mistaken for an ellipsis entry.

    Passes, in this order:

    1. Remove ``//`` line comments (up to, not including, the newline).
    2. Remove ellipsis placeholder entries: bare ``...`` or a string made
       only of three or more dots such as ``"..."``. When the entry sits
       between two commas one comma is kept, so ``[1, ..., 3]`` becomes
       ``[1, 3]``. An ellipsis directly after a ``:`` is an object value,
       not a placeholder entry, and is left alone.
    3. Remove trailing commas before ``]`` or ``}``. This runs last
       because the earlier passes can expose new trailing commas
       (``[1, 2, // more`` + newline + ``]`` or ``[1, 2, ...]``).

    Args:
        text: Candidate JSON text that failed to parse.

    Returns:
        The cleaned text. It equals ``text`` when no artifact was found.
    """
    # One complete JSON string literal, including backslash escapes.
    string_literal = r'"(?:\\.|[^"\\])*"'

    # Pass 1: // comments. Group 1 is a protected string literal; when it
    # is unset the match was a comment and is dropped.
    comment_re = re.compile("(" + string_literal + r")|//[^\n]*")
    text = comment_re.sub(lambda m: m.group(1) or "", text)

    # Pass 2: ellipsis entries. The ellipsis alternative comes first so a
    # string consisting only of dots is treated as a placeholder; every
    # other string literal is consumed whole by the second alternative
    # and returned untouched.
    ellipsis_re = re.compile(
        r'(?P<lead>,?)\s*(?:"\.{3,}"|\.{3,})\s*(?P<trail>,?)'
        + "|"
        + string_literal
    )

    def _drop_ellipsis(match):
        lead = match.group("lead")
        if lead is None:
            # Ordinary string literal: keep verbatim.
            return match.group(0)
        if match.string[: match.start()].rstrip().endswith(":"):
            # Value position ("key": "..."): not a placeholder entry.
            return match.group(0)
        # Keep one separator when the entry sat between two neighbours.
        return "," if lead and match.group("trail") else ""

    text = ellipsis_re.sub(_drop_ellipsis, text)

    # Pass 3: trailing commas. Group 1 is a protected string literal,
    # group 2 the closing bracket that follows the comma.
    trailing_comma_re = re.compile("(" + string_literal + r")|,\s*([}\]])")
    text = trailing_comma_re.sub(lambda m: m.group(1) or m.group(2), text)

    return text