Coverage for src / local_deep_research / advanced_search_system / knowledge / followup_context_manager.py: 97%

107 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1""" 

2Follow-up Context Manager 

3 

4Manages and processes past research context for follow-up questions. 

5This is a standalone class that doesn't inherit from BaseKnowledgeGenerator 

6to avoid implementing many abstract methods. 

7""" 

8 

9from typing import Dict, List, Any, Optional 

10from loguru import logger 

11 

12from langchain_core.language_models.chat_models import BaseChatModel 

13from ...utilities.search_utilities import remove_think_tags 

14 

15 

class FollowUpContextHandler:
    """
    Manages past research context for follow-up research.

    This class handles:
    1. Loading and structuring past research data
    2. Summarizing findings for follow-up context
    3. Extracting relevant information for new searches
    4. Building comprehensive context for strategies
    """

    def __init__(
        self, model: BaseChatModel, settings_snapshot: Optional[Dict] = None
    ):
        """
        Initialize the context manager.

        Args:
            model: Language model for processing context
            settings_snapshot: Optional settings snapshot
        """
        self.model = model
        self.settings_snapshot = settings_snapshot or {}
        # Reserved for memoizing loaded past-research payloads; not yet
        # populated anywhere in this class.
        self.past_research_cache = {}

    def build_context(
        self, research_data: Dict[str, Any], follow_up_query: str
    ) -> Dict[str, Any]:
        """
        Build comprehensive context from past research.

        Args:
            research_data: Past research data including findings, sources, etc.
            follow_up_query: The follow-up question being asked

        Returns:
            Structured context dictionary for follow-up research
        """
        logger.info(f"Building context for follow-up: {follow_up_query}")

        # Extract all components
        return {
            "parent_research_id": research_data.get("research_id", ""),
            "original_query": research_data.get("query", ""),
            "follow_up_query": follow_up_query,
            "past_findings": self._extract_findings(research_data),
            "past_sources": self._extract_sources(research_data),
            "key_entities": self._extract_entities(research_data),
            "summary": self._create_summary(research_data, follow_up_query),
            "report_content": research_data.get("report_content", ""),
            "formatted_findings": research_data.get("formatted_findings", ""),
            "all_links_of_system": research_data.get("all_links_of_system", []),
            "metadata": self._extract_metadata(research_data),
        }

    def _extract_findings(self, research_data: Dict) -> str:
        """
        Extract and format findings from past research.

        Prefers ``formatted_findings``; falls back to the first 2000
        characters of ``report_content`` when no formatted findings exist.

        Args:
            research_data: Past research data

        Returns:
            Formatted findings string, or a placeholder when nothing is found
        """
        findings_parts = []

        # Check various possible locations for findings
        if formatted := research_data.get("formatted_findings"):
            findings_parts.append(formatted)

        if report := research_data.get("report_content"):
            # Take first part of report if no formatted findings
            if not findings_parts:
                findings_parts.append(report[:2000])

        if not findings_parts:
            return "No previous findings available"

        return "\n\n".join(findings_parts)

    def _extract_sources(self, research_data: Dict) -> List[Dict]:
        """
        Extract and structure sources from past research.

        Merges sources from all known fields, de-duplicating by URL.
        Sources without a URL are kept as-is (cannot be de-duplicated).

        Args:
            research_data: Past research data

        Returns:
            List of source dictionaries
        """
        sources = []
        seen_urls = set()

        # Check all possible source fields
        for field in ["resources", "all_links_of_system", "past_links"]:
            if field_sources := research_data.get(field, []):
                for source in field_sources:
                    url = source.get("url", "")
                    # Avoid duplicates by URL
                    if url and url not in seen_urls:
                        sources.append(source)
                        seen_urls.add(url)
                    elif not url:
                        # Include sources without URLs (shouldn't happen but be safe)
                        sources.append(source)

        return sources

    def _extract_entities(self, research_data: Dict) -> List[str]:
        """
        Extract key entities from past research using the LLM.

        Args:
            research_data: Past research data

        Returns:
            List of up to 10 key entities; empty when no findings or no model
        """
        findings = self._extract_findings(research_data)

        if not findings or not self.model:
            return []

        prompt = f"""
Extract key entities (names, places, organizations, concepts) from these research findings:

{findings[:2000]}

Return up to 10 most important entities, one per line.
"""

        try:
            response = self.model.invoke(prompt)
            entities = [
                line.strip()
                for line in remove_think_tags(response.content)
                .strip()
                .split("\n")
                if line.strip()
            ]
            return entities[:10]
        except Exception:
            # logger.exception preserves the traceback for diagnosis;
            # entity extraction is best-effort, so degrade to an empty list.
            logger.exception("Failed to extract entities")
            return []

    def _create_summary(self, research_data: Dict, follow_up_query: str) -> str:
        """
        Create a targeted summary of past research relevant to the follow-up question.
        This is used internally for building context.

        Args:
            research_data: Past research data
            follow_up_query: The follow-up question

        Returns:
            Targeted summary for context building
        """
        findings = self._extract_findings(research_data)
        original_query = research_data.get("query", "")

        # For internal context, create a brief targeted summary
        return self._generate_summary(
            findings=findings,
            query=follow_up_query,
            original_query=original_query,
            max_sentences=5,
            purpose="context",
        )

    def _extract_metadata(self, research_data: Dict) -> Dict:
        """
        Extract metadata from past research.

        Args:
            research_data: Past research data

        Returns:
            Metadata dictionary
        """
        return {
            "strategy": research_data.get("strategy", ""),
            "mode": research_data.get("mode", ""),
            "created_at": research_data.get("created_at", ""),
            "research_meta": research_data.get("research_meta", {}),
        }

    def summarize_for_followup(
        self, findings: str, query: str, max_length: int = 1000
    ) -> str:
        """
        Create a concise summary of findings for external use (e.g., in prompts).
        This creates a length-constrained summary suitable for inclusion in LLM prompts.

        Args:
            findings: Past research findings
            query: Follow-up query
            max_length: Maximum length of summary in characters

        Returns:
            Concise summary constrained to max_length
        """
        # Use the shared summary generation with specific parameters for external use.
        # max(1, ...) guards against max_length < 100 producing a request for
        # a "0-sentence" summary.
        return self._generate_summary(
            findings=findings,
            query=query,
            original_query=None,
            max_sentences=max(
                1, max_length // 100
            ),  # Approximate sentences based on length
            purpose="prompt",
            max_length=max_length,
        )

    def _generate_summary(
        self,
        findings: str,
        query: str,
        original_query: Optional[str] = None,
        max_sentences: int = 5,
        purpose: str = "context",
        max_length: Optional[int] = None,
    ) -> str:
        """
        Shared summary generation logic.

        Args:
            findings: Research findings to summarize
            query: Follow-up query
            original_query: Original research query (optional)
            max_sentences: Maximum number of sentences
            purpose: Purpose of summary ("context" or "prompt")
            max_length: Maximum character length (optional)

        Returns:
            Generated summary; truncated findings when no model is available
        """
        if not findings:
            return ""

        # If findings are already short enough, return as-is
        if max_length and len(findings) <= max_length:
            return findings

        if not self.model:
            # Fallback without model
            if max_length:
                return findings[:max_length] + "..."
            return findings[:500] + "..."

        # Build prompt based on purpose
        if purpose == "context" and original_query:
            prompt = f"""
Create a brief summary of previous research findings that are relevant to this follow-up question:

Original research question: "{original_query}"
Follow-up question: "{query}"

Previous findings:
{findings[:3000]}

Provide a {max_sentences}-sentence summary focusing on aspects relevant to the follow-up question.
"""
        else:
            prompt = f"""
Summarize these research findings in relation to the follow-up question:

Follow-up question: "{query}"

Findings:
{findings[:4000]}

Create a summary of {max_sentences} sentences that captures the most relevant information.
"""

        try:
            response = self.model.invoke(prompt)
            summary = remove_think_tags(response.content).strip()

            # Apply length constraint if specified
            if max_length and len(summary) > max_length:
                summary = summary[:max_length] + "..."

            return summary
        except Exception:
            # Keep the traceback so LLM failures are diagnosable, then
            # degrade gracefully to plain truncation.
            logger.exception("Summary generation failed")
            # Fallback to truncation
            if max_length:
                return findings[:max_length] + "..."
            return findings[:500] + "..."

    def identify_gaps(
        self, research_data: Dict, follow_up_query: str
    ) -> List[str]:
        """
        Identify information gaps that the follow-up should address.

        Args:
            research_data: Past research data
            follow_up_query: Follow-up question

        Returns:
            List of up to 5 identified gaps; empty when no findings or no model
        """
        findings = self._extract_findings(research_data)

        if not findings or not self.model:
            return []

        prompt = f"""
Based on the previous research and the follow-up question, identify information gaps:

Previous research findings:
{findings[:2000]}

Follow-up question: "{follow_up_query}"

What specific information is missing or needs clarification? List up to 5 gaps, one per line.
"""

        try:
            response = self.model.invoke(prompt)
            gaps = [
                line.strip()
                for line in remove_think_tags(response.content)
                .strip()
                .split("\n")
                if line.strip()
            ]
            return gaps[:5]
        except Exception:
            # Best-effort: log the full traceback and return no gaps.
            logger.exception("Failed to identify gaps")
            return []

    def format_for_settings_snapshot(
        self, context: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Format context for inclusion in settings snapshot.
        Only includes essential metadata, not actual content.

        Args:
            context: Full context dictionary

        Returns:
            Minimal metadata for settings snapshot
        """
        # Only include minimal metadata in settings snapshot
        # Settings snapshot should be for settings, not data
        return {
            "followup_metadata": {
                "parent_research_id": context.get("parent_research_id"),
                "is_followup": True,
                "has_context": bool(context.get("past_findings")),
            }
        }

    def get_relevant_context_for_llm(
        self, context: Dict[str, Any], max_tokens: int = 2000
    ) -> str:
        """
        Get a concise version of context for LLM prompts.

        Args:
            context: Full context dictionary
            max_tokens: Approximate maximum tokens

        Returns:
            Concise context string
        """
        parts = []

        # Add original and follow-up queries
        parts.append(f"Original research: {context.get('original_query', '')}")
        parts.append(
            f"Follow-up question: {context.get('follow_up_query', '')}"
        )

        # Add summary
        if summary := context.get("summary"):
            parts.append(f"\nPrevious findings summary:\n{summary}")

        # Add key entities
        if entities := context.get("key_entities"):
            parts.append(f"\nKey entities: {', '.join(entities[:5])}")

        # Add source count
        if sources := context.get("past_sources"):
            parts.append(f"\nAvailable sources: {len(sources)}")

        result = "\n".join(parts)

        # Truncate if needed (rough approximation: 4 chars per token)
        max_chars = max_tokens * 4
        if len(result) > max_chars:
            result = result[:max_chars] + "..."

        return result