Coverage for src/local_deep_research/advanced_search_system/knowledge/followup_context_manager.py: 97%

109 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-03 23:15 +0000

1""" 

2Follow-up Context Manager 

3 

4Manages and processes past research context for follow-up questions. 

5This is a standalone class that doesn't inherit from BaseKnowledgeGenerator 

6to avoid implementing many abstract methods. 

7""" 

8 

9from typing import Dict, List, Any, Optional 

10from loguru import logger 

11 

12from langchain_core.language_models.chat_models import BaseChatModel 

13from ...utilities.search_utilities import remove_think_tags 

14 

15 

class FollowUpContextHandler:
    """
    Manages past research context for follow-up research.

    This class handles:
    1. Loading and structuring past research data
    2. Summarizing findings for follow-up context
    3. Extracting relevant information for new searches
    4. Building comprehensive context for strategies
    """

    # Placeholder emitted by ``_extract_findings`` when no findings exist.
    # It is a truthy string, so callers must compare against it explicitly
    # (see ``_has_findings``) instead of relying on truthiness.
    NO_FINDINGS_MESSAGE = "No previous findings available"

    # Marker appended to text that was cut short.
    _ELLIPSIS = "..."

    # Excerpt size used by the summary fallback when no max_length is given.
    _DEFAULT_EXCERPT_CHARS = 500

    def __init__(
        self, model: "BaseChatModel", settings_snapshot: Optional[Dict] = None
    ):
        """
        Initialize the context manager.

        Args:
            model: Language model for processing context
            settings_snapshot: Optional settings snapshot
        """
        self.model = model
        self.settings_snapshot = settings_snapshot or {}
        self.past_research_cache = {}

    def build_context(
        self, research_data: Dict[str, Any], follow_up_query: str
    ) -> Dict[str, Any]:
        """
        Build comprehensive context from past research.

        Args:
            research_data: Past research data including findings, sources, etc.
            follow_up_query: The follow-up question being asked

        Returns:
            Structured context dictionary for follow-up research
        """
        logger.info(f"Building context for follow-up: {follow_up_query}")

        # Extract all components
        return {
            "parent_research_id": research_data.get("research_id", ""),
            "original_query": research_data.get("query", ""),
            "follow_up_query": follow_up_query,
            "past_findings": self._extract_findings(research_data),
            "past_sources": self._extract_sources(research_data),
            "key_entities": self._extract_entities(research_data),
            "summary": self._create_summary(research_data, follow_up_query),
            "report_content": research_data.get("report_content", ""),
            "formatted_findings": research_data.get("formatted_findings", ""),
            "all_links_of_system": research_data.get("all_links_of_system", []),
            "metadata": self._extract_metadata(research_data),
        }

    def _has_findings(self, findings: Any) -> bool:
        """Return True when *findings* is real content, not empty or the placeholder."""
        return bool(findings) and findings != self.NO_FINDINGS_MESSAGE

    def _extract_findings(self, research_data: Dict) -> str:
        """
        Extract and format findings from past research.

        Lookup order: ``formatted_findings``, then the first 2000 characters
        of ``report_content``, then ``past_findings``.

        Args:
            research_data: Past research data

        Returns:
            Formatted findings string, or ``NO_FINDINGS_MESSAGE`` when none
            of the known keys holds a value
        """
        if formatted := research_data.get("formatted_findings"):
            return formatted

        # Take first part of report if no formatted findings
        if report := research_data.get("report_content"):
            return report[:2000]

        # Multi-turn chat supplies its condensed prior findings under the
        # "past_findings" key (it has no formatted_findings/report_content
        # of its own). Honor it before declaring nothing available —
        # otherwise the chat-built summary is silently dropped and the
        # follow-up prompt sees "No previous findings available".
        if past_findings := research_data.get("past_findings"):
            return past_findings

        return self.NO_FINDINGS_MESSAGE

    def _extract_sources(self, research_data: Dict) -> List[Dict]:
        """
        Extract and structure sources from past research.

        Sources are gathered from the ``resources``, ``all_links_of_system``
        and ``past_links`` fields, de-duplicated by URL (first occurrence
        wins).

        Args:
            research_data: Past research data

        Returns:
            List of source dictionaries
        """
        sources = []
        seen_urls = set()

        # Check all possible source fields
        for field in ("resources", "all_links_of_system", "past_links"):
            for source in research_data.get(field) or []:
                # A malformed (non-dict) entry must not abort the whole
                # context build; it cannot be returned as a source either.
                if not isinstance(source, dict):
                    continue

                url = source.get("url", "")
                if not url:
                    # Include sources without URLs (shouldn't happen but be safe)
                    sources.append(source)
                elif url not in seen_urls:
                    # Avoid duplicates by URL
                    seen_urls.add(url)
                    sources.append(source)

        return sources

    @staticmethod
    def _non_empty_lines(text: str, limit: int) -> List[str]:
        """Split *text* into stripped, non-blank lines and keep the first *limit*."""
        lines = [
            line.strip() for line in text.strip().split("\n") if line.strip()
        ]
        return lines[:limit]

    def _extract_entities(self, research_data: Dict) -> List[str]:
        """
        Extract key entities from past research.

        Args:
            research_data: Past research data

        Returns:
            List of key entities (at most 10); empty when there is no model,
            no real findings, or the model call fails
        """
        findings = self._extract_findings(research_data)

        # Do not spend a model call on the "nothing available" placeholder.
        if not self.model or not self._has_findings(findings):
            return []

        prompt = f"""
Extract key entities (names, places, organizations, concepts) from these research findings:

{findings[:2000]}

Return up to 10 most important entities, one per line.
"""

        try:
            response = self.model.invoke(prompt)
            return self._non_empty_lines(
                remove_think_tags(response.content), 10
            )
        except Exception:
            # Best effort: entity extraction is optional context.
            logger.warning("Failed to extract entities")
            return []

    def _create_summary(self, research_data: Dict, follow_up_query: str) -> str:
        """
        Create a targeted summary of past research relevant to the follow-up question.
        This is used internally for building context.

        Args:
            research_data: Past research data
            follow_up_query: The follow-up question

        Returns:
            Targeted summary for context building; empty string when there
            are no real findings to summarize
        """
        findings = self._extract_findings(research_data)

        # Summarizing the placeholder would produce a misleading "summary".
        if not self._has_findings(findings):
            return ""

        # For internal context, create a brief targeted summary
        return self._generate_summary(
            findings=findings,
            query=follow_up_query,
            original_query=research_data.get("query", ""),
            max_sentences=5,
            purpose="context",
        )

    def _extract_metadata(self, research_data: Dict) -> Dict:
        """
        Extract metadata from past research.

        Args:
            research_data: Past research data

        Returns:
            Metadata dictionary
        """
        return {
            "strategy": research_data.get("strategy", ""),
            "mode": research_data.get("mode", ""),
            "created_at": research_data.get("created_at", ""),
            "research_meta": research_data.get("research_meta", {}),
        }

    def summarize_for_followup(
        self, findings: str, query: str, max_length: int = 1000
    ) -> str:
        """
        Create a concise summary of findings for external use (e.g., in prompts).
        This creates a length-constrained summary suitable for inclusion in LLM prompts.

        Args:
            findings: Past research findings
            query: Follow-up query
            max_length: Maximum length of summary in characters

        Returns:
            Concise summary constrained to max_length
        """
        # Use the shared summary generation with specific parameters for external use
        return self._generate_summary(
            findings=findings,
            query=query,
            original_query=None,
            # Approximate sentences based on length; never ask for zero
            # sentences when max_length is below 100.
            max_sentences=max(1, max_length // 100),
            purpose="prompt",
            max_length=max_length,
        )

    @classmethod
    def _truncate(cls, text: str, max_length: int) -> str:
        """Cut *text* to at most *max_length* characters, ellipsis included."""
        if len(text) <= max_length:
            return text
        if max_length <= len(cls._ELLIPSIS):
            # No room for an ellipsis; a hard cut is all that fits.
            return text[:max_length]
        return text[: max_length - len(cls._ELLIPSIS)] + cls._ELLIPSIS

    def _fallback_excerpt(
        self, findings: str, max_length: Optional[int]
    ) -> str:
        """Return a plain excerpt of *findings* when no model summary is available."""
        if max_length:
            return self._truncate(findings, max_length)
        # Only mark the text as cut when something was actually removed.
        if len(findings) <= self._DEFAULT_EXCERPT_CHARS:
            return findings
        return findings[: self._DEFAULT_EXCERPT_CHARS] + self._ELLIPSIS

    def _generate_summary(
        self,
        findings: str,
        query: str,
        original_query: Optional[str] = None,
        max_sentences: int = 5,
        purpose: str = "context",
        max_length: Optional[int] = None,
    ) -> str:
        """
        Shared summary generation logic.

        Args:
            findings: Research findings to summarize
            query: Follow-up query
            original_query: Original research query (optional)
            max_sentences: Maximum number of sentences
            purpose: Purpose of summary ("context" or "prompt")
            max_length: Maximum character length (optional)

        Returns:
            Generated summary. Falls back to an excerpt of the findings when
            there is no model or the model call fails.
        """
        if not findings:
            return ""

        # If findings are already short enough, return as-is
        if max_length and len(findings) <= max_length:
            return findings

        if not self.model:
            # Fallback without model
            return self._fallback_excerpt(findings, max_length)

        # Build prompt based on purpose
        if purpose == "context" and original_query:
            prompt = f"""
Create a brief summary of previous research findings that are relevant to this follow-up question:

Original research question: "{original_query}"
Follow-up question: "{query}"

Previous findings:
{findings[:3000]}

Provide a {max_sentences}-sentence summary focusing on aspects relevant to the follow-up question.
"""
        else:
            prompt = f"""
Summarize these research findings in relation to the follow-up question:

Follow-up question: "{query}"

Findings:
{findings[:4000]}

Create a summary of {max_sentences} sentences that captures the most relevant information.
"""

        try:
            response = self.model.invoke(prompt)
            summary = remove_think_tags(response.content).strip()
        except Exception:
            logger.warning("Summary generation failed")
            # Fallback to truncation
            return self._fallback_excerpt(findings, max_length)

        # Apply length constraint if specified
        if max_length:
            summary = self._truncate(summary, max_length)

        return summary

    def identify_gaps(
        self, research_data: Dict, follow_up_query: str
    ) -> List[str]:
        """
        Identify information gaps that the follow-up should address.

        Args:
            research_data: Past research data
            follow_up_query: Follow-up question

        Returns:
            List of identified gaps (at most 5); empty when there is no
            model, no real findings, or the model call fails
        """
        findings = self._extract_findings(research_data)

        # Do not spend a model call on the "nothing available" placeholder.
        if not self.model or not self._has_findings(findings):
            return []

        prompt = f"""
Based on the previous research and the follow-up question, identify information gaps:

Previous research findings:
{findings[:2000]}

Follow-up question: "{follow_up_query}"

What specific information is missing or needs clarification? List up to 5 gaps, one per line.
"""

        try:
            response = self.model.invoke(prompt)
            return self._non_empty_lines(remove_think_tags(response.content), 5)
        except Exception:
            # Best effort: gap detection is optional context.
            logger.warning("Failed to identify gaps")
            return []

    def format_for_settings_snapshot(
        self, context: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Format context for inclusion in settings snapshot.
        Only includes essential metadata, not actual content.

        Args:
            context: Full context dictionary

        Returns:
            Minimal metadata for settings snapshot
        """
        # Only include minimal metadata in settings snapshot
        # Settings snapshot should be for settings, not data
        return {
            "followup_metadata": {
                "parent_research_id": context.get("parent_research_id"),
                "is_followup": True,
                # The placeholder string is truthy but is not real context.
                "has_context": self._has_findings(
                    context.get("past_findings")
                ),
            }
        }

    def get_relevant_context_for_llm(
        self, context: Dict[str, Any], max_tokens: int = 2000
    ) -> str:
        """
        Get a concise version of context for LLM prompts.

        Args:
            context: Full context dictionary
            max_tokens: Approximate maximum tokens

        Returns:
            Concise context string
        """
        # Add original and follow-up queries
        parts = [
            f"Original research: {context.get('original_query', '')}",
            f"Follow-up question: {context.get('follow_up_query', '')}",
        ]

        # Add summary
        if summary := context.get("summary"):
            parts.append(f"\nPrevious findings summary:\n{summary}")

        # Add key entities
        if entities := context.get("key_entities"):
            parts.append(f"\nKey entities: {', '.join(entities[:5])}")

        # Add source count
        if sources := context.get("past_sources"):
            parts.append(f"\nAvailable sources: {len(sources)}")

        result = "\n".join(parts)

        # Truncate if needed (rough approximation: 4 chars per token)
        max_chars = max_tokens * 4
        if len(result) > max_chars:
            result = result[:max_chars] + "..."

        return result