Coverage for src / local_deep_research / advanced_search_system / knowledge / followup_context_manager.py: 11%

109 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

1""" 

2Follow-up Context Manager 

3 

4Manages and processes past research context for follow-up questions. 

5This is a standalone class that doesn't inherit from BaseKnowledgeGenerator 

6to avoid implementing many abstract methods. 

7""" 

8 

9from typing import Dict, List, Any, Optional 

10from loguru import logger 

11 

12from langchain_core.language_models.chat_models import BaseChatModel 

13from ...utilities.search_utilities import remove_think_tags 

14 

15 

class FollowUpContextHandler:
    """
    Manages past research context for follow-up research.

    This class handles:
    1. Loading and structuring past research data
    2. Summarizing findings for follow-up context
    3. Extracting relevant information for new searches
    4. Building comprehensive context for strategies

    Standalone by design: it deliberately does not inherit from
    BaseKnowledgeGenerator to avoid implementing many abstract methods.
    """

    def __init__(
        self, model: BaseChatModel, settings_snapshot: Optional[Dict] = None
    ):
        """
        Initialize the context manager.

        Args:
            model: Language model for processing context. May be falsy, in
                which case all LLM-backed methods fall back to truncation
                or empty results.
            settings_snapshot: Optional settings snapshot
        """
        self.model = model
        self.settings_snapshot = settings_snapshot or {}
        # Reserved cache for loaded past-research payloads; nothing in this
        # class populates it yet.
        self.past_research_cache = {}

    @staticmethod
    def _truncate(text: str, limit: int) -> str:
        """
        Truncate ``text`` to at most ``limit`` characters, marking the cut
        with a trailing "...".

        Unlike a bare ``text[:limit] + "..."``, the result never exceeds
        ``limit`` characters and text that already fits is returned unchanged.
        """
        if len(text) <= limit:
            return text
        return text[: max(0, limit - 3)] + "..."

    def _parse_lines(self, response: Any) -> List[str]:
        """
        Split an LLM response into stripped, non-empty lines.

        Shared by entity extraction and gap identification, which both ask
        the model for "one item per line" output.
        """
        cleaned = remove_think_tags(response.content).strip()
        return [line.strip() for line in cleaned.split("\n") if line.strip()]

    def build_context(
        self, research_data: Dict[str, Any], follow_up_query: str
    ) -> Dict[str, Any]:
        """
        Build comprehensive context from past research.

        Args:
            research_data: Past research data including findings, sources, etc.
            follow_up_query: The follow-up question being asked

        Returns:
            Structured context dictionary for follow-up research
        """
        logger.info(f"Building context for follow-up: {follow_up_query}")

        # Assemble every component a follow-up strategy might need; raw
        # fields (report_content, formatted_findings, links) are passed
        # through unchanged alongside the derived ones.
        return {
            "parent_research_id": research_data.get("research_id", ""),
            "original_query": research_data.get("query", ""),
            "follow_up_query": follow_up_query,
            "past_findings": self._extract_findings(research_data),
            "past_sources": self._extract_sources(research_data),
            "key_entities": self._extract_entities(research_data),
            "summary": self._create_summary(research_data, follow_up_query),
            "report_content": research_data.get("report_content", ""),
            "formatted_findings": research_data.get("formatted_findings", ""),
            "all_links_of_system": research_data.get("all_links_of_system", []),
            "metadata": self._extract_metadata(research_data),
        }

    def _extract_findings(self, research_data: Dict) -> str:
        """
        Extract and format findings from past research.

        Prefers pre-formatted findings; falls back to the first 2000
        characters of the raw report when no formatted findings exist.

        Args:
            research_data: Past research data

        Returns:
            Formatted findings string, or a placeholder when nothing is
            available
        """
        findings_parts = []

        # Check various possible locations for findings
        if formatted := research_data.get("formatted_findings"):
            findings_parts.append(formatted)

        if report := research_data.get("report_content"):
            # Take first part of report only if no formatted findings
            if not findings_parts:
                findings_parts.append(report[:2000])

        if not findings_parts:
            return "No previous findings available"

        return "\n\n".join(findings_parts)

    def _extract_sources(self, research_data: Dict) -> List[Dict]:
        """
        Extract and structure sources from past research.

        Scans all known source fields, de-duplicating by URL across fields.

        Args:
            research_data: Past research data

        Returns:
            List of source dictionaries
        """
        sources = []
        seen_urls = set()

        # Check all possible source fields
        for field in ["resources", "all_links_of_system", "past_links"]:
            if field_sources := research_data.get(field, []):
                for source in field_sources:
                    if not isinstance(source, dict):
                        # Defensive: a malformed entry (e.g. a bare URL
                        # string) would raise on .get(); skip it rather
                        # than abort the whole extraction.
                        continue
                    url = source.get("url", "")
                    # Avoid duplicates by URL
                    if url and url not in seen_urls:
                        sources.append(source)
                        seen_urls.add(url)
                    elif not url:
                        # Include sources without URLs (shouldn't happen but be safe)
                        sources.append(source)

        return sources

    def _extract_entities(self, research_data: Dict) -> List[str]:
        """
        Extract key entities from past research using the LLM.

        Args:
            research_data: Past research data

        Returns:
            List of up to 10 key entities; empty when there are no findings,
            no model, or the LLM call fails (best-effort by design)
        """
        findings = self._extract_findings(research_data)

        if not findings or not self.model:
            return []

        prompt = f"""
Extract key entities (names, places, organizations, concepts) from these research findings:

{findings[:2000]}

Return up to 10 most important entities, one per line.
"""

        try:
            response = self.model.invoke(prompt)
            return self._parse_lines(response)[:10]
        except Exception as e:
            # Best-effort: entity extraction is an enrichment, not a
            # hard requirement, so log and continue with no entities.
            logger.warning(f"Failed to extract entities: {e}")
            return []

    def _create_summary(self, research_data: Dict, follow_up_query: str) -> str:
        """
        Create a targeted summary of past research relevant to the follow-up question.
        This is used internally for building context.

        Args:
            research_data: Past research data
            follow_up_query: The follow-up question

        Returns:
            Targeted summary for context building
        """
        findings = self._extract_findings(research_data)
        original_query = research_data.get("query", "")

        # For internal context, create a brief targeted summary
        return self._generate_summary(
            findings=findings,
            query=follow_up_query,
            original_query=original_query,
            max_sentences=5,
            purpose="context",
        )

    def _extract_metadata(self, research_data: Dict) -> Dict:
        """
        Extract metadata from past research.

        Args:
            research_data: Past research data

        Returns:
            Metadata dictionary (strategy, mode, creation time, raw meta)
        """
        return {
            "strategy": research_data.get("strategy", ""),
            "mode": research_data.get("mode", ""),
            "created_at": research_data.get("created_at", ""),
            "research_meta": research_data.get("research_meta", {}),
        }

    def summarize_for_followup(
        self, findings: str, query: str, max_length: int = 1000
    ) -> str:
        """
        Create a concise summary of findings for external use (e.g., in prompts).
        This creates a length-constrained summary suitable for inclusion in LLM prompts.

        Args:
            findings: Past research findings
            query: Follow-up query
            max_length: Maximum length of summary in characters

        Returns:
            Concise summary constrained to max_length
        """
        # Approximate sentence budget from the character budget, clamped to
        # at least 1 so a small max_length never requests a 0-sentence
        # summary from the model.
        return self._generate_summary(
            findings=findings,
            query=query,
            original_query=None,
            max_sentences=max(1, max_length // 100),
            purpose="prompt",
            max_length=max_length,
        )

    def _generate_summary(
        self,
        findings: str,
        query: str,
        original_query: Optional[str] = None,
        max_sentences: int = 5,
        purpose: str = "context",
        max_length: Optional[int] = None,
    ) -> str:
        """
        Shared summary generation logic.

        Args:
            findings: Research findings to summarize
            query: Follow-up query
            original_query: Original research query (optional)
            max_sentences: Maximum number of sentences
            purpose: Purpose of summary ("context" or "prompt")
            max_length: Maximum character length (optional)

        Returns:
            Generated summary; falls back to plain truncation when no model
            is available or the LLM call fails
        """
        if not findings:
            return ""

        # If findings are already short enough, return as-is
        if max_length and len(findings) <= max_length:
            return findings

        if not self.model:
            # Fallback without model: plain truncation (default 500 chars
            # when no explicit limit was requested).
            return self._truncate(findings, max_length or 500)

        # Build prompt based on purpose: the "context" flavor ties the
        # summary back to the original research question when we have it.
        if purpose == "context" and original_query:
            prompt = f"""
Create a brief summary of previous research findings that are relevant to this follow-up question:

Original research question: "{original_query}"
Follow-up question: "{query}"

Previous findings:
{findings[:3000]}

Provide a {max_sentences}-sentence summary focusing on aspects relevant to the follow-up question.
"""
        else:
            prompt = f"""
Summarize these research findings in relation to the follow-up question:

Follow-up question: "{query}"

Findings:
{findings[:4000]}

Create a summary of {max_sentences} sentences that captures the most relevant information.
"""

        try:
            response = self.model.invoke(prompt)
            summary = remove_think_tags(response.content).strip()

            # Apply length constraint if specified
            if max_length:
                summary = self._truncate(summary, max_length)

            return summary
        except Exception as e:
            logger.warning(f"Summary generation failed: {e}")
            # Fallback to truncation of the raw findings
            return self._truncate(findings, max_length or 500)

    def identify_gaps(
        self, research_data: Dict, follow_up_query: str
    ) -> List[str]:
        """
        Identify information gaps that the follow-up should address.

        Args:
            research_data: Past research data
            follow_up_query: Follow-up question

        Returns:
            List of up to 5 identified gaps; empty when there are no
            findings, no model, or the LLM call fails
        """
        findings = self._extract_findings(research_data)

        if not findings or not self.model:
            return []

        prompt = f"""
Based on the previous research and the follow-up question, identify information gaps:

Previous research findings:
{findings[:2000]}

Follow-up question: "{follow_up_query}"

What specific information is missing or needs clarification? List up to 5 gaps, one per line.
"""

        try:
            response = self.model.invoke(prompt)
            return self._parse_lines(response)[:5]
        except Exception as e:
            # Best-effort, same rationale as entity extraction.
            logger.warning(f"Failed to identify gaps: {e}")
            return []

    def format_for_settings_snapshot(
        self, context: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Format context for inclusion in settings snapshot.
        Only includes essential metadata, not actual content.

        Args:
            context: Full context dictionary

        Returns:
            Minimal metadata for settings snapshot
        """
        # Only include minimal metadata in settings snapshot
        # Settings snapshot should be for settings, not data
        return {
            "followup_metadata": {
                "parent_research_id": context.get("parent_research_id"),
                "is_followup": True,
                "has_context": bool(context.get("past_findings")),
            }
        }

    def get_relevant_context_for_llm(
        self, context: Dict[str, Any], max_tokens: int = 2000
    ) -> str:
        """
        Get a concise version of context for LLM prompts.

        Args:
            context: Full context dictionary
            max_tokens: Approximate maximum tokens

        Returns:
            Concise context string
        """
        parts = []

        # Add original and follow-up queries
        parts.append(f"Original research: {context.get('original_query', '')}")
        parts.append(
            f"Follow-up question: {context.get('follow_up_query', '')}"
        )

        # Add summary
        if summary := context.get("summary"):
            parts.append(f"\nPrevious findings summary:\n{summary}")

        # Add key entities
        if entities := context.get("key_entities"):
            parts.append(f"\nKey entities: {', '.join(entities[:5])}")

        # Add source count
        if sources := context.get("past_sources"):
            parts.append(f"\nAvailable sources: {len(sources)}")

        result = "\n".join(parts)

        # Truncate if needed (rough approximation: 4 chars per token)
        return self._truncate(result, max_tokens * 4)