Coverage for src/local_deep_research/advanced_search_system/knowledge/followup_context_manager.py: 97%
109 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
1"""
2Follow-up Context Manager
4Manages and processes past research context for follow-up questions.
5This is a standalone class that doesn't inherit from BaseKnowledgeGenerator
6to avoid implementing many abstract methods.
7"""
9from typing import Dict, List, Any, Optional
10from loguru import logger
12from langchain_core.language_models.chat_models import BaseChatModel
13from ...utilities.search_utilities import remove_think_tags
class FollowUpContextHandler:
    """
    Manages past research context for follow-up research.

    This class handles:
    1. Loading and structuring past research data
    2. Summarizing findings for follow-up context
    3. Extracting relevant information for new searches
    4. Building comprehensive context for strategies
    """

    # Placeholder returned by ``_extract_findings`` when the past research
    # carries no usable findings. It is a truthy string, so callers must
    # compare against it (see ``_has_findings``) instead of relying on
    # truthiness to detect "nothing available".
    NO_FINDINGS_MESSAGE = "No previous findings available"

    # Character budget used by the truncation fallback when the caller did
    # not supply an explicit ``max_length``.
    _DEFAULT_FALLBACK_CHARS = 500

    # Marker appended to text that was actually cut short.
    _ELLIPSIS = "..."

    def __init__(
        self, model: BaseChatModel, settings_snapshot: Optional[Dict] = None
    ):
        """
        Initialize the context manager.

        Args:
            model: Language model for processing context. May be falsy, in
                which case every LLM-backed step falls back to a non-LLM
                result (empty list or plain truncation).
            settings_snapshot: Optional settings snapshot
        """
        self.model = model
        self.settings_snapshot = settings_snapshot or {}
        self.past_research_cache = {}

    def build_context(
        self, research_data: Dict[str, Any], follow_up_query: str
    ) -> Dict[str, Any]:
        """
        Build comprehensive context from past research.

        Args:
            research_data: Past research data including findings, sources, etc.
            follow_up_query: The follow-up question being asked

        Returns:
            Structured context dictionary for follow-up research
        """
        logger.info(f"Building context for follow-up: {follow_up_query}")

        # Extract all components
        return {
            "parent_research_id": research_data.get("research_id", ""),
            "original_query": research_data.get("query", ""),
            "follow_up_query": follow_up_query,
            "past_findings": self._extract_findings(research_data),
            "past_sources": self._extract_sources(research_data),
            "key_entities": self._extract_entities(research_data),
            "summary": self._create_summary(research_data, follow_up_query),
            "report_content": research_data.get("report_content", ""),
            "formatted_findings": research_data.get("formatted_findings", ""),
            "all_links_of_system": research_data.get("all_links_of_system", []),
            "metadata": self._extract_metadata(research_data),
        }

    def _has_findings(self, findings: Optional[str]) -> bool:
        """Return True when *findings* is real content, not empty or the placeholder."""
        return bool(findings) and findings != self.NO_FINDINGS_MESSAGE

    def _truncate(self, text: str, limit: int) -> str:
        """
        Cut *text* to *limit* characters, marking the cut with an ellipsis.

        Text that already fits is returned unchanged (no marker). When a cut
        happens the marker is appended after the cut, so the result may be up
        to ``len("...")`` characters longer than *limit*.
        """
        if len(text) <= limit:
            return text
        return text[:limit] + self._ELLIPSIS

    def _ask_for_lines(
        self, prompt: str, limit: int, failure_message: str
    ) -> List[str]:
        """
        Send *prompt* to the model and return its answer as non-empty lines.

        Args:
            prompt: Prompt text passed to ``self.model.invoke``
            limit: Maximum number of lines to return
            failure_message: Warning logged when the call or parsing fails

        Returns:
            Up to *limit* stripped, non-empty lines; an empty list on failure
        """
        try:
            response = self.model.invoke(prompt)
            text = remove_think_tags(response.content).strip()
            lines = [
                stripped
                for stripped in (raw.strip() for raw in text.split("\n"))
                if stripped
            ]
            return lines[:limit]
        except Exception:
            # Best effort: this enrichment is optional, so a model failure
            # degrades to "nothing extracted" instead of aborting the caller.
            logger.warning(failure_message)
            return []

    def _extract_findings(self, research_data: Dict) -> str:
        """
        Extract and format findings from past research.

        The first available of these is used: ``formatted_findings``, the
        first 2000 characters of ``report_content``, then ``past_findings``.

        Args:
            research_data: Past research data

        Returns:
            Findings string, or ``NO_FINDINGS_MESSAGE`` when none is available
        """
        if formatted := research_data.get("formatted_findings"):
            return formatted

        if report := research_data.get("report_content"):
            # Only the head of the report is used as a stand-in for findings
            return report[:2000]

        # Multi-turn chat supplies its condensed prior findings under the
        # "past_findings" key (it has no formatted_findings/report_content
        # of its own). Honor it before declaring nothing available,
        # otherwise the chat-built summary is silently dropped.
        if past_findings := research_data.get("past_findings"):
            return past_findings

        return self.NO_FINDINGS_MESSAGE

    def _extract_sources(self, research_data: Dict) -> List[Dict]:
        """
        Extract and structure sources from past research.

        Sources are collected from ``resources``, ``all_links_of_system`` and
        ``past_links`` (in that order) and de-duplicated by their ``url``.

        Args:
            research_data: Past research data

        Returns:
            List of source dictionaries
        """
        sources: List[Dict] = []
        seen_urls = set()

        for field in ("resources", "all_links_of_system", "past_links"):
            for source in research_data.get(field) or []:
                if not isinstance(source, dict):
                    # Malformed entry: skip it rather than crash on .get()
                    continue

                url = source.get("url", "")
                if url:
                    if url in seen_urls:
                        continue
                    seen_urls.add(url)
                # Sources without a URL cannot be de-duplicated; keep them all
                sources.append(source)

        return sources

    def _extract_entities(self, research_data: Dict) -> List[str]:
        """
        Extract key entities from past research.

        Args:
            research_data: Past research data

        Returns:
            List of up to 10 key entities; empty when there is no model, no
            real findings, or the model call fails
        """
        findings = self._extract_findings(research_data)

        # The placeholder text is truthy, so check for it explicitly; asking
        # the model to extract entities from it would only produce noise.
        if not self.model or not self._has_findings(findings):
            return []

        prompt = f"""
Extract key entities (names, places, organizations, concepts) from these research findings:

{findings[:2000]}

Return up to 10 most important entities, one per line.
"""

        return self._ask_for_lines(
            prompt, limit=10, failure_message="Failed to extract entities"
        )

    def _create_summary(self, research_data: Dict, follow_up_query: str) -> str:
        """
        Create a targeted summary of past research relevant to the follow-up question.
        This is used internally for building context.

        Args:
            research_data: Past research data
            follow_up_query: The follow-up question

        Returns:
            Targeted summary for context building, or the
            ``NO_FINDINGS_MESSAGE`` placeholder when there is nothing to
            summarize
        """
        findings = self._extract_findings(research_data)

        if not self._has_findings(findings):
            # Nothing to summarize: pass the placeholder through untouched
            # instead of spending a model call on it.
            return findings

        # For internal context, create a brief targeted summary
        return self._generate_summary(
            findings=findings,
            query=follow_up_query,
            original_query=research_data.get("query", ""),
            max_sentences=5,
            purpose="context",
        )

    def _extract_metadata(self, research_data: Dict) -> Dict:
        """
        Extract metadata from past research.

        Args:
            research_data: Past research data

        Returns:
            Metadata dictionary
        """
        return {
            "strategy": research_data.get("strategy", ""),
            "mode": research_data.get("mode", ""),
            "created_at": research_data.get("created_at", ""),
            "research_meta": research_data.get("research_meta", {}),
        }

    def summarize_for_followup(
        self, findings: str, query: str, max_length: int = 1000
    ) -> str:
        """
        Create a concise summary of findings for external use (e.g., in prompts).
        This creates a length-constrained summary suitable for inclusion in LLM prompts.

        Args:
            findings: Past research findings
            query: Follow-up query
            max_length: Maximum length of summary in characters

        Returns:
            Findings unchanged when they already fit in max_length; otherwise
            a summary cut at max_length characters (a cut is marked with a
            trailing "...", which is appended after the cut)
        """
        # Roughly one sentence per 100 characters, but never fewer than one:
        # a small max_length must not turn into a "0 sentences" request.
        sentence_budget = max(1, max_length // 100)

        # Use the shared summary generation with specific parameters for external use
        return self._generate_summary(
            findings=findings,
            query=query,
            original_query=None,
            max_sentences=sentence_budget,
            purpose="prompt",
            max_length=max_length,
        )

    def _generate_summary(
        self,
        findings: str,
        query: str,
        original_query: Optional[str] = None,
        max_sentences: int = 5,
        purpose: str = "context",
        max_length: Optional[int] = None,
    ) -> str:
        """
        Shared summary generation logic.

        Args:
            findings: Research findings to summarize
            query: Follow-up query
            original_query: Original research query (optional)
            max_sentences: Maximum number of sentences
            purpose: Purpose of summary ("context" or "prompt")
            max_length: Maximum character length (optional)

        Returns:
            Generated summary. Without a model, or when the model call fails,
            the findings themselves are returned, truncated to max_length
            (or 500 characters when max_length is not given).
        """
        if not findings:
            return ""

        # If findings are already short enough, return as-is
        if max_length and len(findings) <= max_length:
            return findings

        fallback_limit = max_length or self._DEFAULT_FALLBACK_CHARS

        if not self.model:
            # Fallback without model
            return self._truncate(findings, fallback_limit)

        # Build prompt based on purpose
        if purpose == "context" and original_query:
            prompt = f"""
Create a brief summary of previous research findings that are relevant to this follow-up question:

Original research question: "{original_query}"
Follow-up question: "{query}"

Previous findings:
{findings[:3000]}

Provide a {max_sentences}-sentence summary focusing on aspects relevant to the follow-up question.
"""
        else:
            prompt = f"""
Summarize these research findings in relation to the follow-up question:

Follow-up question: "{query}"

Findings:
{findings[:4000]}

Create a summary of {max_sentences} sentences that captures the most relevant information.
"""

        try:
            response = self.model.invoke(prompt)
            summary = remove_think_tags(response.content).strip()
        except Exception:
            logger.warning("Summary generation failed")
            # Fallback to truncation
            return self._truncate(findings, fallback_limit)

        # Apply length constraint if specified
        if max_length:
            summary = self._truncate(summary, max_length)

        return summary

    def identify_gaps(
        self, research_data: Dict, follow_up_query: str
    ) -> List[str]:
        """
        Identify information gaps that the follow-up should address.

        Args:
            research_data: Past research data
            follow_up_query: Follow-up question

        Returns:
            List of up to 5 identified gaps; empty when there is no model, no
            real findings, or the model call fails
        """
        findings = self._extract_findings(research_data)

        # Same guard as entity extraction: the placeholder is not findings.
        if not self.model or not self._has_findings(findings):
            return []

        prompt = f"""
Based on the previous research and the follow-up question, identify information gaps:

Previous research findings:
{findings[:2000]}

Follow-up question: "{follow_up_query}"

What specific information is missing or needs clarification? List up to 5 gaps, one per line.
"""

        return self._ask_for_lines(
            prompt, limit=5, failure_message="Failed to identify gaps"
        )

    def format_for_settings_snapshot(
        self, context: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Format context for inclusion in settings snapshot.
        Only includes essential metadata, not actual content.

        Args:
            context: Full context dictionary

        Returns:
            Minimal metadata for settings snapshot
        """
        # Only include minimal metadata in settings snapshot
        # Settings snapshot should be for settings, not data
        return {
            "followup_metadata": {
                "parent_research_id": context.get("parent_research_id"),
                "is_followup": True,
                # The "no findings" placeholder does not count as context
                "has_context": self._has_findings(
                    context.get("past_findings")
                ),
            }
        }

    def get_relevant_context_for_llm(
        self, context: Dict[str, Any], max_tokens: int = 2000
    ) -> str:
        """
        Get a concise version of context for LLM prompts.

        Args:
            context: Full context dictionary
            max_tokens: Approximate maximum tokens

        Returns:
            Concise context string
        """
        # Original and follow-up queries always lead the context
        parts = [
            f"Original research: {context.get('original_query', '')}",
            f"Follow-up question: {context.get('follow_up_query', '')}",
        ]

        # Optional sections are added only when they have content
        if summary := context.get("summary"):
            parts.append(f"\nPrevious findings summary:\n{summary}")

        if entities := context.get("key_entities"):
            parts.append(f"\nKey entities: {', '.join(entities[:5])}")

        if sources := context.get("past_sources"):
            parts.append(f"\nAvailable sources: {len(sources)}")

        # Truncate if needed (rough approximation: 4 chars per token)
        return self._truncate("\n".join(parts), max_tokens * 4)