Coverage for src / local_deep_research / advanced_search_system / knowledge / followup_context_manager.py: 11%

109 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

1""" 

2Follow-up Context Manager 

3 

4Manages and processes past research context for follow-up questions. 

5This is a standalone class that doesn't inherit from BaseKnowledgeGenerator 

6to avoid implementing many abstract methods. 

7""" 

8 

9from typing import Dict, List, Any, Optional 

10from loguru import logger 

11 

12from langchain_core.language_models.chat_models import BaseChatModel 

13from ...utilities.search_utilities import remove_think_tags 

14 

15 

class FollowUpContextHandler:
    """
    Manages past research context for follow-up research.

    This class handles:
    1. Loading and structuring past research data
    2. Summarizing findings for follow-up context
    3. Extracting relevant information for new searches
    4. Building comprehensive context for strategies

    Standalone by design: it deliberately does not inherit from
    BaseKnowledgeGenerator to avoid implementing many abstract methods.
    """

    def __init__(
        self, model: BaseChatModel, settings_snapshot: Optional[Dict] = None
    ):
        """
        Initialize the context manager.

        Args:
            model: Language model for processing context. May be falsy, in
                which case all LLM-backed methods fall back to truncation
                or empty results.
            settings_snapshot: Optional settings snapshot
        """
        self.model = model
        self.settings_snapshot = settings_snapshot or {}
        # Reserved cache for loaded past-research payloads; nothing in this
        # class populates it yet.
        self.past_research_cache = {}

    @staticmethod
    def _truncate(text: str, limit: int) -> str:
        """
        Truncate ``text`` to at most ``limit`` characters, marking the cut
        with a trailing "...".

        Unlike a bare ``text[:limit] + "..."``, the result never exceeds
        ``limit`` characters and text that already fits is returned unchanged.
        """
        if len(text) <= limit:
            return text
        return text[: max(0, limit - 3)] + "..."

    def _parse_lines(self, response: Any) -> List[str]:
        """
        Split an LLM response into stripped, non-empty lines.

        Shared by entity extraction and gap identification, which both ask
        the model for "one item per line" output.
        """
        cleaned = remove_think_tags(response.content).strip()
        return [line.strip() for line in cleaned.split("\n") if line.strip()]

    def build_context(
        self, research_data: Dict[str, Any], follow_up_query: str
    ) -> Dict[str, Any]:
        """
        Build comprehensive context from past research.

        Args:
            research_data: Past research data including findings, sources, etc.
            follow_up_query: The follow-up question being asked

        Returns:
            Structured context dictionary for follow-up research
        """
        logger.info(f"Building context for follow-up: {follow_up_query}")

        # Assemble every component a follow-up strategy might need; raw
        # fields (report_content, formatted_findings, links) are passed
        # through unchanged alongside the derived ones.
        return {
            "parent_research_id": research_data.get("research_id", ""),
            "original_query": research_data.get("query", ""),
            "follow_up_query": follow_up_query,
            "past_findings": self._extract_findings(research_data),
            "past_sources": self._extract_sources(research_data),
            "key_entities": self._extract_entities(research_data),
            "summary": self._create_summary(research_data, follow_up_query),
            "report_content": research_data.get("report_content", ""),
            "formatted_findings": research_data.get("formatted_findings", ""),
            "all_links_of_system": research_data.get("all_links_of_system", []),
            "metadata": self._extract_metadata(research_data),
        }

    def _extract_findings(self, research_data: Dict) -> str:
        """
        Extract and format findings from past research.

        Prefers pre-formatted findings; falls back to the first 2000
        characters of the raw report when no formatted findings exist.

        Args:
            research_data: Past research data

        Returns:
            Formatted findings string, or a placeholder when nothing is
            available
        """
        findings_parts = []

        # Check various possible locations for findings
        if formatted := research_data.get("formatted_findings"):
            findings_parts.append(formatted)

        if report := research_data.get("report_content"):
            # Take first part of report only if no formatted findings
            if not findings_parts:
                findings_parts.append(report[:2000])

        if not findings_parts:
            return "No previous findings available"

        return "\n\n".join(findings_parts)

    def _extract_sources(self, research_data: Dict) -> List[Dict]:
        """
        Extract and structure sources from past research.

        Scans all known source fields, de-duplicating by URL across fields.

        Args:
            research_data: Past research data

        Returns:
            List of source dictionaries
        """
        sources = []
        seen_urls = set()

        # Check all possible source fields
        for field in ["resources", "all_links_of_system", "past_links"]:
            if field_sources := research_data.get(field, []):
                for source in field_sources:
                    if not isinstance(source, dict):
                        # Defensive: a malformed entry (e.g. a bare URL
                        # string) would raise on .get(); skip it rather
                        # than abort the whole extraction.
                        continue
                    url = source.get("url", "")
                    # Avoid duplicates by URL
                    if url and url not in seen_urls:
                        sources.append(source)
                        seen_urls.add(url)
                    elif not url:
                        # Include sources without URLs (shouldn't happen but be safe)
                        sources.append(source)

        return sources

    def _extract_entities(self, research_data: Dict) -> List[str]:
        """
        Extract key entities from past research using the LLM.

        Args:
            research_data: Past research data

        Returns:
            List of up to 10 key entities; empty when there are no findings,
            no model, or the LLM call fails (best-effort by design)
        """
        findings = self._extract_findings(research_data)

        if not findings or not self.model:
            return []

        prompt = f"""
Extract key entities (names, places, organizations, concepts) from these research findings:

{findings[:2000]}

Return up to 10 most important entities, one per line.
"""

        try:
            response = self.model.invoke(prompt)
            return self._parse_lines(response)[:10]
        except Exception as e:
            # Best-effort: entity extraction is an enrichment, not a
            # hard requirement, so log and continue with no entities.
            logger.warning(f"Failed to extract entities: {e}")
            return []

    def _create_summary(self, research_data: Dict, follow_up_query: str) -> str:
        """
        Create a targeted summary of past research relevant to the follow-up question.
        This is used internally for building context.

        Args:
            research_data: Past research data
            follow_up_query: The follow-up question

        Returns:
            Targeted summary for context building
        """
        findings = self._extract_findings(research_data)
        original_query = research_data.get("query", "")

        # For internal context, create a brief targeted summary
        return self._generate_summary(
            findings=findings,
            query=follow_up_query,
            original_query=original_query,
            max_sentences=5,
            purpose="context",
        )

    def _extract_metadata(self, research_data: Dict) -> Dict:
        """
        Extract metadata from past research.

        Args:
            research_data: Past research data

        Returns:
            Metadata dictionary (strategy, mode, creation time, raw meta)
        """
        return {
            "strategy": research_data.get("strategy", ""),
            "mode": research_data.get("mode", ""),
            "created_at": research_data.get("created_at", ""),
            "research_meta": research_data.get("research_meta", {}),
        }

    def summarize_for_followup(
        self, findings: str, query: str, max_length: int = 1000
    ) -> str:
        """
        Create a concise summary of findings for external use (e.g., in prompts).
        This creates a length-constrained summary suitable for inclusion in LLM prompts.

        Args:
            findings: Past research findings
            query: Follow-up query
            max_length: Maximum length of summary in characters

        Returns:
            Concise summary constrained to max_length
        """
        # Approximate sentence budget from the character budget, clamped to
        # at least 1 so a small max_length never requests a 0-sentence
        # summary from the model.
        return self._generate_summary(
            findings=findings,
            query=query,
            original_query=None,
            max_sentences=max(1, max_length // 100),
            purpose="prompt",
            max_length=max_length,
        )

    def _generate_summary(
        self,
        findings: str,
        query: str,
        original_query: Optional[str] = None,
        max_sentences: int = 5,
        purpose: str = "context",
        max_length: Optional[int] = None,
    ) -> str:
        """
        Shared summary generation logic.

        Args:
            findings: Research findings to summarize
            query: Follow-up query
            original_query: Original research query (optional)
            max_sentences: Maximum number of sentences
            purpose: Purpose of summary ("context" or "prompt")
            max_length: Maximum character length (optional)

        Returns:
            Generated summary; falls back to plain truncation when no model
            is available or the LLM call fails
        """
        if not findings:
            return ""

        # If findings are already short enough, return as-is
        if max_length and len(findings) <= max_length:
            return findings

        if not self.model:
            # Fallback without model: plain truncation (default 500 chars
            # when no explicit limit was requested).
            return self._truncate(findings, max_length or 500)

        # Build prompt based on purpose: the "context" flavor ties the
        # summary back to the original research question when we have it.
        if purpose == "context" and original_query:
            prompt = f"""
Create a brief summary of previous research findings that are relevant to this follow-up question:

Original research question: "{original_query}"
Follow-up question: "{query}"

Previous findings:
{findings[:3000]}

Provide a {max_sentences}-sentence summary focusing on aspects relevant to the follow-up question.
"""
        else:
            prompt = f"""
Summarize these research findings in relation to the follow-up question:

Follow-up question: "{query}"

Findings:
{findings[:4000]}

Create a summary of {max_sentences} sentences that captures the most relevant information.
"""

        try:
            response = self.model.invoke(prompt)
            summary = remove_think_tags(response.content).strip()

            # Apply length constraint if specified
            if max_length:
                summary = self._truncate(summary, max_length)

            return summary
        except Exception as e:
            logger.warning(f"Summary generation failed: {e}")
            # Fallback to truncation of the raw findings
            return self._truncate(findings, max_length or 500)

    def identify_gaps(
        self, research_data: Dict, follow_up_query: str
    ) -> List[str]:
        """
        Identify information gaps that the follow-up should address.

        Args:
            research_data: Past research data
            follow_up_query: Follow-up question

        Returns:
            List of up to 5 identified gaps; empty when there are no
            findings, no model, or the LLM call fails
        """
        findings = self._extract_findings(research_data)

        if not findings or not self.model:
            return []

        prompt = f"""
Based on the previous research and the follow-up question, identify information gaps:

Previous research findings:
{findings[:2000]}

Follow-up question: "{follow_up_query}"

What specific information is missing or needs clarification? List up to 5 gaps, one per line.
"""

        try:
            response = self.model.invoke(prompt)
            return self._parse_lines(response)[:5]
        except Exception as e:
            # Best-effort, same rationale as entity extraction.
            logger.warning(f"Failed to identify gaps: {e}")
            return []

    def format_for_settings_snapshot(
        self, context: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Format context for inclusion in settings snapshot.
        Only includes essential metadata, not actual content.

        Args:
            context: Full context dictionary

        Returns:
            Minimal metadata for settings snapshot
        """
        # Only include minimal metadata in settings snapshot
        # Settings snapshot should be for settings, not data
        return {
            "followup_metadata": {
                "parent_research_id": context.get("parent_research_id"),
                "is_followup": True,
                "has_context": bool(context.get("past_findings")),
            }
        }

    def get_relevant_context_for_llm(
        self, context: Dict[str, Any], max_tokens: int = 2000
    ) -> str:
        """
        Get a concise version of context for LLM prompts.

        Args:
            context: Full context dictionary
            max_tokens: Approximate maximum tokens

        Returns:
            Concise context string
        """
        parts = []

        # Add original and follow-up queries
        parts.append(f"Original research: {context.get('original_query', '')}")
        parts.append(
            f"Follow-up question: {context.get('follow_up_query', '')}"
        )

        # Add summary
        if summary := context.get("summary"):
            parts.append(f"\nPrevious findings summary:\n{summary}")

        # Add key entities
        if entities := context.get("key_entities"):
            parts.append(f"\nKey entities: {', '.join(entities[:5])}")

        # Add source count
        if sources := context.get("past_sources"):
            parts.append(f"\nAvailable sources: {len(sources)}")

        result = "\n".join(parts)

        # Truncate if needed (rough approximation: 4 chars per token)
        return self._truncate(result, max_tokens * 4)