Coverage for src / local_deep_research / citation_handlers / precision_extraction_handler.py: 97%

204 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1""" 

2Precision Extraction Citation Handler 

3 

4This handler focuses on extracting precise, complete answers for SimpleQA-style questions. 

5It includes specialized extractors for: 

6- Full names (including middle names) 

7- Single answers when only one is requested 

8- Dimension-aware measurements 

9- Specific entities without extra information 

10""" 

11 

12import re 

13from datetime import datetime, timezone 

14from typing import Any, Dict, List, Union 

15 

16from loguru import logger 

17 

18from .base_citation_handler import BaseCitationHandler 

19 

20 

class PrecisionExtractionHandler(BaseCitationHandler):
    """Citation handler optimized for precise answer extraction.

    Targets SimpleQA-style questions: answers from the LLM are
    post-processed by type-specific extractors (full names, years,
    dimensions, scores, ...) selected via _identify_question_type.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Compiled regexes used by the _extract_* helpers, keyed by the
        # answer type they recognise.
        self.answer_patterns = {
            # Runs of 2-5 capitalised words, e.g. "John Michael Smith".
            "full_name": re.compile(
                r"\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,4})\b"
            ),
            # Four-digit years 1900-2099.
            "year": re.compile(r"\b(19\d{2}|20\d{2})\b"),
            # Plain integers or decimals.
            "number": re.compile(r"\b(\d+(?:\.\d+)?)\b"),
            # A value followed by a measurement unit (case-insensitive).
            "dimension": re.compile(
                r"(\d+(?:\.\d+)?)\s*(meters?|feet|inches|cm|km|miles?|m|ft|kg|pounds?|lbs?)",
                re.I,
            ),
            # Scores such as "3-2" (hyphen or en dash).
            "score": re.compile(r"(\d+)\s*[-–]\s*(\d+)"),
            "percentage": re.compile(r"(\d+(?:\.\d+)?)\s*%"),
            # Runs of 1+ capitalised words (place names etc.).
            "location": re.compile(r"\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\b"),
        }

42 

43 def analyze_initial( 

44 self, query: str, search_results: Union[str, List[Dict]] 

45 ) -> Dict[str, Any]: 

46 """Initial analysis with precision extraction.""" 

47 documents = self._create_documents(search_results) 

48 formatted_sources = self._format_sources(documents) 

49 

50 # Determine question type for targeted extraction 

51 question_type = self._identify_question_type(query) 

52 

53 current_timestamp = datetime.now(timezone.utc).strftime( 

54 "%Y-%m-%d %H:%M" 

55 ) 

56 

57 output_prefix = self._get_output_instruction_prefix() 

58 

59 prompt = f"""{output_prefix}Analyze the following information and provide a PRECISE answer to the question. Include citations using numbers in square brackets [1], [2], etc. 

60 

61Question: {query} 

62Question Type: {question_type} 

63 

64Sources: 

65{formatted_sources} 

66 

67Current time is {current_timestamp} UTC for verifying temporal references in sources. 

68 

69PRECISION INSTRUCTIONS: 

701. Extract the EXACT answer as it appears in the sources 

712. For names: Include FULL names with all parts (first, middle, last) 

723. For numbers: Include exact values with units if present 

734. For single-answer questions: Provide ONLY ONE answer, not multiple options 

745. For dimensions: Specify the exact measurement type (height, length, width) 

756. Citations should support the specific answer given 

76 

77Format: Start with the direct, precise answer, then explain with citations.""" 

78 

79 response = self.llm.invoke(prompt) 

80 if not isinstance(response, str): 

81 response = response.content 

82 

83 # Apply precision extraction if needed 

84 response = self._apply_precision_extraction( 

85 response, query, question_type, formatted_sources 

86 ) 

87 

88 return {"content": response, "documents": documents} 

89 

90 def analyze_followup( 

91 self, 

92 question: str, 

93 search_results: Union[str, List[Dict]], 

94 previous_knowledge: str, 

95 nr_of_links: int, 

96 ) -> Dict[str, Any]: 

97 """Follow-up analysis with precision extraction.""" 

98 documents = self._create_documents( 

99 search_results, nr_of_links=nr_of_links 

100 ) 

101 formatted_sources = self._format_sources(documents) 

102 

103 question_type = self._identify_question_type(question) 

104 

105 # Extract key facts from previous knowledge 

106 key_facts = self._extract_key_facts(previous_knowledge, question_type) 

107 

108 current_timestamp = datetime.now(timezone.utc).strftime( 

109 "%Y-%m-%d %H:%M" 

110 ) 

111 

112 output_prefix = self._get_output_instruction_prefix() 

113 

114 prompt = f"""{output_prefix}Using the previous knowledge and new sources, provide a PRECISE answer to the question. 

115 

116Previous Key Facts: 

117{key_facts} 

118 

119Question: {question} 

120Question Type: {question_type} 

121 

122New Sources: 

123{formatted_sources} 

124 

125Current time is {current_timestamp} UTC for verifying temporal references in sources. 

126 

127PRECISION REQUIREMENTS: 

1281. Build on previous knowledge to provide the MOST COMPLETE answer 

1292. If a full name was partially found before, complete it now 

1303. If multiple candidates exist, select the one with the MOST evidence 

1314. For measurements, ensure units and dimension types match the question 

1325. Reconcile any conflicts by choosing the most frequently cited answer 

133 

134Provide the precise answer with citations.""" 

135 

136 response = self.llm.invoke(prompt) 

137 content = response.content 

138 

139 # Apply precision extraction 

140 content = self._apply_precision_extraction( 

141 content, question, question_type, formatted_sources 

142 ) 

143 

144 return {"content": content, "documents": documents} 

145 

146 def _identify_question_type(self, query: str) -> str: 

147 """Identify the type of question for targeted extraction.""" 

148 query_lower = query.lower() 

149 

150 # Name questions 

151 if any( 

152 phrase in query_lower 

153 for phrase in ["full name", "name of", "who was", "who is"] 

154 ): 

155 if "full name" in query_lower: 

156 return "full_name" 

157 return "name" 

158 

159 # Location questions 

160 if any( 

161 phrase in query_lower 

162 for phrase in ["where", "location", "city", "country", "place"] 

163 ): 

164 return "location" 

165 

166 # Temporal questions 

167 if any(phrase in query_lower for phrase in ["when", "year", "date"]): 

168 return "temporal" 

169 

170 # Numerical questions 

171 if any( 

172 phrase in query_lower 

173 for phrase in ["how many", "how much", "number", "count"] 

174 ): 

175 return "number" 

176 

177 # Score/result questions 

178 if any( 

179 phrase in query_lower 

180 for phrase in ["score", "result", "final", "outcome"] 

181 ): 

182 return "score" 

183 

184 # Dimension questions 

185 if any( 

186 phrase in query_lower 

187 for phrase in [ 

188 "height", 

189 "length", 

190 "width", 

191 "size", 

192 "tall", 

193 "long", 

194 "wide", 

195 ] 

196 ): 

197 return "dimension" 

198 

199 # Single answer questions 

200 if query_lower.startswith("which") and "one" in query_lower: 

201 return "single_choice" 

202 

203 return "general" 

204 

205 def _apply_precision_extraction( 

206 self, content: str, query: str, question_type: str, sources: str 

207 ) -> str: 

208 """Apply precision extraction based on question type.""" 

209 

210 if question_type == "full_name": 

211 return self._extract_full_name(content, query, sources) 

212 if question_type == "name": 

213 return self._extract_best_name(content, query, sources) 

214 if question_type == "single_choice": 

215 return self._extract_single_answer(content, query, sources) 

216 if question_type == "dimension": 

217 return self._extract_dimension(content, query, sources) 

218 if question_type == "score": 

219 return self._extract_score(content, query, sources) 

220 if question_type == "temporal": 

221 return self._extract_temporal(content, query, sources) 

222 if question_type == "number": 

223 return self._extract_number(content, query, sources) 

224 

225 return content 

226 

227 def _extract_full_name(self, content: str, query: str, sources: str) -> str: 

228 """Extract complete full names.""" 

229 # First, use LLM to identify all name variations 

230 extraction_prompt = f"""Find ALL variations of the person's name mentioned in the sources. 

231 

232Question: {query} 

233 

234Content: {content[:2000]} 

235Sources: {sources[:2000]} 

236 

237List all name variations found: 

2381. Shortest version: 

2392. Longest/most complete version: 

2403. Most frequently mentioned version: 

241 

242Which is the FULL name (including middle name if present)?""" 

243 

244 try: 

245 extraction = self.llm.invoke(extraction_prompt).content 

246 

247 # Extract the identified full name 

248 if "full name" in extraction.lower(): 248 ↛ 264line 248 didn't jump to line 264 because the condition on line 248 was always true

249 lines = extraction.split("\n") 

250 for line in lines: 

251 if "full name" in line.lower() or "longest" in line.lower(): 251 ↛ 250line 251 didn't jump to line 250 because the condition on line 251 was always true

252 # Extract name from this line 

253 matches = self.answer_patterns["full_name"].findall( 

254 line 

255 ) 

256 if matches: 

257 # Choose the longest match 

258 full_name = max( 

259 matches, key=lambda x: len(x.split()) 

260 ) 

261 return f"{full_name}. {content}" 

262 

263 # Fallback: find all names and pick the longest 

264 all_names = self.answer_patterns["full_name"].findall( 

265 content + " " + sources 

266 ) 

267 if all_names: 267 ↛ 289line 267 didn't jump to line 289 because the condition on line 267 was always true

268 # Group similar names and pick the longest variant 

269 name_groups: Dict[str, List[str]] = {} 

270 for name in all_names: 

271 last_word = name.split()[-1] 

272 if last_word not in name_groups: 

273 name_groups[last_word] = [] 

274 name_groups[last_word].append(name) 

275 

276 # Find the group with the most complete name 

277 best_name = "" 

278 for group in name_groups.values(): 

279 longest_in_group = max(group, key=lambda x: len(x.split())) 

280 if len(longest_in_group.split()) > len(best_name.split()): 280 ↛ 278line 280 didn't jump to line 278 because the condition on line 280 was always true

281 best_name = longest_in_group 

282 

283 if best_name: 283 ↛ 289line 283 didn't jump to line 289 because the condition on line 283 was always true

284 return f"{best_name}. {content}" 

285 

286 except Exception: 

287 logger.exception("Error in full name extraction") 

288 

289 return content 

290 

291 def _extract_single_answer( 

292 self, content: str, query: str, sources: str 

293 ) -> str: 

294 """Extract a single answer when multiple options might be present.""" 

295 extraction_prompt = f"""The question asks for ONE specific answer. Extract ONLY that answer. 

296 

297Question: {query} 

298Content: {content[:1500]} 

299 

300Rules: 

3011. If multiple items are listed, identify which ONE actually answers the question 

3022. Look for the PRIMARY or FIRST mentioned item 

3033. Do not include alternatives or additional options 

304 

305The single answer is:""" 

306 

307 try: 

308 answer = self.llm.invoke(extraction_prompt).content.strip() 

309 

310 # Clean up the answer 

311 answer = answer.split(",")[ 

312 0 

313 ].strip() # Take only first if comma-separated 

314 answer = answer.split(" and ")[ 

315 0 

316 ].strip() # Take only first if "and"-separated 

317 answer = answer.split(" or ")[ 

318 0 

319 ].strip() # Take only first if "or"-separated 

320 

321 return f"{answer}. {content}" 

322 

323 except Exception: 

324 logger.exception("Error in single answer extraction") 

325 

326 return content 

327 

328 def _extract_dimension(self, content: str, query: str, sources: str) -> str: 

329 """Extract specific dimensions with correct units and context awareness.""" 

330 # Enhanced dimension type detection 

331 dimension_types = { 

332 "height": ["height", "tall", "high", "elevation", "altitude"], 

333 "length": ["length", "long", "distance", "reach", "span"], 

334 "width": ["width", "wide", "breadth", "diameter"], 

335 "depth": ["depth", "deep", "thickness"], 

336 "weight": ["weight", "weigh", "heavy", "mass"], 

337 "speed": ["speed", "fast", "velocity", "mph", "kmh"], 

338 "area": ["area", "square"], 

339 "volume": ["volume", "cubic"], 

340 } 

341 

342 query_lower = query.lower() 

343 dimension_type = None 

344 dimension_keywords = [] 

345 

346 # Find the most specific dimension type 

347 for dim_type, keywords in dimension_types.items(): 

348 matching_keywords = [kw for kw in keywords if kw in query_lower] 

349 if matching_keywords: 

350 dimension_type = dim_type 

351 dimension_keywords = matching_keywords 

352 break 

353 

354 extraction_prompt = f"""Extract the EXACT measurement that answers this question. 

355 

356Question: {query} 

357Content: {content[:1500]} 

358 

359Rules: 

3601. Find the specific {dimension_type or "dimension"} measurement 

3612. Return ONLY the number and unit (e.g., "20 meters", "5.5 feet") 

3623. Distinguish between different types of measurements: 

363 - Height/tall: vertical measurements 

364 - Length/long: horizontal distance 

365 - Width/wide: horizontal breadth 

3664. Look for context clues near the measurement 

3675. If multiple measurements, choose the one that matches the question type 

368 

369The exact {dimension_type or "dimension"} is:""" 

370 

371 try: 

372 answer = self.llm.invoke(extraction_prompt).content.strip() 

373 

374 # Clean and validate the answer 

375 import re 

376 

377 measurement_match = re.search( 

378 r"(\d+(?:\.\d+)?)\s*([a-zA-Z/°]+)", answer 

379 ) 

380 if measurement_match: 

381 number, unit = measurement_match.groups() 

382 clean_answer = f"{number} {unit}" 

383 return f"{clean_answer}. {content}" 

384 

385 # Fallback: intelligent pattern matching 

386 all_dimensions = self.answer_patterns["dimension"].findall( 

387 content + " " + sources 

388 ) 

389 if all_dimensions: 

390 # Score dimensions based on context and dimension type 

391 scored_dimensions = [] 

392 

393 for dim in all_dimensions: 

394 number, unit = dim 

395 dim_str = f"{number} {unit}" 

396 score = 0 

397 

398 # Find the dimension in content 

399 pos = content.find(dim_str) 

400 if pos >= 0: 400 ↛ 452line 400 didn't jump to line 452 because the condition on line 400 was always true

401 # Get context around this measurement 

402 context = content[max(0, pos - 100) : pos + 100].lower() 

403 

404 # Score based on dimension keywords in context 

405 for keyword in dimension_keywords: 

406 if keyword in context: 

407 score += 10 

408 

409 # Score based on unit appropriateness 

410 unit_lower = unit.lower() 

411 if ( 

412 ( 

413 dimension_type == "height" 

414 and any( 

415 u in unit_lower 

416 for u in ["m", "meter", "ft", "feet", "cm"] 

417 ) 

418 ) 

419 or ( 

420 dimension_type == "length" 

421 and any( 

422 u in unit_lower 

423 for u in ["m", "meter", "km", "mile", "ft"] 

424 ) 

425 ) 

426 or ( 

427 dimension_type == "weight" 

428 and any( 

429 u in unit_lower 

430 for u in [ 

431 "kg", 

432 "lb", 

433 "pound", 

434 "gram", 

435 "ton", 

436 ] 

437 ) 

438 ) 

439 or ( 

440 dimension_type == "speed" 

441 and any( 

442 u in unit_lower 

443 for u in ["mph", "kmh", "km/h", "m/s"] 

444 ) 

445 ) 

446 ): 

447 score += 5 

448 

449 # Prefer measurements closer to the beginning (more likely to be primary) 

450 score += max(0, 5 - (pos // 100)) 

451 

452 scored_dimensions.append((score, dim_str)) 

453 

454 # Return the highest scoring dimension 

455 if scored_dimensions: 455 ↛ 461line 455 didn't jump to line 461 because the condition on line 455 was always true

456 scored_dimensions.sort(key=lambda x: x[0], reverse=True) 

457 best_dimension = scored_dimensions[0][1] 

458 return f"{best_dimension}. {content}" 

459 

460 # Final fallback: first dimension 

461 return ( 

462 f"{all_dimensions[0][0]} {all_dimensions[0][1]}. {content}" 

463 ) 

464 

465 except Exception: 

466 logger.exception("Error in dimension extraction") 

467 

468 return content 

469 

470 def _extract_score(self, content: str, query: str, sources: str) -> str: 

471 """Extract game scores or results.""" 

472 # Find all score patterns 

473 scores = self.answer_patterns["score"].findall(content + " " + sources) 

474 

475 if scores: 

476 # Use LLM to identify the correct score 

477 extraction_prompt = f"""Which score/result answers this question? 

478 

479Question: {query} 

480Found scores: {scores} 

481Context: {content[:1000]} 

482 

483The answer is:""" 

484 

485 try: 

486 answer = self.llm.invoke(extraction_prompt).content.strip() 

487 return f"{answer}. {content}" 

488 except Exception: 

489 # Return first score found if LLM extraction fails 

490 return f"{scores[0][0]}-{scores[0][1]}. {content}" 

491 

492 return content 

493 

494 def _extract_temporal(self, content: str, query: str, sources: str) -> str: 

495 """Extract dates or years.""" 

496 # Find all year patterns 

497 years = self.answer_patterns["year"].findall(content + " " + sources) 

498 

499 if years: 

500 # Use LLM to pick the right one 

501 extraction_prompt = f"""Which date/year specifically answers this question? 

502 

503Question: {query} 

504Found years: {set(years)} 

505Context: {content[:1000]} 

506 

507The answer is:""" 

508 

509 try: 

510 answer = self.llm.invoke(extraction_prompt).content.strip() 

511 # Clean to just the year/date 

512 year_match = self.answer_patterns["year"].search(answer) 

513 if year_match: 

514 return f"{year_match.group()}. {content}" 

515 return f"{answer}. {content}" 

516 except Exception: 

517 # Fallback to first found year if LLM extraction fails 

518 return f"{years[0]}. {content}" 

519 

520 return content 

521 

522 def _extract_number(self, content: str, query: str, sources: str) -> str: 

523 """Extract specific numbers.""" 

524 # Find all numbers 

525 numbers = self.answer_patterns["number"].findall( 

526 content + " " + sources 

527 ) 

528 

529 if numbers: 

530 extraction_prompt = f"""Which number specifically answers this question? 

531 

532Question: {query} 

533Found numbers: {numbers[:10]} 

534Context: {content[:1000]} 

535 

536The answer is:""" 

537 

538 try: 

539 answer = self.llm.invoke(extraction_prompt).content.strip() 

540 return f"{answer}. {content}" 

541 except Exception: 

542 # Fallback to first found number if LLM extraction fails 

543 return f"{numbers[0]}. {content}" 

544 

545 return content 

546 

547 def _extract_best_name(self, content: str, query: str, sources: str) -> str: 

548 """Extract the best matching name (not necessarily full).""" 

549 # Find all potential names 

550 names = self.answer_patterns["full_name"].findall( 

551 content + " " + sources 

552 ) 

553 

554 if names: 

555 # Count frequency 

556 name_counts: Dict[str, int] = {} 

557 for name in names: 

558 name_counts[name] = name_counts.get(name, 0) + 1 

559 

560 # Get most frequent 

561 best_name = max(name_counts.items(), key=lambda x: x[1])[0] 

562 return f"{best_name}. {content}" 

563 

564 return content 

565 

566 def _extract_key_facts( 

567 self, previous_knowledge: str, question_type: str 

568 ) -> str: 

569 """Extract key facts from previous knowledge.""" 

570 extraction_prompt = f"""Extract key facts related to a {question_type} question from this knowledge: 

571 

572{previous_knowledge[:1500]} 

573 

574List the most important facts (names, numbers, dates) found:""" 

575 

576 try: 

577 facts = str(self.llm.invoke(extraction_prompt).content) 

578 return facts[:500] 

579 except Exception: 

580 # Fallback to truncated previous knowledge if LLM extraction fails 

581 return previous_knowledge[:500]