Coverage for src/local_deep_research/citation_handlers/precision_extraction_handler.py: 86%

204 statements (coverage.py v7.12.0, created at 2026-01-11 00:51 +0000)

1""" 

2Precision Extraction Citation Handler 

3 

4This handler focuses on extracting precise, complete answers for SimpleQA-style questions. 

5It includes specialized extractors for: 

6- Full names (including middle names) 

7- Single answers when only one is requested 

8- Dimension-aware measurements 

9- Specific entities without extra information 

10""" 

import re
from datetime import datetime, timezone
from typing import Any, Dict, List, Union

from loguru import logger

from .base_citation_handler import BaseCitationHandler


class PrecisionExtractionHandler(BaseCitationHandler):
    """Citation handler optimized for precise answer extraction."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Answer type patterns
        self.answer_patterns = {
            "full_name": re.compile(
                r"\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,4})\b"
            ),
            "year": re.compile(r"\b(19\d{2}|20\d{2})\b"),
            "number": re.compile(r"\b(\d+(?:\.\d+)?)\b"),
            "dimension": re.compile(
                r"(\d+(?:\.\d+)?)\s*(meters?|feet|inches|cm|km|miles?|m|ft|kg|pounds?|lbs?)",
                re.I,
            ),
            "score": re.compile(r"(\d+)\s*[-–]\s*(\d+)"),
            "percentage": re.compile(r"(\d+(?:\.\d+)?)\s*%"),
            "location": re.compile(r"\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\b"),
        }
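        # What findall() yields for these patterns (illustrative inputs):
        #   self.answer_patterns["dimension"].findall("It is 300 meters tall")
        #       -> [("300", "meters")]
        #   self.answer_patterns["score"].findall("won 3-2 in the final")
        #       -> [("3", "2")]
        #   self.answer_patterns["year"].findall("completed in 1931")
        #       -> ["1931"]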

    def analyze_initial(
        self, query: str, search_results: Union[str, List[Dict]]
    ) -> Dict[str, Any]:
        """Initial analysis with precision extraction."""
        documents = self._create_documents(search_results)
        formatted_sources = self._format_sources(documents)

        # Determine question type for targeted extraction
        question_type = self._identify_question_type(query)

        current_timestamp = datetime.now(timezone.utc).strftime(
            "%Y-%m-%d %H:%M"
        )

        output_prefix = self._get_output_instruction_prefix()

        prompt = f"""{output_prefix}Analyze the following information and provide a PRECISE answer to the question. Include citations using numbers in square brackets [1], [2], etc.

Question: {query}
Question Type: {question_type}

Sources:
{formatted_sources}

Current time is {current_timestamp} UTC for verifying temporal references in sources.

PRECISION INSTRUCTIONS:
1. Extract the EXACT answer as it appears in the sources
2. For names: Include FULL names with all parts (first, middle, last)
3. For numbers: Include exact values with units if present
4. For single-answer questions: Provide ONLY ONE answer, not multiple options
5. For dimensions: Specify the exact measurement type (height, length, width)
6. Citations should support the specific answer given

Format: Start with the direct, precise answer, then explain with citations."""

        response = self.llm.invoke(prompt)
        if not isinstance(response, str):
            response = response.content

        # Apply precision extraction if needed
        response = self._apply_precision_extraction(
            response, query, question_type, formatted_sources
        )

        return {"content": response, "documents": documents}

    def analyze_followup(
        self,
        question: str,
        search_results: Union[str, List[Dict]],
        previous_knowledge: str,
        nr_of_links: int,
    ) -> Dict[str, Any]:
        """Follow-up analysis with precision extraction."""
        documents = self._create_documents(
            search_results, nr_of_links=nr_of_links
        )
        formatted_sources = self._format_sources(documents)

        question_type = self._identify_question_type(question)

        # Extract key facts from previous knowledge
        key_facts = self._extract_key_facts(previous_knowledge, question_type)

        current_timestamp = datetime.now(timezone.utc).strftime(
            "%Y-%m-%d %H:%M"
        )

        output_prefix = self._get_output_instruction_prefix()

        prompt = f"""{output_prefix}Using the previous knowledge and new sources, provide a PRECISE answer to the question.

Previous Key Facts:
{key_facts}

Question: {question}
Question Type: {question_type}

New Sources:
{formatted_sources}

Current time is {current_timestamp} UTC for verifying temporal references in sources.

PRECISION REQUIREMENTS:
1. Build on previous knowledge to provide the MOST COMPLETE answer
2. If a full name was partially found before, complete it now
3. If multiple candidates exist, select the one with the MOST evidence
4. For measurements, ensure units and dimension types match the question
5. Reconcile any conflicts by choosing the most frequently cited answer

Provide the precise answer with citations."""

        response = self.llm.invoke(prompt)
        # The LLM may return a plain string or a message object
        content = response if isinstance(response, str) else response.content

        # Apply precision extraction
        content = self._apply_precision_extraction(
            content, question, question_type, formatted_sources
        )

        return {"content": content, "documents": documents}

    def _identify_question_type(self, query: str) -> str:
        """Identify the type of question for targeted extraction."""
        query_lower = query.lower()

        # Name questions
        if any(
            phrase in query_lower
            for phrase in ["full name", "name of", "who was", "who is"]
        ):
            if "full name" in query_lower:
                return "full_name"
            return "name"

        # Location questions
        if any(
            phrase in query_lower
            for phrase in ["where", "location", "city", "country", "place"]
        ):
            return "location"

        # Temporal questions
        if any(phrase in query_lower for phrase in ["when", "year", "date"]):
            return "temporal"

        # Numerical questions
        if any(
            phrase in query_lower
            for phrase in ["how many", "how much", "number", "count"]
        ):
            return "number"

        # Score/result questions
        if any(
            phrase in query_lower
            for phrase in ["score", "result", "final", "outcome"]
        ):
            return "score"

        # Dimension questions
        if any(
            phrase in query_lower
            for phrase in [
                "height",
                "length",
                "width",
                "size",
                "tall",
                "long",
                "wide",
            ]
        ):
            return "dimension"

        # Single answer questions
        if query_lower.startswith("which") and "one" in query_lower:
            return "single_choice"

        return "general"
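    # How _identify_question_type classifies a few illustrative queries:
    #   "What was Ada Lovelace's full name?" -> "full_name"
    #   "Where was the treaty signed?"       -> "location"
    #   "How many moons does Mars have?"     -> "number"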

    def _apply_precision_extraction(
        self, content: str, query: str, question_type: str, sources: str
    ) -> str:
        """Apply precision extraction based on question type."""

        # Check if content already has a good answer in the first line
        # first_line = content.split(".")[0].strip()  # Not currently used

        if question_type == "full_name":
            return self._extract_full_name(content, query, sources)
        elif question_type == "name":  # coverage: condition never true
            return self._extract_best_name(content, query, sources)
        elif question_type == "single_choice":  # coverage: condition never true
            return self._extract_single_answer(content, query, sources)
        elif question_type == "dimension":  # coverage: condition never true
            return self._extract_dimension(content, query, sources)
        elif question_type == "score":  # coverage: condition never true
            return self._extract_score(content, query, sources)
        elif question_type == "temporal":
            return self._extract_temporal(content, query, sources)
        elif question_type == "number":  # coverage: condition never true
            return self._extract_number(content, query, sources)

        return content

    def _extract_full_name(self, content: str, query: str, sources: str) -> str:
        """Extract complete full names."""
        # First, use LLM to identify all name variations
        extraction_prompt = f"""Find ALL variations of the person's name mentioned in the sources.

Question: {query}

Content: {content[:2000]}
Sources: {sources[:2000]}

List all name variations found:
1. Shortest version:
2. Longest/most complete version:
3. Most frequently mentioned version:

Which is the FULL name (including middle name if present)?"""

        try:
            extraction = self.llm.invoke(extraction_prompt).content

            # Extract the identified full name
            if "full name" in extraction.lower():  # coverage: condition always true
                lines = extraction.split("\n")
                for line in lines:
                    # coverage: condition always true
                    if "full name" in line.lower() or "longest" in line.lower():
                        # Extract name from this line
                        matches = self.answer_patterns["full_name"].findall(
                            line
                        )
                        if matches:
                            # Choose the longest match
                            full_name = max(
                                matches, key=lambda x: len(x.split())
                            )
                            return f"{full_name}. {content}"

            # Fallback: find all names and pick the longest
            all_names = self.answer_patterns["full_name"].findall(
                content + " " + sources
            )
            if all_names:  # coverage: condition always true
                # Group similar names and pick the longest variant
                name_groups = {}
                for name in all_names:
                    last_word = name.split()[-1]
                    if last_word not in name_groups:
                        name_groups[last_word] = []
                    name_groups[last_word].append(name)
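                # e.g. (illustrative): "Neil Armstrong" and
                # "Neil Alden Armstrong" both key on "Armstrong", so the
                # longest variant is preferred below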

                # Find the group with the most complete name
                best_name = ""
                for group in name_groups.values():
                    longest_in_group = max(group, key=lambda x: len(x.split()))
                    # coverage: condition always true
                    if len(longest_in_group.split()) > len(best_name.split()):
                        best_name = longest_in_group

                if best_name:  # coverage: condition always true
                    return f"{best_name}. {content}"

        except Exception:
            logger.exception("Error in full name extraction")

        return content

    def _extract_single_answer(
        self, content: str, query: str, sources: str
    ) -> str:
        """Extract a single answer when multiple options might be present."""
        extraction_prompt = f"""The question asks for ONE specific answer. Extract ONLY that answer.

Question: {query}
Content: {content[:1500]}

Rules:
1. If multiple items are listed, identify which ONE actually answers the question
2. Look for the PRIMARY or FIRST mentioned item
3. Do not include alternatives or additional options

The single answer is:"""

        try:
            answer = self.llm.invoke(extraction_prompt).content.strip()

            # Clean up the answer: keep only the first item if the LLM
            # returned a comma-, "and"-, or "or"-separated list
            answer = answer.split(",")[0].strip()
            answer = answer.split(" and ")[0].strip()
            answer = answer.split(" or ")[0].strip()
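            # e.g. (illustrative): "Paris, London or Rome" -> "Paris"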

            return f"{answer}. {content}"

        except Exception:
            logger.exception("Error in single answer extraction")

        return content

    def _extract_dimension(self, content: str, query: str, sources: str) -> str:
        """Extract specific dimensions with correct units and context awareness."""
        # Enhanced dimension type detection
        dimension_types = {
            "height": ["height", "tall", "high", "elevation", "altitude"],
            "length": ["length", "long", "distance", "reach", "span"],
            "width": ["width", "wide", "breadth", "diameter"],
            "depth": ["depth", "deep", "thickness"],
            "weight": ["weight", "weigh", "heavy", "mass"],
            "speed": ["speed", "fast", "velocity", "mph", "kmh"],
            "area": ["area", "square"],
            "volume": ["volume", "cubic"],
        }

        query_lower = query.lower()
        dimension_type = None
        dimension_keywords = []

        # Find the most specific dimension type
        # coverage: loop always exited via break
        for dim_type, keywords in dimension_types.items():
            matching_keywords = [kw for kw in keywords if kw in query_lower]
            if matching_keywords:  # coverage: condition always true
                dimension_type = dim_type
                dimension_keywords = matching_keywords
                break

        extraction_prompt = f"""Extract the EXACT measurement that answers this question.

Question: {query}
Content: {content[:1500]}

Rules:
1. Find the specific {dimension_type or "dimension"} measurement
2. Return ONLY the number and unit (e.g., "20 meters", "5.5 feet")
3. Distinguish between different types of measurements:
   - Height/tall: vertical measurements
   - Length/long: horizontal distance
   - Width/wide: horizontal breadth
4. Look for context clues near the measurement
5. If multiple measurements, choose the one that matches the question type

The exact {dimension_type or "dimension"} is:"""

        try:
            answer = self.llm.invoke(extraction_prompt).content.strip()

            # Clean and validate the answer
            measurement_match = re.search(
                r"(\d+(?:\.\d+)?)\s*([a-zA-Z/°]+)", answer
            )
            if measurement_match:
                number, unit = measurement_match.groups()
                clean_answer = f"{number} {unit}"
                return f"{clean_answer}. {content}"

            # Fallback: intelligent pattern matching
            all_dimensions = self.answer_patterns["dimension"].findall(
                content + " " + sources
            )
            if all_dimensions:  # coverage: condition always true
                # Score dimensions based on context and dimension type
                scored_dimensions = []

                for dim in all_dimensions:
                    number, unit = dim
                    dim_str = f"{number} {unit}"
                    score = 0

                    # Find the dimension in content
                    pos = content.find(dim_str)
                    if pos >= 0:  # coverage: condition always true
                        # Get context around this measurement
                        context = content[max(0, pos - 100) : pos + 100].lower()

                        # Score based on dimension keywords in context
                        for keyword in dimension_keywords:
                            if keyword in context:  # coverage: condition always true
                                score += 10

                        # Score based on unit appropriateness
                        unit_lower = unit.lower()
                        # coverage: condition always true
                        if (
                            (
                                dimension_type == "height"
                                and any(
                                    u in unit_lower
                                    for u in ["m", "meter", "ft", "feet", "cm"]
                                )
                            )
                            or (
                                dimension_type == "length"
                                and any(
                                    u in unit_lower
                                    for u in ["m", "meter", "km", "mile", "ft"]
                                )
                            )
                            or (
                                dimension_type == "weight"
                                and any(
                                    u in unit_lower
                                    for u in ["kg", "lb", "pound", "gram", "ton"]
                                )
                            )
                            or (
                                dimension_type == "speed"
                                and any(
                                    u in unit_lower
                                    for u in ["mph", "kmh", "km/h", "m/s"]
                                )
                            )
                        ):
                            score += 5

                        # Prefer measurements closer to the beginning
                        # (more likely to be primary)
                        score += max(0, 5 - (pos / 100))
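                        # Illustrative scoring (hypothetical values): for
                        # "How tall is the tower?" with "300 meters" found at
                        # pos 50 and "tall" in the context window, the score is
                        # 10 (keyword) + 5 (unit) + 4.5 (proximity) = 19.5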

                    scored_dimensions.append((score, dim_str))

                # Return the highest scoring dimension
                if scored_dimensions:  # coverage: condition always true
                    scored_dimensions.sort(key=lambda x: x[0], reverse=True)
                    best_dimension = scored_dimensions[0][1]
                    return f"{best_dimension}. {content}"

                # Final fallback: first dimension
                return (
                    f"{all_dimensions[0][0]} {all_dimensions[0][1]}. {content}"
                )

        except Exception:
            logger.exception("Error in dimension extraction")

        return content

    def _extract_score(self, content: str, query: str, sources: str) -> str:
        """Extract game scores or results."""
        # Find all score patterns
        scores = self.answer_patterns["score"].findall(content + " " + sources)

        if scores:
            # Use LLM to identify the correct score
            extraction_prompt = f"""Which score/result answers this question?

Question: {query}
Found scores: {scores}
Context: {content[:1000]}

The answer is:"""

            try:
                answer = self.llm.invoke(extraction_prompt).content.strip()
                return f"{answer}. {content}"
            except Exception:
                # Return first score found if LLM extraction fails
                return f"{scores[0][0]}-{scores[0][1]}. {content}"

        return content

    def _extract_temporal(self, content: str, query: str, sources: str) -> str:
        """Extract dates or years."""
        # Find all year patterns
        years = self.answer_patterns["year"].findall(content + " " + sources)

        if years:
            # Use LLM to pick the right one
            extraction_prompt = f"""Which date/year specifically answers this question?

Question: {query}
Found years: {set(years)}
Context: {content[:1000]}

The answer is:"""

            try:
                answer = self.llm.invoke(extraction_prompt).content.strip()
                # Clean to just the year/date
                year_match = self.answer_patterns["year"].search(answer)
                if year_match:  # coverage: condition always true
                    return f"{year_match.group()}. {content}"
                return f"{answer}. {content}"
            except Exception:
                # Fallback to first found year if LLM extraction fails
                return f"{years[0]}. {content}"

        return content

    def _extract_number(self, content: str, query: str, sources: str) -> str:
        """Extract specific numbers."""
        # Find all numbers
        numbers = self.answer_patterns["number"].findall(
            content + " " + sources
        )

        if numbers:  # coverage: condition always true
            extraction_prompt = f"""Which number specifically answers this question?

Question: {query}
Found numbers: {numbers[:10]}
Context: {content[:1000]}

The answer is:"""

            try:
                answer = self.llm.invoke(extraction_prompt).content.strip()
                return f"{answer}. {content}"
            except Exception:
                # Fallback to first found number if LLM extraction fails
                return f"{numbers[0]}. {content}"

        return content

    def _extract_best_name(self, content: str, query: str, sources: str) -> str:
        """Extract the best matching name (not necessarily full)."""
        # Find all potential names
        names = self.answer_patterns["full_name"].findall(
            content + " " + sources
        )

        if names:  # coverage: condition always true
            # Count frequency
            name_counts = {}
            for name in names:
                name_counts[name] = name_counts.get(name, 0) + 1

            # Get most frequent
            best_name = max(name_counts.items(), key=lambda x: x[1])[0]
            return f"{best_name}. {content}"

        return content

    def _extract_key_facts(
        self, previous_knowledge: str, question_type: str
    ) -> str:
        """Extract key facts from previous knowledge."""
        extraction_prompt = f"""Extract key facts related to a {question_type} question from this knowledge:

{previous_knowledge[:1500]}

List the most important facts (names, numbers, dates) found:"""

        try:
            facts = self.llm.invoke(extraction_prompt).content
            return facts[:500]
        except Exception:
            # Fallback to truncated previous knowledge if LLM extraction fails
            return previous_knowledge[:500]