Coverage for src/local_deep_research/citation_handlers/precision_extraction_handler.py: 97%
204 statements
« prev ^ index » next — coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
1"""
2Precision Extraction Citation Handler
4This handler focuses on extracting precise, complete answers for SimpleQA-style questions.
5It includes specialized extractors for:
6- Full names (including middle names)
7- Single answers when only one is requested
8- Dimension-aware measurements
9- Specific entities without extra information
10"""
12import re
13from datetime import datetime, timezone
14from typing import Any, Dict, List, Union
16from loguru import logger
18from .base_citation_handler import BaseCitationHandler
class PrecisionExtractionHandler(BaseCitationHandler):
    """Citation handler optimized for precise answer extraction."""

    def __init__(self, *args, **kwargs):
        """Initialize the handler and pre-compile the answer-type regexes."""
        super().__init__(*args, **kwargs)

        # Pre-compiled patterns keyed by answer type; shared by the
        # _extract_* helpers so nothing is recompiled per call.
        self.answer_patterns = {
            # Two to five consecutive capitalized words,
            # e.g. "John Fitzgerald Kennedy".
            "full_name": re.compile(
                r"\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,4})\b"
            ),
            # Four-digit years 1900-2099.
            "year": re.compile(r"\b(19\d{2}|20\d{2})\b"),
            # Integers or decimals.
            "number": re.compile(r"\b(\d+(?:\.\d+)?)\b"),
            # Number followed by a common measurement unit.
            "dimension": re.compile(
                r"(\d+(?:\.\d+)?)\s*(meters?|feet|inches|cm|km|miles?|m|ft|kg|pounds?|lbs?)",
                re.I,
            ),
            # Game-style results such as "3-2" (hyphen or en dash).
            "score": re.compile(r"(\d+)\s*[-–]\s*(\d+)"),
            "percentage": re.compile(r"(\d+(?:\.\d+)?)\s*%"),
            # One or more capitalized words (place names).
            "location": re.compile(r"\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\b"),
        }
43 def analyze_initial(
44 self, query: str, search_results: Union[str, List[Dict]]
45 ) -> Dict[str, Any]:
46 """Initial analysis with precision extraction."""
47 documents = self._create_documents(search_results)
48 formatted_sources = self._format_sources(documents)
50 # Determine question type for targeted extraction
51 question_type = self._identify_question_type(query)
53 current_timestamp = datetime.now(timezone.utc).strftime(
54 "%Y-%m-%d %H:%M"
55 )
57 output_prefix = self._get_output_instruction_prefix()
59 prompt = f"""{output_prefix}Analyze the following information and provide a PRECISE answer to the question. Include citations using numbers in square brackets [1], [2], etc.
61Question: {query}
62Question Type: {question_type}
64Sources:
65{formatted_sources}
67Current time is {current_timestamp} UTC for verifying temporal references in sources.
69PRECISION INSTRUCTIONS:
701. Extract the EXACT answer as it appears in the sources
712. For names: Include FULL names with all parts (first, middle, last)
723. For numbers: Include exact values with units if present
734. For single-answer questions: Provide ONLY ONE answer, not multiple options
745. For dimensions: Specify the exact measurement type (height, length, width)
756. Citations should support the specific answer given
77Format: Start with the direct, precise answer, then explain with citations."""
79 response = self.llm.invoke(prompt)
80 if not isinstance(response, str):
81 response = response.content
83 # Apply precision extraction if needed
84 response = self._apply_precision_extraction(
85 response, query, question_type, formatted_sources
86 )
88 return {"content": response, "documents": documents}
90 def analyze_followup(
91 self,
92 question: str,
93 search_results: Union[str, List[Dict]],
94 previous_knowledge: str,
95 nr_of_links: int,
96 ) -> Dict[str, Any]:
97 """Follow-up analysis with precision extraction."""
98 documents = self._create_documents(
99 search_results, nr_of_links=nr_of_links
100 )
101 formatted_sources = self._format_sources(documents)
103 question_type = self._identify_question_type(question)
105 # Extract key facts from previous knowledge
106 key_facts = self._extract_key_facts(previous_knowledge, question_type)
108 current_timestamp = datetime.now(timezone.utc).strftime(
109 "%Y-%m-%d %H:%M"
110 )
112 output_prefix = self._get_output_instruction_prefix()
114 prompt = f"""{output_prefix}Using the previous knowledge and new sources, provide a PRECISE answer to the question.
116Previous Key Facts:
117{key_facts}
119Question: {question}
120Question Type: {question_type}
122New Sources:
123{formatted_sources}
125Current time is {current_timestamp} UTC for verifying temporal references in sources.
127PRECISION REQUIREMENTS:
1281. Build on previous knowledge to provide the MOST COMPLETE answer
1292. If a full name was partially found before, complete it now
1303. If multiple candidates exist, select the one with the MOST evidence
1314. For measurements, ensure units and dimension types match the question
1325. Reconcile any conflicts by choosing the most frequently cited answer
134Provide the precise answer with citations."""
136 response = self.llm.invoke(prompt)
137 content = response.content
139 # Apply precision extraction
140 content = self._apply_precision_extraction(
141 content, question, question_type, formatted_sources
142 )
144 return {"content": content, "documents": documents}
146 def _identify_question_type(self, query: str) -> str:
147 """Identify the type of question for targeted extraction."""
148 query_lower = query.lower()
150 # Name questions
151 if any(
152 phrase in query_lower
153 for phrase in ["full name", "name of", "who was", "who is"]
154 ):
155 if "full name" in query_lower:
156 return "full_name"
157 return "name"
159 # Location questions
160 if any(
161 phrase in query_lower
162 for phrase in ["where", "location", "city", "country", "place"]
163 ):
164 return "location"
166 # Temporal questions
167 if any(phrase in query_lower for phrase in ["when", "year", "date"]):
168 return "temporal"
170 # Numerical questions
171 if any(
172 phrase in query_lower
173 for phrase in ["how many", "how much", "number", "count"]
174 ):
175 return "number"
177 # Score/result questions
178 if any(
179 phrase in query_lower
180 for phrase in ["score", "result", "final", "outcome"]
181 ):
182 return "score"
184 # Dimension questions
185 if any(
186 phrase in query_lower
187 for phrase in [
188 "height",
189 "length",
190 "width",
191 "size",
192 "tall",
193 "long",
194 "wide",
195 ]
196 ):
197 return "dimension"
199 # Single answer questions
200 if query_lower.startswith("which") and "one" in query_lower:
201 return "single_choice"
203 return "general"
205 def _apply_precision_extraction(
206 self, content: str, query: str, question_type: str, sources: str
207 ) -> str:
208 """Apply precision extraction based on question type."""
210 if question_type == "full_name":
211 return self._extract_full_name(content, query, sources)
212 if question_type == "name":
213 return self._extract_best_name(content, query, sources)
214 if question_type == "single_choice":
215 return self._extract_single_answer(content, query, sources)
216 if question_type == "dimension":
217 return self._extract_dimension(content, query, sources)
218 if question_type == "score":
219 return self._extract_score(content, query, sources)
220 if question_type == "temporal":
221 return self._extract_temporal(content, query, sources)
222 if question_type == "number":
223 return self._extract_number(content, query, sources)
225 return content
227 def _extract_full_name(self, content: str, query: str, sources: str) -> str:
228 """Extract complete full names."""
229 # First, use LLM to identify all name variations
230 extraction_prompt = f"""Find ALL variations of the person's name mentioned in the sources.
232Question: {query}
234Content: {content[:2000]}
235Sources: {sources[:2000]}
237List all name variations found:
2381. Shortest version:
2392. Longest/most complete version:
2403. Most frequently mentioned version:
242Which is the FULL name (including middle name if present)?"""
244 try:
245 extraction = self.llm.invoke(extraction_prompt).content
247 # Extract the identified full name
248 if "full name" in extraction.lower(): 248 ↛ 264line 248 didn't jump to line 264 because the condition on line 248 was always true
249 lines = extraction.split("\n")
250 for line in lines:
251 if "full name" in line.lower() or "longest" in line.lower(): 251 ↛ 250line 251 didn't jump to line 250 because the condition on line 251 was always true
252 # Extract name from this line
253 matches = self.answer_patterns["full_name"].findall(
254 line
255 )
256 if matches:
257 # Choose the longest match
258 full_name = max(
259 matches, key=lambda x: len(x.split())
260 )
261 return f"{full_name}. {content}"
263 # Fallback: find all names and pick the longest
264 all_names = self.answer_patterns["full_name"].findall(
265 content + " " + sources
266 )
267 if all_names: 267 ↛ 289line 267 didn't jump to line 289 because the condition on line 267 was always true
268 # Group similar names and pick the longest variant
269 name_groups: Dict[str, List[str]] = {}
270 for name in all_names:
271 last_word = name.split()[-1]
272 if last_word not in name_groups:
273 name_groups[last_word] = []
274 name_groups[last_word].append(name)
276 # Find the group with the most complete name
277 best_name = ""
278 for group in name_groups.values():
279 longest_in_group = max(group, key=lambda x: len(x.split()))
280 if len(longest_in_group.split()) > len(best_name.split()): 280 ↛ 278line 280 didn't jump to line 278 because the condition on line 280 was always true
281 best_name = longest_in_group
283 if best_name: 283 ↛ 289line 283 didn't jump to line 289 because the condition on line 283 was always true
284 return f"{best_name}. {content}"
286 except Exception:
287 logger.exception("Error in full name extraction")
289 return content
291 def _extract_single_answer(
292 self, content: str, query: str, sources: str
293 ) -> str:
294 """Extract a single answer when multiple options might be present."""
295 extraction_prompt = f"""The question asks for ONE specific answer. Extract ONLY that answer.
297Question: {query}
298Content: {content[:1500]}
300Rules:
3011. If multiple items are listed, identify which ONE actually answers the question
3022. Look for the PRIMARY or FIRST mentioned item
3033. Do not include alternatives or additional options
305The single answer is:"""
307 try:
308 answer = self.llm.invoke(extraction_prompt).content.strip()
310 # Clean up the answer
311 answer = answer.split(",")[
312 0
313 ].strip() # Take only first if comma-separated
314 answer = answer.split(" and ")[
315 0
316 ].strip() # Take only first if "and"-separated
317 answer = answer.split(" or ")[
318 0
319 ].strip() # Take only first if "or"-separated
321 return f"{answer}. {content}"
323 except Exception:
324 logger.exception("Error in single answer extraction")
326 return content
328 def _extract_dimension(self, content: str, query: str, sources: str) -> str:
329 """Extract specific dimensions with correct units and context awareness."""
330 # Enhanced dimension type detection
331 dimension_types = {
332 "height": ["height", "tall", "high", "elevation", "altitude"],
333 "length": ["length", "long", "distance", "reach", "span"],
334 "width": ["width", "wide", "breadth", "diameter"],
335 "depth": ["depth", "deep", "thickness"],
336 "weight": ["weight", "weigh", "heavy", "mass"],
337 "speed": ["speed", "fast", "velocity", "mph", "kmh"],
338 "area": ["area", "square"],
339 "volume": ["volume", "cubic"],
340 }
342 query_lower = query.lower()
343 dimension_type = None
344 dimension_keywords = []
346 # Find the most specific dimension type
347 for dim_type, keywords in dimension_types.items():
348 matching_keywords = [kw for kw in keywords if kw in query_lower]
349 if matching_keywords:
350 dimension_type = dim_type
351 dimension_keywords = matching_keywords
352 break
354 extraction_prompt = f"""Extract the EXACT measurement that answers this question.
356Question: {query}
357Content: {content[:1500]}
359Rules:
3601. Find the specific {dimension_type or "dimension"} measurement
3612. Return ONLY the number and unit (e.g., "20 meters", "5.5 feet")
3623. Distinguish between different types of measurements:
363 - Height/tall: vertical measurements
364 - Length/long: horizontal distance
365 - Width/wide: horizontal breadth
3664. Look for context clues near the measurement
3675. If multiple measurements, choose the one that matches the question type
369The exact {dimension_type or "dimension"} is:"""
371 try:
372 answer = self.llm.invoke(extraction_prompt).content.strip()
374 # Clean and validate the answer
375 import re
377 measurement_match = re.search(
378 r"(\d+(?:\.\d+)?)\s*([a-zA-Z/°]+)", answer
379 )
380 if measurement_match:
381 number, unit = measurement_match.groups()
382 clean_answer = f"{number} {unit}"
383 return f"{clean_answer}. {content}"
385 # Fallback: intelligent pattern matching
386 all_dimensions = self.answer_patterns["dimension"].findall(
387 content + " " + sources
388 )
389 if all_dimensions:
390 # Score dimensions based on context and dimension type
391 scored_dimensions = []
393 for dim in all_dimensions:
394 number, unit = dim
395 dim_str = f"{number} {unit}"
396 score = 0
398 # Find the dimension in content
399 pos = content.find(dim_str)
400 if pos >= 0: 400 ↛ 452line 400 didn't jump to line 452 because the condition on line 400 was always true
401 # Get context around this measurement
402 context = content[max(0, pos - 100) : pos + 100].lower()
404 # Score based on dimension keywords in context
405 for keyword in dimension_keywords:
406 if keyword in context:
407 score += 10
409 # Score based on unit appropriateness
410 unit_lower = unit.lower()
411 if (
412 (
413 dimension_type == "height"
414 and any(
415 u in unit_lower
416 for u in ["m", "meter", "ft", "feet", "cm"]
417 )
418 )
419 or (
420 dimension_type == "length"
421 and any(
422 u in unit_lower
423 for u in ["m", "meter", "km", "mile", "ft"]
424 )
425 )
426 or (
427 dimension_type == "weight"
428 and any(
429 u in unit_lower
430 for u in [
431 "kg",
432 "lb",
433 "pound",
434 "gram",
435 "ton",
436 ]
437 )
438 )
439 or (
440 dimension_type == "speed"
441 and any(
442 u in unit_lower
443 for u in ["mph", "kmh", "km/h", "m/s"]
444 )
445 )
446 ):
447 score += 5
449 # Prefer measurements closer to the beginning (more likely to be primary)
450 score += max(0, 5 - (pos // 100))
452 scored_dimensions.append((score, dim_str))
454 # Return the highest scoring dimension
455 if scored_dimensions: 455 ↛ 461line 455 didn't jump to line 461 because the condition on line 455 was always true
456 scored_dimensions.sort(key=lambda x: x[0], reverse=True)
457 best_dimension = scored_dimensions[0][1]
458 return f"{best_dimension}. {content}"
460 # Final fallback: first dimension
461 return (
462 f"{all_dimensions[0][0]} {all_dimensions[0][1]}. {content}"
463 )
465 except Exception:
466 logger.exception("Error in dimension extraction")
468 return content
470 def _extract_score(self, content: str, query: str, sources: str) -> str:
471 """Extract game scores or results."""
472 # Find all score patterns
473 scores = self.answer_patterns["score"].findall(content + " " + sources)
475 if scores:
476 # Use LLM to identify the correct score
477 extraction_prompt = f"""Which score/result answers this question?
479Question: {query}
480Found scores: {scores}
481Context: {content[:1000]}
483The answer is:"""
485 try:
486 answer = self.llm.invoke(extraction_prompt).content.strip()
487 return f"{answer}. {content}"
488 except Exception:
489 # Return first score found if LLM extraction fails
490 return f"{scores[0][0]}-{scores[0][1]}. {content}"
492 return content
494 def _extract_temporal(self, content: str, query: str, sources: str) -> str:
495 """Extract dates or years."""
496 # Find all year patterns
497 years = self.answer_patterns["year"].findall(content + " " + sources)
499 if years:
500 # Use LLM to pick the right one
501 extraction_prompt = f"""Which date/year specifically answers this question?
503Question: {query}
504Found years: {set(years)}
505Context: {content[:1000]}
507The answer is:"""
509 try:
510 answer = self.llm.invoke(extraction_prompt).content.strip()
511 # Clean to just the year/date
512 year_match = self.answer_patterns["year"].search(answer)
513 if year_match:
514 return f"{year_match.group()}. {content}"
515 return f"{answer}. {content}"
516 except Exception:
517 # Fallback to first found year if LLM extraction fails
518 return f"{years[0]}. {content}"
520 return content
522 def _extract_number(self, content: str, query: str, sources: str) -> str:
523 """Extract specific numbers."""
524 # Find all numbers
525 numbers = self.answer_patterns["number"].findall(
526 content + " " + sources
527 )
529 if numbers:
530 extraction_prompt = f"""Which number specifically answers this question?
532Question: {query}
533Found numbers: {numbers[:10]}
534Context: {content[:1000]}
536The answer is:"""
538 try:
539 answer = self.llm.invoke(extraction_prompt).content.strip()
540 return f"{answer}. {content}"
541 except Exception:
542 # Fallback to first found number if LLM extraction fails
543 return f"{numbers[0]}. {content}"
545 return content
547 def _extract_best_name(self, content: str, query: str, sources: str) -> str:
548 """Extract the best matching name (not necessarily full)."""
549 # Find all potential names
550 names = self.answer_patterns["full_name"].findall(
551 content + " " + sources
552 )
554 if names:
555 # Count frequency
556 name_counts: Dict[str, int] = {}
557 for name in names:
558 name_counts[name] = name_counts.get(name, 0) + 1
560 # Get most frequent
561 best_name = max(name_counts.items(), key=lambda x: x[1])[0]
562 return f"{best_name}. {content}"
564 return content
566 def _extract_key_facts(
567 self, previous_knowledge: str, question_type: str
568 ) -> str:
569 """Extract key facts from previous knowledge."""
570 extraction_prompt = f"""Extract key facts related to a {question_type} question from this knowledge:
572{previous_knowledge[:1500]}
574List the most important facts (names, numbers, dates) found:"""
576 try:
577 facts = str(self.llm.invoke(extraction_prompt).content)
578 return facts[:500]
579 except Exception:
580 # Fallback to truncated previous knowledge if LLM extraction fails
581 return previous_knowledge[:500]