Coverage for src/local_deep_research/citation_handlers/precision_extraction_handler.py: 86%
204 statements
1"""
2Precision Extraction Citation Handler
4This handler focuses on extracting precise, complete answers for SimpleQA-style questions.
5It includes specialized extractors for:
6- Full names (including middle names)
7- Single answers when only one is requested
8- Dimension-aware measurements
9- Specific entities without extra information
10"""

import re
from datetime import datetime, timezone
from typing import Any, Dict, List, Union

from loguru import logger

from .base_citation_handler import BaseCitationHandler


class PrecisionExtractionHandler(BaseCitationHandler):
    """Citation handler optimized for precise answer extraction."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Answer type patterns
        self.answer_patterns = {
            "full_name": re.compile(
                r"\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,4})\b"
            ),
            "year": re.compile(r"\b(19\d{2}|20\d{2})\b"),
            "number": re.compile(r"\b(\d+(?:\.\d+)?)\b"),
            "dimension": re.compile(
                r"(\d+(?:\.\d+)?)\s*(meters?|feet|inches|cm|km|miles?|m|ft|kg|pounds?|lbs?)",
                re.I,
            ),
            "score": re.compile(r"(\d+)\s*[-–]\s*(\d+)"),
            "percentage": re.compile(r"(\d+(?:\.\d+)?)\s*%"),
            "location": re.compile(r"\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\b"),
        }
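
        # Illustrative matches for the patterns above (not exhaustive):
        #   full_name:  "John Quincy Adams" -> "John Quincy Adams" (2-5 capitalized words)
        #   dimension:  "5.5 feet"          -> ("5.5", "feet")
        #   score:      "3-2" or "3–2"      -> ("3", "2")
        #   percentage: "42.5%"             -> "42.5"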

    def analyze_initial(
        self, query: str, search_results: Union[str, List[Dict]]
    ) -> Dict[str, Any]:
        """Initial analysis with precision extraction."""
        documents = self._create_documents(search_results)
        formatted_sources = self._format_sources(documents)

        # Determine question type for targeted extraction
        question_type = self._identify_question_type(query)

        current_timestamp = datetime.now(timezone.utc).strftime(
            "%Y-%m-%d %H:%M"
        )

        output_prefix = self._get_output_instruction_prefix()

        prompt = f"""{output_prefix}Analyze the following information and provide a PRECISE answer to the question. Include citations using numbers in square brackets [1], [2], etc.

Question: {query}
Question Type: {question_type}

Sources:
{formatted_sources}

Current time is {current_timestamp} UTC for verifying temporal references in sources.

PRECISION INSTRUCTIONS:
1. Extract the EXACT answer as it appears in the sources
2. For names: Include FULL names with all parts (first, middle, last)
3. For numbers: Include exact values with units if present
4. For single-answer questions: Provide ONLY ONE answer, not multiple options
5. For dimensions: Specify the exact measurement type (height, length, width)
6. Citations should support the specific answer given

Format: Start with the direct, precise answer, then explain with citations."""

        response = self.llm.invoke(prompt)
        if not isinstance(response, str):
            response = response.content

        # Apply precision extraction if needed
        response = self._apply_precision_extraction(
            response, query, question_type, formatted_sources
        )

        return {"content": response, "documents": documents}

    def analyze_followup(
        self,
        question: str,
        search_results: Union[str, List[Dict]],
        previous_knowledge: str,
        nr_of_links: int,
    ) -> Dict[str, Any]:
        """Follow-up analysis with precision extraction."""
        documents = self._create_documents(
            search_results, nr_of_links=nr_of_links
        )
        formatted_sources = self._format_sources(documents)

        question_type = self._identify_question_type(question)

        # Extract key facts from previous knowledge
        key_facts = self._extract_key_facts(previous_knowledge, question_type)

        current_timestamp = datetime.now(timezone.utc).strftime(
            "%Y-%m-%d %H:%M"
        )

        output_prefix = self._get_output_instruction_prefix()

        prompt = f"""{output_prefix}Using the previous knowledge and new sources, provide a PRECISE answer to the question.

Previous Key Facts:
{key_facts}

Question: {question}
Question Type: {question_type}

New Sources:
{formatted_sources}

Current time is {current_timestamp} UTC for verifying temporal references in sources.

PRECISION REQUIREMENTS:
1. Build on previous knowledge to provide the MOST COMPLETE answer
2. If a full name was partially found before, complete it now
3. If multiple candidates exist, select the one with the MOST evidence
4. For measurements, ensure units and dimension types match the question
5. Reconcile any conflicts by choosing the most frequently cited answer

Provide the precise answer with citations."""

        response = self.llm.invoke(prompt)
        # invoke() may return a plain string or a message object, as in
        # analyze_initial above
        content = response if isinstance(response, str) else response.content

        # Apply precision extraction
        content = self._apply_precision_extraction(
            content, question, question_type, formatted_sources
        )

        return {"content": content, "documents": documents}

    def _identify_question_type(self, query: str) -> str:
        """Identify the type of question for targeted extraction."""
        query_lower = query.lower()

        # Name questions
        if any(
            phrase in query_lower
            for phrase in ["full name", "name of", "who was", "who is"]
        ):
            if "full name" in query_lower:
                return "full_name"
            return "name"

        # Location questions
        if any(
            phrase in query_lower
            for phrase in ["where", "location", "city", "country", "place"]
        ):
            return "location"

        # Temporal questions
        if any(phrase in query_lower for phrase in ["when", "year", "date"]):
            return "temporal"

        # Numerical questions
        if any(
            phrase in query_lower
            for phrase in ["how many", "how much", "number", "count"]
        ):
            return "number"

        # Score/result questions
        if any(
            phrase in query_lower
            for phrase in ["score", "result", "final", "outcome"]
        ):
            return "score"

        # Dimension questions
        if any(
            phrase in query_lower
            for phrase in [
                "height",
                "length",
                "width",
                "size",
                "tall",
                "long",
                "wide",
            ]
        ):
            return "dimension"

        # Single answer questions
        if query_lower.startswith("which") and "one" in query_lower:
            return "single_choice"

        return "general"

    def _apply_precision_extraction(
        self, content: str, query: str, question_type: str, sources: str
    ) -> str:
        """Apply precision extraction based on question type."""

        # Check if content already has a good answer in the first line
        # first_line = content.split(".")[0].strip()  # Not currently used

        if question_type == "full_name":
            return self._extract_full_name(content, query, sources)
        elif question_type == "name":
            return self._extract_best_name(content, query, sources)
        elif question_type == "single_choice":
            return self._extract_single_answer(content, query, sources)
        elif question_type == "dimension":
            return self._extract_dimension(content, query, sources)
        elif question_type == "score":
            return self._extract_score(content, query, sources)
        elif question_type == "temporal":
            return self._extract_temporal(content, query, sources)
        elif question_type == "number":
            return self._extract_number(content, query, sources)

        return content

    def _extract_full_name(self, content: str, query: str, sources: str) -> str:
        """Extract complete full names."""
        # First, use LLM to identify all name variations
        extraction_prompt = f"""Find ALL variations of the person's name mentioned in the sources.

Question: {query}

Content: {content[:2000]}
Sources: {sources[:2000]}

List all name variations found:
1. Shortest version:
2. Longest/most complete version:
3. Most frequently mentioned version:

Which is the FULL name (including middle name if present)?"""

        try:
            extraction = self.llm.invoke(extraction_prompt).content

            # Extract the identified full name
            if "full name" in extraction.lower():
                lines = extraction.split("\n")
                for line in lines:
                    if "full name" in line.lower() or "longest" in line.lower():
                        # Extract name from this line
                        matches = self.answer_patterns["full_name"].findall(
                            line
                        )
                        if matches:
                            # Choose the longest match
                            full_name = max(
                                matches, key=lambda x: len(x.split())
                            )
                            return f"{full_name}. {content}"

            # Fallback: find all names and pick the longest
            all_names = self.answer_patterns["full_name"].findall(
                content + " " + sources
            )
            if all_names:
                # Group similar names and pick the longest variant
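                # e.g. "John Adams" and "John Quincy Adams" share the last
                # word "Adams", so they form one group and the longer
                # variant wins below.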
                name_groups = {}
                for name in all_names:
                    last_word = name.split()[-1]
                    if last_word not in name_groups:
                        name_groups[last_word] = []
                    name_groups[last_word].append(name)

                # Find the group with the most complete name
                best_name = ""
                for group in name_groups.values():
                    longest_in_group = max(group, key=lambda x: len(x.split()))
                    if len(longest_in_group.split()) > len(best_name.split()):
                        best_name = longest_in_group

                if best_name:
                    return f"{best_name}. {content}"

        except Exception:
            logger.exception("Error in full name extraction")

        return content

    def _extract_single_answer(
        self, content: str, query: str, sources: str
    ) -> str:
        """Extract a single answer when multiple options might be present."""
        extraction_prompt = f"""The question asks for ONE specific answer. Extract ONLY that answer.

Question: {query}
Content: {content[:1500]}

Rules:
1. If multiple items are listed, identify which ONE actually answers the question
2. Look for the PRIMARY or FIRST mentioned item
3. Do not include alternatives or additional options

The single answer is:"""

        try:
            answer = self.llm.invoke(extraction_prompt).content.strip()

            # Clean up the answer: keep only the first item if the LLM
            # returned a comma-, "and"-, or "or"-separated list
            answer = answer.split(",")[0].strip()
            answer = answer.split(" and ")[0].strip()
            answer = answer.split(" or ")[0].strip()
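            # e.g. "Paris, London or Rome" -> "Paris"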

            return f"{answer}. {content}"

        except Exception:
            logger.exception("Error in single answer extraction")

        return content

    def _extract_dimension(self, content: str, query: str, sources: str) -> str:
        """Extract specific dimensions with correct units and context awareness."""
        # Enhanced dimension type detection
        dimension_types = {
            "height": ["height", "tall", "high", "elevation", "altitude"],
            "length": ["length", "long", "distance", "reach", "span"],
            "width": ["width", "wide", "breadth", "diameter"],
            "depth": ["depth", "deep", "thickness"],
            "weight": ["weight", "weigh", "heavy", "mass"],
            "speed": ["speed", "fast", "velocity", "mph", "kmh"],
            "area": ["area", "square"],
            "volume": ["volume", "cubic"],
        }

        query_lower = query.lower()
        dimension_type = None
        dimension_keywords = []

        # Find the most specific dimension type
        for dim_type, keywords in dimension_types.items():
            matching_keywords = [kw for kw in keywords if kw in query_lower]
            if matching_keywords:
                dimension_type = dim_type
                dimension_keywords = matching_keywords
                break

        extraction_prompt = f"""Extract the EXACT measurement that answers this question.

Question: {query}
Content: {content[:1500]}

Rules:
1. Find the specific {dimension_type or "dimension"} measurement
2. Return ONLY the number and unit (e.g., "20 meters", "5.5 feet")
3. Distinguish between different types of measurements:
   - Height/tall: vertical measurements
   - Length/long: horizontal distance
   - Width/wide: horizontal breadth
4. Look for context clues near the measurement
5. If multiple measurements, choose the one that matches the question type

The exact {dimension_type or "dimension"} is:"""

        try:
            answer = self.llm.invoke(extraction_prompt).content.strip()

            # Clean and validate the answer
            measurement_match = re.search(
                r"(\d+(?:\.\d+)?)\s*([a-zA-Z/°]+)", answer
            )
            if measurement_match:
                number, unit = measurement_match.groups()
                clean_answer = f"{number} {unit}"
                return f"{clean_answer}. {content}"

            # Fallback: intelligent pattern matching
            all_dimensions = self.answer_patterns["dimension"].findall(
                content + " " + sources
            )
            if all_dimensions:
                # Score dimensions based on context and dimension type
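                # e.g. for "How tall is X?", a "20 meters" hit whose nearby
                # context mentions "tall" earns the keyword bonus (+10) plus
                # the unit bonus (+5), outscoring an unrelated "5 km" later
                # in the text.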
                scored_dimensions = []

                for dim in all_dimensions:
                    number, unit = dim
                    dim_str = f"{number} {unit}"
                    score = 0

                    # Find the dimension in content
                    pos = content.find(dim_str)
                    if pos >= 0:
                        # Get context around this measurement
                        context = content[max(0, pos - 100) : pos + 100].lower()

                        # Score based on dimension keywords in context
                        for keyword in dimension_keywords:
                            if keyword in context:
                                score += 10

                        # Score based on unit appropriateness
                        unit_lower = unit.lower()
                        if (
                            (
                                dimension_type == "height"
                                and any(
                                    u in unit_lower
                                    for u in ["m", "meter", "ft", "feet", "cm"]
                                )
                            )
                            or (
                                dimension_type == "length"
                                and any(
                                    u in unit_lower
                                    for u in ["m", "meter", "km", "mile", "ft"]
                                )
                            )
                            or (
                                dimension_type == "weight"
                                and any(
                                    u in unit_lower
                                    for u in ["kg", "lb", "pound", "gram", "ton"]
                                )
                            )
                            or (
                                dimension_type == "speed"
                                and any(
                                    u in unit_lower
                                    for u in ["mph", "kmh", "km/h", "m/s"]
                                )
                            )
                        ):
                            score += 5

                        # Prefer measurements closer to the beginning
                        # (more likely to be primary)
                        score += max(0, 5 - (pos / 100))

                    scored_dimensions.append((score, dim_str))

                # Return the highest scoring dimension
                if scored_dimensions:
                    scored_dimensions.sort(key=lambda x: x[0], reverse=True)
                    best_dimension = scored_dimensions[0][1]
                    return f"{best_dimension}. {content}"

                # Final fallback: first dimension
                return (
                    f"{all_dimensions[0][0]} {all_dimensions[0][1]}. {content}"
                )

        except Exception:
            logger.exception("Error in dimension extraction")

        return content

    def _extract_score(self, content: str, query: str, sources: str) -> str:
        """Extract game scores or results."""
        # Find all score patterns
        scores = self.answer_patterns["score"].findall(content + " " + sources)

        if scores:
            # Use LLM to identify the correct score
            extraction_prompt = f"""Which score/result answers this question?

Question: {query}
Found scores: {scores}
Context: {content[:1000]}

The answer is:"""

            try:
                answer = self.llm.invoke(extraction_prompt).content.strip()
                return f"{answer}. {content}"
            except Exception:
                # Return first score found if LLM extraction fails
                return f"{scores[0][0]}-{scores[0][1]}. {content}"

        return content

    def _extract_temporal(self, content: str, query: str, sources: str) -> str:
        """Extract dates or years."""
        # Find all year patterns
        years = self.answer_patterns["year"].findall(content + " " + sources)

        if years:
            # Use LLM to pick the right one
            extraction_prompt = f"""Which date/year specifically answers this question?

Question: {query}
Found years: {set(years)}
Context: {content[:1000]}

The answer is:"""

            try:
                answer = self.llm.invoke(extraction_prompt).content.strip()
                # Clean to just the year/date
                year_match = self.answer_patterns["year"].search(answer)
                if year_match:
                    return f"{year_match.group()}. {content}"
                return f"{answer}. {content}"
            except Exception:
                # Fallback to first found year if LLM extraction fails
                return f"{years[0]}. {content}"

        return content

    def _extract_number(self, content: str, query: str, sources: str) -> str:
        """Extract specific numbers."""
        # Find all numbers
        numbers = self.answer_patterns["number"].findall(
            content + " " + sources
        )

        if numbers:
            extraction_prompt = f"""Which number specifically answers this question?

Question: {query}
Found numbers: {numbers[:10]}
Context: {content[:1000]}

The answer is:"""

            try:
                answer = self.llm.invoke(extraction_prompt).content.strip()
                return f"{answer}. {content}"
            except Exception:
                # Fallback to first found number if LLM extraction fails
                return f"{numbers[0]}. {content}"

        return content

    def _extract_best_name(self, content: str, query: str, sources: str) -> str:
        """Extract the best matching name (not necessarily full)."""
        # Find all potential names
        names = self.answer_patterns["full_name"].findall(
            content + " " + sources
        )

        if names:
            # Count frequency
            name_counts = {}
            for name in names:
                name_counts[name] = name_counts.get(name, 0) + 1

            # Get most frequent
            best_name = max(name_counts.items(), key=lambda x: x[1])[0]
            return f"{best_name}. {content}"

        return content

    def _extract_key_facts(
        self, previous_knowledge: str, question_type: str
    ) -> str:
        """Extract key facts from previous knowledge."""
        extraction_prompt = f"""Extract key facts related to a {question_type} question from this knowledge:

{previous_knowledge[:1500]}

List the most important facts (names, numbers, dates) found:"""

        try:
            facts = self.llm.invoke(extraction_prompt).content
            return facts[:500]
        except Exception:
            # Fallback to truncated previous knowledge if LLM extraction fails
            return previous_knowledge[:500]