Coverage for src/local_deep_research/citation_handlers/precision_extraction_handler.py: 86%

204 statements (coverage.py v7.12.0, created at 2026-01-11 00:51 +0000)

1""" 

2Precision Extraction Citation Handler 

3 

4This handler focuses on extracting precise, complete answers for SimpleQA-style questions. 

5It includes specialized extractors for: 

6- Full names (including middle names) 

7- Single answers when only one is requested 

8- Dimension-aware measurements 

9- Specific entities without extra information 

10""" 

import re
from datetime import datetime, timezone
from typing import Any, Dict, List, Union

from loguru import logger

from .base_citation_handler import BaseCitationHandler


class PrecisionExtractionHandler(BaseCitationHandler):
    """Citation handler optimized for precise answer extraction."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Answer type patterns
        self.answer_patterns = {
            "full_name": re.compile(
                r"\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,4})\b"
            ),
            "year": re.compile(r"\b(19\d{2}|20\d{2})\b"),
            "number": re.compile(r"\b(\d+(?:\.\d+)?)\b"),
            "dimension": re.compile(
                r"(\d+(?:\.\d+)?)\s*(meters?|feet|inches|cm|km|miles?|m|ft|kg|pounds?|lbs?)",
                re.I,
            ),
            "score": re.compile(r"(\d+)\s*[-–]\s*(\d+)"),
            "percentage": re.compile(r"(\d+(?:\.\d+)?)\s*%"),
            "location": re.compile(r"\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\b"),
        }
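        # What findall() yields for these patterns (illustrative inputs):
        #   self.answer_patterns["dimension"].findall("It is 300 meters tall")
        #       -> [("300", "meters")]
        #   self.answer_patterns["score"].findall("won 3-2 in the final")
        #       -> [("3", "2")]
        #   self.answer_patterns["year"].findall("completed in 1931")
        #       -> ["1931"]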

    def analyze_initial(
        self, query: str, search_results: Union[str, List[Dict]]
    ) -> Dict[str, Any]:
        """Initial analysis with precision extraction."""
        documents = self._create_documents(search_results)
        formatted_sources = self._format_sources(documents)

        # Determine question type for targeted extraction
        question_type = self._identify_question_type(query)

        current_timestamp = datetime.now(timezone.utc).strftime(
            "%Y-%m-%d %H:%M"
        )

        output_prefix = self._get_output_instruction_prefix()

        prompt = f"""{output_prefix}Analyze the following information and provide a PRECISE answer to the question. Include citations using numbers in square brackets [1], [2], etc.

Question: {query}
Question Type: {question_type}

Sources:
{formatted_sources}

Current time is {current_timestamp} UTC for verifying temporal references in sources.

PRECISION INSTRUCTIONS:
1. Extract the EXACT answer as it appears in the sources
2. For names: Include FULL names with all parts (first, middle, last)
3. For numbers: Include exact values with units if present
4. For single-answer questions: Provide ONLY ONE answer, not multiple options
5. For dimensions: Specify the exact measurement type (height, length, width)
6. Citations should support the specific answer given

Format: Start with the direct, precise answer, then explain with citations."""

        response = self.llm.invoke(prompt)
        if not isinstance(response, str):
            response = response.content

        # Apply precision extraction if needed
        response = self._apply_precision_extraction(
            response, query, question_type, formatted_sources
        )

        return {"content": response, "documents": documents}

    def analyze_followup(
        self,
        question: str,
        search_results: Union[str, List[Dict]],
        previous_knowledge: str,
        nr_of_links: int,
    ) -> Dict[str, Any]:
        """Follow-up analysis with precision extraction."""
        documents = self._create_documents(
            search_results, nr_of_links=nr_of_links
        )
        formatted_sources = self._format_sources(documents)

        question_type = self._identify_question_type(question)

        # Extract key facts from previous knowledge
        key_facts = self._extract_key_facts(previous_knowledge, question_type)

        current_timestamp = datetime.now(timezone.utc).strftime(
            "%Y-%m-%d %H:%M"
        )

        output_prefix = self._get_output_instruction_prefix()

        prompt = f"""{output_prefix}Using the previous knowledge and new sources, provide a PRECISE answer to the question.

Previous Key Facts:
{key_facts}

Question: {question}
Question Type: {question_type}

New Sources:
{formatted_sources}

Current time is {current_timestamp} UTC for verifying temporal references in sources.

PRECISION REQUIREMENTS:
1. Build on previous knowledge to provide the MOST COMPLETE answer
2. If a full name was partially found before, complete it now
3. If multiple candidates exist, select the one with the MOST evidence
4. For measurements, ensure units and dimension types match the question
5. Reconcile any conflicts by choosing the most frequently cited answer

Provide the precise answer with citations."""

        response = self.llm.invoke(prompt)
        # The LLM may return a plain string or a message object
        content = response if isinstance(response, str) else response.content

        # Apply precision extraction
        content = self._apply_precision_extraction(
            content, question, question_type, formatted_sources
        )

        return {"content": content, "documents": documents}

    def _identify_question_type(self, query: str) -> str:
        """Identify the type of question for targeted extraction."""
        query_lower = query.lower()

        # Name questions
        if any(
            phrase in query_lower
            for phrase in ["full name", "name of", "who was", "who is"]
        ):
            if "full name" in query_lower:
                return "full_name"
            return "name"

        # Location questions
        if any(
            phrase in query_lower
            for phrase in ["where", "location", "city", "country", "place"]
        ):
            return "location"

        # Temporal questions
        if any(phrase in query_lower for phrase in ["when", "year", "date"]):
            return "temporal"

        # Numerical questions
        if any(
            phrase in query_lower
            for phrase in ["how many", "how much", "number", "count"]
        ):
            return "number"

        # Score/result questions
        if any(
            phrase in query_lower
            for phrase in ["score", "result", "final", "outcome"]
        ):
            return "score"

        # Dimension questions
        if any(
            phrase in query_lower
            for phrase in [
                "height",
                "length",
                "width",
                "size",
                "tall",
                "long",
                "wide",
            ]
        ):
            return "dimension"

        # Single answer questions
        if query_lower.startswith("which") and "one" in query_lower:
            return "single_choice"

        return "general"
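    # How _identify_question_type classifies a few illustrative queries:
    #   "What was Ada Lovelace's full name?" -> "full_name"
    #   "Where was the treaty signed?"       -> "location"
    #   "How many moons does Mars have?"     -> "number"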

    def _apply_precision_extraction(
        self, content: str, query: str, question_type: str, sources: str
    ) -> str:
        """Apply precision extraction based on question type."""

        # Check if content already has a good answer in the first line
        # first_line = content.split(".")[0].strip()  # Not currently used

        if question_type == "full_name":
            return self._extract_full_name(content, query, sources)
        elif question_type == "name":  # coverage: condition never true
            return self._extract_best_name(content, query, sources)
        elif question_type == "single_choice":  # coverage: condition never true
            return self._extract_single_answer(content, query, sources)
        elif question_type == "dimension":  # coverage: condition never true
            return self._extract_dimension(content, query, sources)
        elif question_type == "score":  # coverage: condition never true
            return self._extract_score(content, query, sources)
        elif question_type == "temporal":
            return self._extract_temporal(content, query, sources)
        elif question_type == "number":  # coverage: condition never true
            return self._extract_number(content, query, sources)

        return content

    def _extract_full_name(self, content: str, query: str, sources: str) -> str:
        """Extract complete full names."""
        # First, use LLM to identify all name variations
        extraction_prompt = f"""Find ALL variations of the person's name mentioned in the sources.

Question: {query}

Content: {content[:2000]}
Sources: {sources[:2000]}

List all name variations found:
1. Shortest version:
2. Longest/most complete version:
3. Most frequently mentioned version:

Which is the FULL name (including middle name if present)?"""

        try:
            extraction = self.llm.invoke(extraction_prompt).content

            # Extract the identified full name
            if "full name" in extraction.lower():  # coverage: condition always true
                lines = extraction.split("\n")
                for line in lines:
                    # coverage: condition always true
                    if "full name" in line.lower() or "longest" in line.lower():
                        # Extract name from this line
                        matches = self.answer_patterns["full_name"].findall(
                            line
                        )
                        if matches:
                            # Choose the longest match
                            full_name = max(
                                matches, key=lambda x: len(x.split())
                            )
                            return f"{full_name}. {content}"

            # Fallback: find all names and pick the longest
            all_names = self.answer_patterns["full_name"].findall(
                content + " " + sources
            )
            if all_names:  # coverage: condition always true
                # Group similar names and pick the longest variant
                name_groups = {}
                for name in all_names:
                    last_word = name.split()[-1]
                    if last_word not in name_groups:
                        name_groups[last_word] = []
                    name_groups[last_word].append(name)
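                # e.g. (illustrative): "Neil Armstrong" and
                # "Neil Alden Armstrong" both key on "Armstrong", so the
                # longest variant is preferred below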

                # Find the group with the most complete name
                best_name = ""
                for group in name_groups.values():
                    longest_in_group = max(group, key=lambda x: len(x.split()))
                    # coverage: condition always true
                    if len(longest_in_group.split()) > len(best_name.split()):
                        best_name = longest_in_group

                if best_name:  # coverage: condition always true
                    return f"{best_name}. {content}"

        except Exception:
            logger.exception("Error in full name extraction")

        return content

    def _extract_single_answer(
        self, content: str, query: str, sources: str
    ) -> str:
        """Extract a single answer when multiple options might be present."""
        extraction_prompt = f"""The question asks for ONE specific answer. Extract ONLY that answer.

Question: {query}
Content: {content[:1500]}

Rules:
1. If multiple items are listed, identify which ONE actually answers the question
2. Look for the PRIMARY or FIRST mentioned item
3. Do not include alternatives or additional options

The single answer is:"""

        try:
            answer = self.llm.invoke(extraction_prompt).content.strip()

            # Clean up the answer: keep only the first item if the LLM
            # returned a comma-, "and"-, or "or"-separated list
            answer = answer.split(",")[0].strip()
            answer = answer.split(" and ")[0].strip()
            answer = answer.split(" or ")[0].strip()
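            # e.g. (illustrative): "Paris, London or Rome" -> "Paris"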

            return f"{answer}. {content}"

        except Exception:
            logger.exception("Error in single answer extraction")

        return content

    def _extract_dimension(self, content: str, query: str, sources: str) -> str:
        """Extract specific dimensions with correct units and context awareness."""
        # Enhanced dimension type detection
        dimension_types = {
            "height": ["height", "tall", "high", "elevation", "altitude"],
            "length": ["length", "long", "distance", "reach", "span"],
            "width": ["width", "wide", "breadth", "diameter"],
            "depth": ["depth", "deep", "thickness"],
            "weight": ["weight", "weigh", "heavy", "mass"],
            "speed": ["speed", "fast", "velocity", "mph", "kmh"],
            "area": ["area", "square"],
            "volume": ["volume", "cubic"],
        }

        query_lower = query.lower()
        dimension_type = None
        dimension_keywords = []

        # Find the most specific dimension type
        # coverage: loop always exited via break
        for dim_type, keywords in dimension_types.items():
            matching_keywords = [kw for kw in keywords if kw in query_lower]
            if matching_keywords:  # coverage: condition always true
                dimension_type = dim_type
                dimension_keywords = matching_keywords
                break

        extraction_prompt = f"""Extract the EXACT measurement that answers this question.

Question: {query}
Content: {content[:1500]}

Rules:
1. Find the specific {dimension_type or "dimension"} measurement
2. Return ONLY the number and unit (e.g., "20 meters", "5.5 feet")
3. Distinguish between different types of measurements:
   - Height/tall: vertical measurements
   - Length/long: horizontal distance
   - Width/wide: horizontal breadth
4. Look for context clues near the measurement
5. If multiple measurements, choose the one that matches the question type

The exact {dimension_type or "dimension"} is:"""

        try:
            answer = self.llm.invoke(extraction_prompt).content.strip()

            # Clean and validate the answer
            measurement_match = re.search(
                r"(\d+(?:\.\d+)?)\s*([a-zA-Z/°]+)", answer
            )
            if measurement_match:
                number, unit = measurement_match.groups()
                clean_answer = f"{number} {unit}"
                return f"{clean_answer}. {content}"

            # Fallback: intelligent pattern matching
            all_dimensions = self.answer_patterns["dimension"].findall(
                content + " " + sources
            )
            if all_dimensions:  # coverage: condition always true
                # Score dimensions based on context and dimension type
                scored_dimensions = []

                for dim in all_dimensions:
                    number, unit = dim
                    dim_str = f"{number} {unit}"
                    score = 0

                    # Find the dimension in content
                    pos = content.find(dim_str)
                    if pos >= 0:  # coverage: condition always true
                        # Get context around this measurement
                        context = content[max(0, pos - 100) : pos + 100].lower()

                        # Score based on dimension keywords in context
                        for keyword in dimension_keywords:
                            if keyword in context:  # coverage: condition always true
                                score += 10

                        # Score based on unit appropriateness
                        unit_lower = unit.lower()
                        # coverage: condition always true
                        if (
                            (
                                dimension_type == "height"
                                and any(
                                    u in unit_lower
                                    for u in ["m", "meter", "ft", "feet", "cm"]
                                )
                            )
                            or (
                                dimension_type == "length"
                                and any(
                                    u in unit_lower
                                    for u in ["m", "meter", "km", "mile", "ft"]
                                )
                            )
                            or (
                                dimension_type == "weight"
                                and any(
                                    u in unit_lower
                                    for u in ["kg", "lb", "pound", "gram", "ton"]
                                )
                            )
                            or (
                                dimension_type == "speed"
                                and any(
                                    u in unit_lower
                                    for u in ["mph", "kmh", "km/h", "m/s"]
                                )
                            )
                        ):
                            score += 5

                        # Prefer measurements closer to the beginning
                        # (more likely to be primary)
                        score += max(0, 5 - (pos / 100))
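                        # Illustrative scoring (hypothetical values): for
                        # "How tall is the tower?" with "300 meters" found at
                        # pos 50 and "tall" in the context window, the score is
                        # 10 (keyword) + 5 (unit) + 4.5 (proximity) = 19.5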

                    scored_dimensions.append((score, dim_str))

                # Return the highest scoring dimension
                if scored_dimensions:  # coverage: condition always true
                    scored_dimensions.sort(key=lambda x: x[0], reverse=True)
                    best_dimension = scored_dimensions[0][1]
                    return f"{best_dimension}. {content}"

                # Final fallback: first dimension
                return (
                    f"{all_dimensions[0][0]} {all_dimensions[0][1]}. {content}"
                )

        except Exception:
            logger.exception("Error in dimension extraction")

        return content

    def _extract_score(self, content: str, query: str, sources: str) -> str:
        """Extract game scores or results."""
        # Find all score patterns
        scores = self.answer_patterns["score"].findall(content + " " + sources)

        if scores:
            # Use LLM to identify the correct score
            extraction_prompt = f"""Which score/result answers this question?

Question: {query}
Found scores: {scores}
Context: {content[:1000]}

The answer is:"""

            try:
                answer = self.llm.invoke(extraction_prompt).content.strip()
                return f"{answer}. {content}"
            except Exception:
                # Return first score found if LLM extraction fails
                return f"{scores[0][0]}-{scores[0][1]}. {content}"

        return content

    def _extract_temporal(self, content: str, query: str, sources: str) -> str:
        """Extract dates or years."""
        # Find all year patterns
        years = self.answer_patterns["year"].findall(content + " " + sources)

        if years:
            # Use LLM to pick the right one
            extraction_prompt = f"""Which date/year specifically answers this question?

Question: {query}
Found years: {set(years)}
Context: {content[:1000]}

The answer is:"""

            try:
                answer = self.llm.invoke(extraction_prompt).content.strip()
                # Clean to just the year/date
                year_match = self.answer_patterns["year"].search(answer)
                if year_match:  # coverage: condition always true
                    return f"{year_match.group()}. {content}"
                return f"{answer}. {content}"
            except Exception:
                # Fallback to first found year if LLM extraction fails
                return f"{years[0]}. {content}"

        return content

    def _extract_number(self, content: str, query: str, sources: str) -> str:
        """Extract specific numbers."""
        # Find all numbers
        numbers = self.answer_patterns["number"].findall(
            content + " " + sources
        )

        if numbers:  # coverage: condition always true
            extraction_prompt = f"""Which number specifically answers this question?

Question: {query}
Found numbers: {numbers[:10]}
Context: {content[:1000]}

The answer is:"""

            try:
                answer = self.llm.invoke(extraction_prompt).content.strip()
                return f"{answer}. {content}"
            except Exception:
                # Fallback to first found number if LLM extraction fails
                return f"{numbers[0]}. {content}"

        return content

    def _extract_best_name(self, content: str, query: str, sources: str) -> str:
        """Extract the best matching name (not necessarily full)."""
        # Find all potential names
        names = self.answer_patterns["full_name"].findall(
            content + " " + sources
        )

        if names:  # coverage: condition always true
            # Count frequency
            name_counts = {}
            for name in names:
                name_counts[name] = name_counts.get(name, 0) + 1

            # Get most frequent
            best_name = max(name_counts.items(), key=lambda x: x[1])[0]
            return f"{best_name}. {content}"

        return content

    def _extract_key_facts(
        self, previous_knowledge: str, question_type: str
    ) -> str:
        """Extract key facts from previous knowledge."""
        extraction_prompt = f"""Extract key facts related to a {question_type} question from this knowledge:

{previous_knowledge[:1500]}

List the most important facts (names, numbers, dates) found:"""

        try:
            facts = self.llm.invoke(extraction_prompt).content
            return facts[:500]
        except Exception:
            # Fallback to truncated previous knowledge if LLM extraction fails
            return previous_knowledge[:500]