Coverage for src/local_deep_research/advanced_search_system/constraint_checking/dual_confidence_checker.py: 10%
116 statements
coverage.py v7.12.0, created at 2026-01-11 00:51 +0000
1"""
2Dual confidence constraint checker implementation.
4This implementation uses dual confidence scoring (positive/negative/uncertainty)
5to evaluate constraints and make rejection decisions.
6"""
8from typing import Dict, List, Tuple
10from loguru import logger
12from ..candidates.base_candidate import Candidate
13from ..constraints.base_constraint import Constraint
14from .base_constraint_checker import (
15 BaseConstraintChecker,
16 ConstraintCheckResult,
17)
18from .evidence_analyzer import ConstraintEvidence, EvidenceAnalyzer


class DualConfidenceChecker(BaseConstraintChecker):
    """
    Constraint checker using dual confidence scoring.

    This checker:
    1. Analyzes evidence using positive/negative/uncertainty scores
    2. Makes rejection decisions based on confidence thresholds
    3. Provides detailed scoring breakdown
    """

    def __init__(
        self,
        *args,
        negative_threshold: float = 0.25,  # Reject if negative evidence > 25%
        positive_threshold: float = 0.4,  # Reject if positive evidence < 40%
        uncertainty_penalty: float = 0.2,
        negative_weight: float = 0.5,
        uncertainty_threshold: float = 0.6,  # Re-evaluate if uncertainty > 60%
        max_reevaluations: int = 2,  # Maximum re-evaluation rounds
        **kwargs,
    ):
        """
        Initialize dual confidence checker.

        Args:
            negative_threshold: Threshold for negative evidence rejection
            positive_threshold: Minimum positive evidence required
            uncertainty_penalty: Penalty for uncertain evidence
            negative_weight: Weight for negative evidence in scoring
            uncertainty_threshold: Re-evaluate if uncertainty exceeds this
            max_reevaluations: Maximum number of re-evaluation rounds
        """
        super().__init__(*args, **kwargs)

        self.negative_threshold = negative_threshold
        self.positive_threshold = positive_threshold
        self.uncertainty_penalty = uncertainty_penalty
        self.negative_weight = negative_weight
        self.uncertainty_threshold = uncertainty_threshold
        self.max_reevaluations = max_reevaluations

        # Initialize evidence analyzer
        self.evidence_analyzer = EvidenceAnalyzer(self.model)


    def check_candidate(
        self,
        candidate: Candidate,
        constraints: List[Constraint],
        original_query: str = None,
    ) -> ConstraintCheckResult:
        """Check candidate using dual confidence analysis with LLM pre-screening."""
        logger.info(f"Checking candidate: {candidate.name} (dual confidence)")

        # LLM PRE-SCREENING: Check all constraints in one call to save SearXNG capacity
        pre_screen_result = self._llm_prescreen_candidate(
            candidate, constraints, original_query
        )
        if pre_screen_result["should_reject"]:
            logger.info(
                f"🚫 LLM pre-screen rejected {candidate.name}: {pre_screen_result['reason']}"
            )
            return ConstraintCheckResult(
                should_reject=True,
                rejection_reason=pre_screen_result["reason"],
                total_score=0.0,
                detailed_results=pre_screen_result["detailed_results"],
            )

        constraint_scores = {}
        detailed_results = []
        rejection_reason = None
        should_reject = False

        for constraint in constraints:
            # Perform initial evaluation with re-evaluation for uncertain constraints
            result = self._evaluate_constraint_with_reevaluation(
                candidate, constraint
            )

            avg_positive = result["positive"]
            avg_negative = result["negative"]
            avg_uncertainty = result["uncertainty"]
            score = result["score"]
            reevaluation_count = result.get("reevaluation_count", 0)

            # Check for rejection based on final results
            reject, reason = self.should_reject_candidate_from_averages(
                candidate, constraint, avg_positive, avg_negative
            )

            if reject and not should_reject:  # Only record first rejection
                should_reject = True
                rejection_reason = reason

            # Store results
            constraint_scores[constraint.value] = {
                "total": score,
                "positive": avg_positive,
                "negative": avg_negative,
                "uncertainty": avg_uncertainty,
                "weight": constraint.weight,
                "reevaluation_count": reevaluation_count,
            }

            detailed_results.append(
                {
                    "constraint": constraint.value,
                    "score": score,
                    "positive": avg_positive,
                    "negative": avg_negative,
                    "uncertainty": avg_uncertainty,
                    "weight": constraint.weight,
                    "type": constraint.type.value,
                    "reevaluation_count": reevaluation_count,
                }
            )

            # Log detailed result with re-evaluation info
            self._log_constraint_result_detailed(
                candidate,
                constraint,
                score,
                avg_positive,
                avg_negative,
                avg_uncertainty,
                reevaluation_count,
            )

        # Calculate total score
        if should_reject:
            total_score = 0.0
        else:
            if detailed_results:
                weights = [r["weight"] for r in detailed_results]
                scores = [r["score"] for r in detailed_results]
                total_score = self._calculate_weighted_score(scores, weights)
            else:
                total_score = 0.0
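
        # NOTE (illustrative assumption, not from this module): _calculate_weighted_score
        # is inherited from BaseConstraintChecker and is presumably a weight-normalized
        # average, i.e. sum(w_i * s_i) / sum(w_i). Under that assumption, scores
        # [0.9, 0.6] with weights [2.0, 1.0] would give (1.8 + 0.6) / 3.0 = 0.8.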

        logger.info(f"Final score for {candidate.name}: {total_score:.2%}")

        return ConstraintCheckResult(
            candidate=candidate,
            total_score=total_score,
            constraint_scores=constraint_scores,
            should_reject=should_reject,
            rejection_reason=rejection_reason,
            detailed_results=detailed_results,
        )

    def _evaluate_constraint_with_reevaluation(
        self, candidate: Candidate, constraint: Constraint
    ) -> Dict:
        """Evaluate constraint with potential re-evaluation for uncertain results."""
        reevaluation_count = 0
        evidence_list = []

        while reevaluation_count <= self.max_reevaluations:
            # Gather evidence (fresh each time for re-evaluation)
            evidence_list = self._gather_evidence_for_constraint(
                candidate, constraint
            )

            if not evidence_list:
                # No evidence found
                return {
                    "positive": 0.0,
                    "negative": 0.0,
                    "uncertainty": 1.0,
                    "score": 0.5 - self.uncertainty_penalty,
                    "evidence_list": [],
                    "reevaluation_count": reevaluation_count,
                }

            # Analyze with dual confidence
            dual_evidence = [
                self.evidence_analyzer.analyze_evidence_dual_confidence(
                    e, constraint
                )
                for e in evidence_list
            ]

            # Calculate averages
            avg_positive = sum(
                e.positive_confidence for e in dual_evidence
            ) / len(dual_evidence)
            avg_negative = sum(
                e.negative_confidence for e in dual_evidence
            ) / len(dual_evidence)
            avg_uncertainty = sum(e.uncertainty for e in dual_evidence) / len(
                dual_evidence
            )

            # Calculate score
            score = self.evidence_analyzer.evaluate_evidence_list(
                evidence_list,
                constraint,
                self.uncertainty_penalty,
                self.negative_weight,
            )

            # Check if we need re-evaluation
            if (
                reevaluation_count < self.max_reevaluations
                and avg_uncertainty > self.uncertainty_threshold
                and not self._should_early_reject(avg_positive, avg_negative)
            ):
                reevaluation_count += 1
                logger.info(
                    f"🔄 Re-evaluating {candidate.name} | {constraint.value} "
                    f"(round {reevaluation_count}) - high uncertainty: {avg_uncertainty:.0%}"
                )
                continue
            else:
                # Final result or early rejection
                if reevaluation_count > 0:
                    logger.info(
                        f"✅ Final evaluation for {candidate.name} | {constraint.value} "
                        f"after {reevaluation_count} re-evaluation(s)"
                    )

                return {
                    "positive": avg_positive,
                    "negative": avg_negative,
                    "uncertainty": avg_uncertainty,
                    "score": score,
                    "evidence_list": evidence_list,
                    "reevaluation_count": reevaluation_count,
                }

        # Should not reach here, but fallback
        return {
            "positive": avg_positive,
            "negative": avg_negative,
            "uncertainty": avg_uncertainty,
            "score": score,
            "evidence_list": evidence_list,
            "reevaluation_count": reevaluation_count,
        }

    def _should_early_reject(
        self, avg_positive: float, avg_negative: float
    ) -> bool:
        """Check if candidate should be rejected early (before re-evaluation)."""
        return (
            avg_negative > self.negative_threshold
            or avg_positive < self.positive_threshold
        )

    def should_reject_candidate_from_averages(
        self,
        candidate: Candidate,
        constraint: Constraint,
        avg_positive: float,
        avg_negative: float,
    ) -> Tuple[bool, str]:
        """Determine rejection based on average confidence scores."""
        # PRIMARY REJECTION: High negative evidence
        if avg_negative > self.negative_threshold:
            reason = f"High negative evidence ({avg_negative:.0%}) for constraint '{constraint.value}'"
            logger.info(f"❌ REJECTION: {candidate.name} - {reason}")
            return True, reason

        # SECONDARY REJECTION: Low positive evidence
        if avg_positive < self.positive_threshold:
            reason = f"Insufficient positive evidence ({avg_positive:.0%}) for constraint '{constraint.value}'"
            logger.info(f"❌ REJECTION: {candidate.name} - {reason}")
            return True, reason

        return False, ""
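
    # Worked example with the default thresholds (illustrative values only):
    #   avg_negative = 0.30 > negative_threshold (0.25)  -> reject (primary)
    #   avg_positive = 0.35 < positive_threshold (0.40)  -> reject (secondary)
    #   avg_positive = 0.55, avg_negative = 0.10          -> passes both checks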

    def should_reject_candidate(
        self,
        candidate: Candidate,
        constraint: Constraint,
        dual_evidence: List[ConstraintEvidence],
    ) -> Tuple[bool, str]:
        """Determine rejection based on dual confidence scores."""
        if not dual_evidence:
            return False, ""

        # Calculate averages
        avg_positive = sum(e.positive_confidence for e in dual_evidence) / len(
            dual_evidence
        )
        avg_negative = sum(e.negative_confidence for e in dual_evidence) / len(
            dual_evidence
        )

        # PRIMARY REJECTION: High negative evidence
        if avg_negative > self.negative_threshold:
            reason = f"High negative evidence ({avg_negative:.0%}) for constraint '{constraint.value}'"
            logger.info(f"❌ REJECTION: {candidate.name} - {reason}")
            return True, reason

        # SECONDARY REJECTION: Low positive evidence
        if avg_positive < self.positive_threshold:
            reason = f"Insufficient positive evidence ({avg_positive:.0%}) for constraint '{constraint.value}'"
            logger.info(f"❌ REJECTION: {candidate.name} - {reason}")
            return True, reason

        return False, ""

    def _log_constraint_result_detailed(
        self,
        candidate,
        constraint,
        score,
        positive,
        negative,
        uncertainty,
        reevaluation_count=0,
    ):
        """Log detailed constraint result."""
        symbol = "✓" if score >= 0.8 else "○" if score >= 0.5 else "✗"

        # Add re-evaluation indicator
        reeval_indicator = (
            f" [R{reevaluation_count}]" if reevaluation_count > 0 else ""
        )

        logger.info(
            f"{symbol} {candidate.name} | {constraint.value}: {int(score * 100)}% "
            f"(+{int(positive * 100)}% -{int(negative * 100)}% ?{int(uncertainty * 100)}%){reeval_indicator}"
        )
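
    # Example of the emitted log line (hypothetical candidate/constraint, made-up values):
    #   "✓ Marie Curie | won two Nobel Prizes: 85% (+85% -5% ?10%) [R1]"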

    def _llm_prescreen_candidate(
        self, candidate, constraints, original_query=None
    ):
        """Simple quality check for answer candidates."""

        if not original_query:
            return {
                "should_reject": False,
                "reason": "No original query provided",
                "detailed_results": [],
            }

        prompt = f"""Question: {original_query}
Answer: {candidate.name}

Is this a good answer to the question? Rate 0-100 where:
- 90-100: Excellent direct answer
- 70-89: Good answer
- 50-69: Partial answer
- 30-49: Weak answer
- 0-29: Poor/wrong answer

Just give the number:"""
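
        # Illustrative example (not from the source): a response such as "85" or
        # "85 - strong answer" is parsed below via the first 1-3 digit group, giving
        # quality_score = 85, which is accepted with positive_confidence 0.85.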

        try:
            response = self.model.generate(prompt)

            # Parse confidence score
            import re

            confidence_match = re.search(r"(\d{1,3})", response.strip())

            if confidence_match:
                quality_score = int(confidence_match.group(1))

                # Accept good answers (50+ out of 100)
                if quality_score >= 50:
                    return {
                        "should_reject": False,
                        "reason": f"Good answer quality: {quality_score}%",
                        "detailed_results": [
                            {
                                "constraint": "answer_quality",
                                "positive_confidence": quality_score / 100.0,
                                "source": "answer_quality_check",
                            }
                        ],
                    }
                else:
                    return {
                        "should_reject": True,
                        "reason": f"Poor answer quality: {quality_score}%",
                        "detailed_results": [
                            {
                                "constraint": "answer_quality",
                                "negative_confidence": (100 - quality_score)
                                / 100.0,
                                "source": "answer_quality_check",
                            }
                        ],
                    }

            # Parsing failed - accept by default
            return {
                "should_reject": False,
                "reason": "Could not parse quality score - accepting",
                "detailed_results": [],
            }

        except Exception as e:
            logger.warning(
                f"Fast LLM pre-screening failed for {candidate.name}: {e}"
            )
            return {
                "should_reject": False,
                "reason": "",
                "detailed_results": [],
            }
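

# --- Usage sketch (illustrative only; not part of the original module) ---
# Assumes BaseConstraintChecker accepts a `model` exposing .generate(prompt) -> str
# and that Candidate/Constraint instances are available as shown; check the
# respective modules for the real constructor signatures.
#
#   checker = DualConfidenceChecker(
#       model=my_llm,                     # hypothetical LLM wrapper
#       negative_threshold=0.25,
#       positive_threshold=0.4,
#   )
#   result = checker.check_candidate(
#       candidate=Candidate(name="Marie Curie"),
#       constraints=[some_constraint],    # hypothetical Constraint instance
#       original_query="Which scientist won two Nobel Prizes?",
#   )
#   if result.should_reject:
#       print(result.rejection_reason)
#   else:
#       print(f"Total score: {result.total_score:.2%}")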