Coverage for src/local_deep_research/advanced_search_system/constraint_checking/dual_confidence_checker.py: 90%

116 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1""" 

2Dual confidence constraint checker implementation. 

3 

4This implementation uses dual confidence scoring (positive/negative/uncertainty) 

5to evaluate constraints and make rejection decisions. 

6""" 

7 

8from typing import Dict, List, Tuple 

9 

10from loguru import logger 

11 

12from ..candidates.base_candidate import Candidate 

13from ..constraints.base_constraint import Constraint 

14from .base_constraint_checker import ( 

15 BaseConstraintChecker, 

16 ConstraintCheckResult, 

17) 

18from .evidence_analyzer import ConstraintEvidence, EvidenceAnalyzer 

19 

20 

class DualConfidenceChecker(BaseConstraintChecker):
    """
    Constraint checker using dual confidence scoring.

    This checker:
    1. Analyzes evidence using positive/negative/uncertainty scores
    2. Makes rejection decisions based on confidence thresholds
    3. Provides detailed scoring breakdown
    """

    def __init__(
        self,
        *args,
        negative_threshold: float = 0.25,  # Reject if negative evidence > 25%
        positive_threshold: float = 0.4,  # Reject if positive evidence < 40%
        uncertainty_penalty: float = 0.2,
        negative_weight: float = 0.5,
        uncertainty_threshold: float = 0.6,  # Re-evaluate if uncertainty > 60%
        max_reevaluations: int = 2,  # Maximum re-evaluation rounds
        **kwargs,
    ):
        """
        Initialize dual confidence checker.

        Args:
            negative_threshold: Threshold for negative evidence rejection
            positive_threshold: Minimum positive evidence required
            uncertainty_penalty: Penalty for uncertain evidence
            negative_weight: Weight for negative evidence in scoring
            uncertainty_threshold: Re-evaluate if uncertainty exceeds this
            max_reevaluations: Maximum number of re-evaluation rounds
        """
        super().__init__(*args, **kwargs)

        self.negative_threshold = negative_threshold
        self.positive_threshold = positive_threshold
        self.uncertainty_penalty = uncertainty_penalty
        self.negative_weight = negative_weight
        self.uncertainty_threshold = uncertainty_threshold
        self.max_reevaluations = max_reevaluations

        # Evidence analyzer shares the checker's model (self.model is set by
        # BaseConstraintChecker.__init__ — confirm against the base class).
        self.evidence_analyzer = EvidenceAnalyzer(self.model)

    def check_candidate(
        self,
        candidate: Candidate,
        constraints: List[Constraint],
        original_query: str = None,
    ) -> ConstraintCheckResult:
        """Check candidate using dual confidence analysis with LLM pre-screening.

        Args:
            candidate: Candidate answer to evaluate.
            constraints: Constraints the candidate must satisfy.
            original_query: Optional original user query; enables the cheap
                LLM pre-screen before per-constraint evidence gathering.

        Returns:
            ConstraintCheckResult with per-constraint scores, an overall
            weighted score, and a rejection decision/reason if applicable.
        """
        logger.info(f"Checking candidate: {candidate.name} (dual confidence)")

        # LLM PRE-SCREENING: one call over all constraints to save search
        # (SearXNG) capacity before the expensive evidence-gathering loop.
        pre_screen_result = self._llm_prescreen_candidate(
            candidate, constraints, original_query
        )
        if pre_screen_result["should_reject"]:
            logger.info(
                f"🚫 LLM pre-screen rejected {candidate.name}: {pre_screen_result['reason']}"
            )
            return ConstraintCheckResult(
                candidate=candidate,
                constraint_scores={},
                should_reject=True,
                rejection_reason=pre_screen_result["reason"],
                total_score=0.0,
                detailed_results=pre_screen_result["detailed_results"],
            )

        constraint_scores = {}
        detailed_results = []
        rejection_reason = None
        should_reject = False

        for constraint in constraints:
            # Initial evaluation with automatic re-evaluation when uncertainty
            # is high; all constraints are still evaluated even after a
            # rejection so the detailed breakdown is complete.
            result = self._evaluate_constraint_with_reevaluation(
                candidate, constraint
            )

            avg_positive = result["positive"]
            avg_negative = result["negative"]
            avg_uncertainty = result["uncertainty"]
            score = result["score"]
            reevaluation_count = result.get("reevaluation_count", 0)

            # Check for rejection based on final results
            reject, reason = self.should_reject_candidate_from_averages(
                candidate, constraint, avg_positive, avg_negative
            )

            if reject and not should_reject:  # Only record first rejection
                should_reject = True
                rejection_reason = reason

            # Store results (keyed by constraint value; later constraints with
            # the same value overwrite earlier ones).
            constraint_scores[constraint.value] = {
                "total": score,
                "positive": avg_positive,
                "negative": avg_negative,
                "uncertainty": avg_uncertainty,
                "weight": constraint.weight,
                "reevaluation_count": reevaluation_count,
            }

            detailed_results.append(
                {
                    "constraint": constraint.value,
                    "score": score,
                    "positive": avg_positive,
                    "negative": avg_negative,
                    "uncertainty": avg_uncertainty,
                    "weight": constraint.weight,
                    "type": constraint.type.value,
                    "reevaluation_count": reevaluation_count,
                }
            )

            # Log detailed result with re-evaluation info
            self._log_constraint_result_detailed(
                candidate,
                constraint,
                score,
                avg_positive,
                avg_negative,
                avg_uncertainty,
                reevaluation_count,
            )

        # Rejected candidates score 0; otherwise aggregate weighted scores.
        if should_reject:
            total_score = 0.0
        else:
            if detailed_results:
                weights = [r["weight"] for r in detailed_results]
                scores = [r["score"] for r in detailed_results]
                total_score = self._calculate_weighted_score(scores, weights)
            else:
                total_score = 0.0

        logger.info(f"Final score for {candidate.name}: {total_score:.2%}")

        return ConstraintCheckResult(
            candidate=candidate,
            total_score=total_score,
            constraint_scores=constraint_scores,
            should_reject=should_reject,
            rejection_reason=rejection_reason,
            detailed_results=detailed_results,
        )

    def _evaluate_constraint_with_reevaluation(
        self, candidate: Candidate, constraint: Constraint
    ) -> Dict:
        """Evaluate constraint with potential re-evaluation for uncertain results.

        Re-runs evidence gathering up to ``self.max_reevaluations`` extra
        times while average uncertainty exceeds ``self.uncertainty_threshold``
        and the candidate is not already an early rejection.

        Returns:
            Dict with keys "positive", "negative", "uncertainty", "score",
            "evidence_list", and "reevaluation_count".
        """
        reevaluation_count = 0

        # NOTE: a `while True` with guaranteed in-loop returns replaces the
        # previous bounded `while` whose post-loop fallback referenced
        # variables that were unbound when the loop body never ran
        # (max_reevaluations < 0 raised UnboundLocalError).
        while True:
            # Gather evidence (fresh each time for re-evaluation)
            evidence_list = self._gather_evidence_for_constraint(
                candidate, constraint
            )

            if not evidence_list:
                # No evidence at all: fully uncertain, neutral score minus
                # the uncertainty penalty.
                return {
                    "positive": 0.0,
                    "negative": 0.0,
                    "uncertainty": 1.0,
                    "score": 0.5 - self.uncertainty_penalty,
                    "evidence_list": [],
                    "reevaluation_count": reevaluation_count,
                }

            # Analyze each piece of evidence with dual confidence scoring.
            dual_evidence = [
                self.evidence_analyzer.analyze_evidence_dual_confidence(
                    e, constraint
                )
                for e in evidence_list
            ]

            # Average the three confidence channels over all evidence.
            n = len(dual_evidence)
            avg_positive = sum(e.positive_confidence for e in dual_evidence) / n
            avg_negative = sum(e.negative_confidence for e in dual_evidence) / n
            avg_uncertainty = sum(e.uncertainty for e in dual_evidence) / n

            # Combined score from the analyzer (applies penalty/weighting).
            score = self.evidence_analyzer.evaluate_evidence_list(
                evidence_list,
                constraint,
                self.uncertainty_penalty,
                self.negative_weight,
            )

            # Re-evaluate only while rounds remain, uncertainty is high, and
            # the result is not already a clear rejection.
            if (
                reevaluation_count < self.max_reevaluations
                and avg_uncertainty > self.uncertainty_threshold
                and not self._should_early_reject(avg_positive, avg_negative)
            ):
                reevaluation_count += 1
                logger.info(
                    f"🔄 Re-evaluating {candidate.name} | {constraint.value} "
                    f"(round {reevaluation_count}) - high uncertainty: {avg_uncertainty:.0%}"
                )
                continue

            # Final result or early rejection
            if reevaluation_count > 0:
                logger.info(
                    f"✅ Final evaluation for {candidate.name} | {constraint.value} "
                    f"after {reevaluation_count} re-evaluation(s)"
                )

            return {
                "positive": avg_positive,
                "negative": avg_negative,
                "uncertainty": avg_uncertainty,
                "score": score,
                "evidence_list": evidence_list,
                "reevaluation_count": reevaluation_count,
            }

    def _should_early_reject(
        self, avg_positive: float, avg_negative: float
    ) -> bool:
        """Check if candidate should be rejected early (before re-evaluation)."""
        return (
            avg_negative > self.negative_threshold
            or avg_positive < self.positive_threshold
        )

    def should_reject_candidate_from_averages(
        self,
        candidate: Candidate,
        constraint: Constraint,
        avg_positive: float,
        avg_negative: float,
    ) -> Tuple[bool, str]:
        """Determine rejection based on average confidence scores.

        Returns:
            (True, reason) when the candidate fails the constraint,
            (False, "") otherwise.
        """
        # PRIMARY REJECTION: High negative evidence
        if avg_negative > self.negative_threshold:
            reason = f"High negative evidence ({avg_negative:.0%}) for constraint '{constraint.value}'"
            logger.info(f"❌ REJECTION: {candidate.name} - {reason}")
            return True, reason

        # SECONDARY REJECTION: Low positive evidence
        if avg_positive < self.positive_threshold:
            reason = f"Insufficient positive evidence ({avg_positive:.0%}) for constraint '{constraint.value}'"
            logger.info(f"❌ REJECTION: {candidate.name} - {reason}")
            return True, reason

        return False, ""

    def should_reject_candidate(
        self,
        candidate: Candidate,
        constraint: Constraint,
        dual_evidence: List[ConstraintEvidence],
    ) -> Tuple[bool, str]:
        """Determine rejection based on dual confidence scores.

        Averages the evidence and delegates to
        ``should_reject_candidate_from_averages`` so the threshold logic
        lives in exactly one place (previously duplicated verbatim here).
        """
        if not dual_evidence:
            return False, ""

        # Average the positive/negative channels over all evidence.
        n = len(dual_evidence)
        avg_positive = sum(e.positive_confidence for e in dual_evidence) / n
        avg_negative = sum(e.negative_confidence for e in dual_evidence) / n

        return self.should_reject_candidate_from_averages(
            candidate, constraint, avg_positive, avg_negative
        )

    def _log_constraint_result_detailed(
        self,
        candidate,
        constraint,
        score,
        positive,
        negative,
        uncertainty,
        reevaluation_count=0,
    ):
        """Log detailed constraint result.

        Symbols: ✓ for score >= 80%, ○ for >= 50%, ✗ otherwise; an
        ``[Rn]`` suffix marks results that required n re-evaluations.
        """
        symbol = "✓" if score >= 0.8 else "○" if score >= 0.5 else "✗"

        # Add re-evaluation indicator
        reeval_indicator = (
            f" [R{reevaluation_count}]" if reevaluation_count > 0 else ""
        )

        logger.info(
            f"{symbol} {candidate.name} | {constraint.value}: {int(score * 100)}% "
            f"(+{int(positive * 100)}% -{int(negative * 100)}% ?{int(uncertainty * 100)}%){reeval_indicator}"
        )

    def _llm_prescreen_candidate(
        self, candidate, constraints, original_query=None
    ):
        """Simple quality check for answer candidates.

        Asks the model for a single 0-100 quality rating of the candidate
        as an answer to ``original_query``. Accepts (no rejection) whenever
        the query is missing, the score is >= 50, the score cannot be
        parsed, or the model call fails — rejection here is best-effort.

        Returns:
            Dict with "should_reject" (bool), "reason" (str), and
            "detailed_results" (list of pseudo-constraint entries).
        """
        import re

        if not original_query:
            return {
                "should_reject": False,
                "reason": "No original query provided",
                "detailed_results": [],
            }

        prompt = f"""Question: {original_query}
Answer: {candidate.name}

Is this a good answer to the question? Rate 0-100 where:
- 90-100: Excellent direct answer
- 70-89: Good answer
- 50-69: Partial answer
- 30-49: Weak answer
- 0-29: Poor/wrong answer

Just give the number:"""

        try:
            response = self.model.invoke(prompt).content

            # Parse the first 1-3 digit run as the confidence score.
            confidence_match = re.search(r"(\d{1,3})", response.strip())

            if confidence_match:
                quality_score = int(confidence_match.group(1))

                # Accept good answers (50+ out of 100)
                if quality_score >= 50:
                    return {
                        "should_reject": False,
                        "reason": f"Good answer quality: {quality_score}%",
                        "detailed_results": [
                            {
                                "constraint": "answer_quality",
                                "positive_confidence": quality_score / 100.0,
                                "source": "answer_quality_check",
                            }
                        ],
                    }
                return {
                    "should_reject": True,
                    "reason": f"Poor answer quality: {quality_score}%",
                    "detailed_results": [
                        {
                            "constraint": "answer_quality",
                            "negative_confidence": (100 - quality_score)
                            / 100.0,
                            "source": "answer_quality_check",
                        }
                    ],
                }

            # Parsing failed - accept by default
            return {
                "should_reject": False,
                "reason": "Could not parse quality score - accepting",
                "detailed_results": [],
            }

        except Exception:
            # logger.exception records the traceback the bare warning
            # previously discarded; the accept-by-default behavior is kept.
            logger.exception(
                f"Fast LLM pre-screening failed for {candidate.name}"
            )
            return {
                "should_reject": False,
                "reason": "",
                "detailed_results": [],
            }