Coverage for src/local_deep_research/advanced_search_system/constraint_checking/dual_confidence_checker.py: 90%

116 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1""" 

2Dual confidence constraint checker implementation. 

3 

4This implementation uses dual confidence scoring (positive/negative/uncertainty) 

5to evaluate constraints and make rejection decisions. 

6""" 

7 

8from typing import Dict, List, Tuple 

9 

10from loguru import logger 

11 

12from ..candidates.base_candidate import Candidate 

13from ..constraints.base_constraint import Constraint 

14from .base_constraint_checker import ( 

15 BaseConstraintChecker, 

16 ConstraintCheckResult, 

17) 

18from .evidence_analyzer import ConstraintEvidence, EvidenceAnalyzer 

19 

20 

class DualConfidenceChecker(BaseConstraintChecker):
    """
    Constraint checker using dual confidence scoring.

    This checker:
    1. Analyzes evidence using positive/negative/uncertainty scores
    2. Makes rejection decisions based on confidence thresholds
    3. Provides detailed scoring breakdown
    """

    def __init__(
        self,
        *args,
        negative_threshold: float = 0.25,  # Reject if negative evidence > 25%
        positive_threshold: float = 0.4,  # Reject if positive evidence < 40%
        uncertainty_penalty: float = 0.2,
        negative_weight: float = 0.5,
        uncertainty_threshold: float = 0.6,  # Re-evaluate if uncertainty > 60%
        max_reevaluations: int = 2,  # Maximum re-evaluation rounds
        **kwargs,
    ):
        """
        Initialize dual confidence checker.

        Args:
            negative_threshold: Threshold for negative evidence rejection
            positive_threshold: Minimum positive evidence required
            uncertainty_penalty: Penalty for uncertain evidence
            negative_weight: Weight for negative evidence in scoring
            uncertainty_threshold: Re-evaluate if uncertainty exceeds this
            max_reevaluations: Maximum number of re-evaluation rounds
        """
        super().__init__(*args, **kwargs)

        self.negative_threshold = negative_threshold
        self.positive_threshold = positive_threshold
        self.uncertainty_penalty = uncertainty_penalty
        self.negative_weight = negative_weight
        self.uncertainty_threshold = uncertainty_threshold
        self.max_reevaluations = max_reevaluations

        # Evidence analyzer shares the checker's model (self.model is set by
        # BaseConstraintChecker.__init__ — confirm against the base class).
        self.evidence_analyzer = EvidenceAnalyzer(self.model)

    def check_candidate(
        self,
        candidate: Candidate,
        constraints: List[Constraint],
        original_query: str = None,
    ) -> ConstraintCheckResult:
        """Check candidate using dual confidence analysis with LLM pre-screening.

        Args:
            candidate: Candidate answer to evaluate.
            constraints: Constraints the candidate must satisfy.
            original_query: Optional original user query; enables the cheap
                LLM pre-screen before per-constraint evidence gathering.

        Returns:
            ConstraintCheckResult with per-constraint scores, an overall
            weighted score, and a rejection decision/reason if applicable.
        """
        logger.info(f"Checking candidate: {candidate.name} (dual confidence)")

        # LLM PRE-SCREENING: one call over all constraints to save search
        # (SearXNG) capacity before the expensive evidence-gathering loop.
        pre_screen_result = self._llm_prescreen_candidate(
            candidate, constraints, original_query
        )
        if pre_screen_result["should_reject"]:
            logger.info(
                f"🚫 LLM pre-screen rejected {candidate.name}: {pre_screen_result['reason']}"
            )
            return ConstraintCheckResult(
                candidate=candidate,
                constraint_scores={},
                should_reject=True,
                rejection_reason=pre_screen_result["reason"],
                total_score=0.0,
                detailed_results=pre_screen_result["detailed_results"],
            )

        constraint_scores = {}
        detailed_results = []
        rejection_reason = None
        should_reject = False

        for constraint in constraints:
            # Initial evaluation with automatic re-evaluation when uncertainty
            # is high; all constraints are still evaluated even after a
            # rejection so the detailed breakdown is complete.
            result = self._evaluate_constraint_with_reevaluation(
                candidate, constraint
            )

            avg_positive = result["positive"]
            avg_negative = result["negative"]
            avg_uncertainty = result["uncertainty"]
            score = result["score"]
            reevaluation_count = result.get("reevaluation_count", 0)

            # Check for rejection based on final results
            reject, reason = self.should_reject_candidate_from_averages(
                candidate, constraint, avg_positive, avg_negative
            )

            if reject and not should_reject:  # Only record first rejection
                should_reject = True
                rejection_reason = reason

            # Store results (keyed by constraint value; later constraints with
            # the same value overwrite earlier ones).
            constraint_scores[constraint.value] = {
                "total": score,
                "positive": avg_positive,
                "negative": avg_negative,
                "uncertainty": avg_uncertainty,
                "weight": constraint.weight,
                "reevaluation_count": reevaluation_count,
            }

            detailed_results.append(
                {
                    "constraint": constraint.value,
                    "score": score,
                    "positive": avg_positive,
                    "negative": avg_negative,
                    "uncertainty": avg_uncertainty,
                    "weight": constraint.weight,
                    "type": constraint.type.value,
                    "reevaluation_count": reevaluation_count,
                }
            )

            # Log detailed result with re-evaluation info
            self._log_constraint_result_detailed(
                candidate,
                constraint,
                score,
                avg_positive,
                avg_negative,
                avg_uncertainty,
                reevaluation_count,
            )

        # Rejected candidates score 0; otherwise aggregate weighted scores.
        if should_reject:
            total_score = 0.0
        else:
            if detailed_results:
                weights = [r["weight"] for r in detailed_results]
                scores = [r["score"] for r in detailed_results]
                total_score = self._calculate_weighted_score(scores, weights)
            else:
                total_score = 0.0

        logger.info(f"Final score for {candidate.name}: {total_score:.2%}")

        return ConstraintCheckResult(
            candidate=candidate,
            total_score=total_score,
            constraint_scores=constraint_scores,
            should_reject=should_reject,
            rejection_reason=rejection_reason,
            detailed_results=detailed_results,
        )

    def _evaluate_constraint_with_reevaluation(
        self, candidate: Candidate, constraint: Constraint
    ) -> Dict:
        """Evaluate constraint with potential re-evaluation for uncertain results.

        Re-runs evidence gathering up to ``self.max_reevaluations`` extra
        times while average uncertainty exceeds ``self.uncertainty_threshold``
        and the candidate is not already an early rejection.

        Returns:
            Dict with keys "positive", "negative", "uncertainty", "score",
            "evidence_list", and "reevaluation_count".
        """
        reevaluation_count = 0

        # NOTE: a `while True` with guaranteed in-loop returns replaces the
        # previous bounded `while` whose post-loop fallback referenced
        # variables that were unbound when the loop body never ran
        # (max_reevaluations < 0 raised UnboundLocalError).
        while True:
            # Gather evidence (fresh each time for re-evaluation)
            evidence_list = self._gather_evidence_for_constraint(
                candidate, constraint
            )

            if not evidence_list:
                # No evidence at all: fully uncertain, neutral score minus
                # the uncertainty penalty.
                return {
                    "positive": 0.0,
                    "negative": 0.0,
                    "uncertainty": 1.0,
                    "score": 0.5 - self.uncertainty_penalty,
                    "evidence_list": [],
                    "reevaluation_count": reevaluation_count,
                }

            # Analyze each piece of evidence with dual confidence scoring.
            dual_evidence = [
                self.evidence_analyzer.analyze_evidence_dual_confidence(
                    e, constraint
                )
                for e in evidence_list
            ]

            # Average the three confidence channels over all evidence.
            n = len(dual_evidence)
            avg_positive = sum(e.positive_confidence for e in dual_evidence) / n
            avg_negative = sum(e.negative_confidence for e in dual_evidence) / n
            avg_uncertainty = sum(e.uncertainty for e in dual_evidence) / n

            # Combined score from the analyzer (applies penalty/weighting).
            score = self.evidence_analyzer.evaluate_evidence_list(
                evidence_list,
                constraint,
                self.uncertainty_penalty,
                self.negative_weight,
            )

            # Re-evaluate only while rounds remain, uncertainty is high, and
            # the result is not already a clear rejection.
            if (
                reevaluation_count < self.max_reevaluations
                and avg_uncertainty > self.uncertainty_threshold
                and not self._should_early_reject(avg_positive, avg_negative)
            ):
                reevaluation_count += 1
                logger.info(
                    f"🔄 Re-evaluating {candidate.name} | {constraint.value} "
                    f"(round {reevaluation_count}) - high uncertainty: {avg_uncertainty:.0%}"
                )
                continue

            # Final result or early rejection
            if reevaluation_count > 0:
                logger.info(
                    f"✅ Final evaluation for {candidate.name} | {constraint.value} "
                    f"after {reevaluation_count} re-evaluation(s)"
                )

            return {
                "positive": avg_positive,
                "negative": avg_negative,
                "uncertainty": avg_uncertainty,
                "score": score,
                "evidence_list": evidence_list,
                "reevaluation_count": reevaluation_count,
            }

    def _should_early_reject(
        self, avg_positive: float, avg_negative: float
    ) -> bool:
        """Check if candidate should be rejected early (before re-evaluation)."""
        return (
            avg_negative > self.negative_threshold
            or avg_positive < self.positive_threshold
        )

    def should_reject_candidate_from_averages(
        self,
        candidate: Candidate,
        constraint: Constraint,
        avg_positive: float,
        avg_negative: float,
    ) -> Tuple[bool, str]:
        """Determine rejection based on average confidence scores.

        Returns:
            (True, reason) when the candidate fails the constraint,
            (False, "") otherwise.
        """
        # PRIMARY REJECTION: High negative evidence
        if avg_negative > self.negative_threshold:
            reason = f"High negative evidence ({avg_negative:.0%}) for constraint '{constraint.value}'"
            logger.info(f"❌ REJECTION: {candidate.name} - {reason}")
            return True, reason

        # SECONDARY REJECTION: Low positive evidence
        if avg_positive < self.positive_threshold:
            reason = f"Insufficient positive evidence ({avg_positive:.0%}) for constraint '{constraint.value}'"
            logger.info(f"❌ REJECTION: {candidate.name} - {reason}")
            return True, reason

        return False, ""

    def should_reject_candidate(
        self,
        candidate: Candidate,
        constraint: Constraint,
        dual_evidence: List[ConstraintEvidence],
    ) -> Tuple[bool, str]:
        """Determine rejection based on dual confidence scores.

        Averages the evidence and delegates to
        ``should_reject_candidate_from_averages`` so the threshold logic
        lives in exactly one place (previously duplicated verbatim here).
        """
        if not dual_evidence:
            return False, ""

        # Average the positive/negative channels over all evidence.
        n = len(dual_evidence)
        avg_positive = sum(e.positive_confidence for e in dual_evidence) / n
        avg_negative = sum(e.negative_confidence for e in dual_evidence) / n

        return self.should_reject_candidate_from_averages(
            candidate, constraint, avg_positive, avg_negative
        )

    def _log_constraint_result_detailed(
        self,
        candidate,
        constraint,
        score,
        positive,
        negative,
        uncertainty,
        reevaluation_count=0,
    ):
        """Log detailed constraint result.

        Symbols: ✓ for score >= 80%, ○ for >= 50%, ✗ otherwise; an
        ``[Rn]`` suffix marks results that required n re-evaluations.
        """
        symbol = "✓" if score >= 0.8 else "○" if score >= 0.5 else "✗"

        # Add re-evaluation indicator
        reeval_indicator = (
            f" [R{reevaluation_count}]" if reevaluation_count > 0 else ""
        )

        logger.info(
            f"{symbol} {candidate.name} | {constraint.value}: {int(score * 100)}% "
            f"(+{int(positive * 100)}% -{int(negative * 100)}% ?{int(uncertainty * 100)}%){reeval_indicator}"
        )

    def _llm_prescreen_candidate(
        self, candidate, constraints, original_query=None
    ):
        """Simple quality check for answer candidates.

        Asks the model for a single 0-100 quality rating of the candidate
        as an answer to ``original_query``. Accepts (no rejection) whenever
        the query is missing, the score is >= 50, the score cannot be
        parsed, or the model call fails — rejection here is best-effort.

        Returns:
            Dict with "should_reject" (bool), "reason" (str), and
            "detailed_results" (list of pseudo-constraint entries).
        """
        import re

        if not original_query:
            return {
                "should_reject": False,
                "reason": "No original query provided",
                "detailed_results": [],
            }

        prompt = f"""Question: {original_query}
Answer: {candidate.name}

Is this a good answer to the question? Rate 0-100 where:
- 90-100: Excellent direct answer
- 70-89: Good answer
- 50-69: Partial answer
- 30-49: Weak answer
- 0-29: Poor/wrong answer

Just give the number:"""

        try:
            response = self.model.invoke(prompt).content

            # Parse the first 1-3 digit run as the confidence score.
            confidence_match = re.search(r"(\d{1,3})", response.strip())

            if confidence_match:
                quality_score = int(confidence_match.group(1))

                # Accept good answers (50+ out of 100)
                if quality_score >= 50:
                    return {
                        "should_reject": False,
                        "reason": f"Good answer quality: {quality_score}%",
                        "detailed_results": [
                            {
                                "constraint": "answer_quality",
                                "positive_confidence": quality_score / 100.0,
                                "source": "answer_quality_check",
                            }
                        ],
                    }
                return {
                    "should_reject": True,
                    "reason": f"Poor answer quality: {quality_score}%",
                    "detailed_results": [
                        {
                            "constraint": "answer_quality",
                            "negative_confidence": (100 - quality_score)
                            / 100.0,
                            "source": "answer_quality_check",
                        }
                    ],
                }

            # Parsing failed - accept by default
            return {
                "should_reject": False,
                "reason": "Could not parse quality score - accepting",
                "detailed_results": [],
            }

        except Exception:
            # logger.exception records the traceback the bare warning
            # previously discarded; the accept-by-default behavior is kept.
            logger.exception(
                f"Fast LLM pre-screening failed for {candidate.name}"
            )
            return {
                "should_reject": False,
                "reason": "",
                "detailed_results": [],
            }