Coverage for src/local_deep_research/benchmarks/evaluators/composite.py: 19% (31 statements)
1"""
2Composite benchmark evaluator.
4This module provides a composite evaluator that can run multiple benchmarks
5with weighted scores to provide a comprehensive evaluation.
6"""
8from typing import Any, Dict, Optional
10from loguru import logger
12# Import specific evaluator implementations
13from .browsecomp import BrowseCompEvaluator
14from .simpleqa import SimpleQAEvaluator


class CompositeBenchmarkEvaluator:
    """
    Evaluator that combines multiple benchmarks with weighted scores.

    This evaluator runs multiple benchmark types and combines their scores
    according to specified weights, enabling comprehensive evaluation across
    different metrics and tasks.
    """

    def __init__(self, benchmark_weights: Optional[Dict[str, float]] = None):
        """
        Initialize with benchmark weights.

        Args:
            benchmark_weights: Dictionary mapping benchmark names to weights.
                Default: {"simpleqa": 1.0}
        """
        # Default to SimpleQA only if no weights provided
        self.benchmark_weights = benchmark_weights or {"simpleqa": 1.0}

        # Create evaluators for available benchmarks
        self.evaluators = {
            "simpleqa": SimpleQAEvaluator(),
            "browsecomp": BrowseCompEvaluator(),
        }

        # Normalize weights to sum to 1.0
        total_weight = sum(self.benchmark_weights.values())
        if total_weight <= 0:
            logger.warning(
                "Total benchmark weight is zero or negative. Using default weights."
            )
            self.normalized_weights = {"simpleqa": 1.0}
        else:
            self.normalized_weights = {
                k: w / total_weight for k, w in self.benchmark_weights.items()
            }
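
        # Worked example (illustrative): weights {"simpleqa": 2.0, "browsecomp": 1.0}
        # sum to 3.0 and normalize to {"simpleqa": 0.667, "browsecomp": 0.333},
        # so SimpleQA contributes twice the weight of BrowseComp to the
        # combined score.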

        # Log the weights being used
        logger.info(
            f"Using normalized benchmark weights: {self.normalized_weights}"
        )

    def evaluate(
        self,
        system_config: Dict[str, Any],
        num_examples: int,
        output_dir: str,
    ) -> Dict[str, Any]:
        """
        Run all requested benchmarks and compute the weighted score.

        Args:
            system_config: Configuration parameters for the system under test
            num_examples: Number of benchmark examples to evaluate
            output_dir: Directory to save evaluation results

        Returns:
            Dictionary with combined metrics and individual benchmark results
        """
        all_results = {}
        combined_score = 0.0

        # Run each benchmark with weight > 0
        for benchmark_name, weight in self.normalized_weights.items():
            if weight > 0 and benchmark_name in self.evaluators:
                evaluator = self.evaluators[benchmark_name]

                try:
                    # Run benchmark evaluation
                    result = evaluator.evaluate(
                        system_config=system_config,
                        num_examples=num_examples,
                        output_dir=output_dir,
                    )

                    # Store individual results
                    all_results[benchmark_name] = result

                    # Calculate weighted contribution to combined score
                    quality_score = result.get("quality_score", 0.0)
                    weighted_contribution = quality_score * weight

                    logger.info(
                        f"Benchmark {benchmark_name}: score={quality_score:.4f}, "
                        f"weight={weight:.2f}, contribution={weighted_contribution:.4f}"
                    )

                    # Add to combined score
                    combined_score += weighted_contribution

                except Exception as e:
                    logger.exception(
                        f"Error running {benchmark_name} benchmark: {e!s}"
                    )
                    all_results[benchmark_name] = {
                        "benchmark_type": benchmark_name,
                        "error": str(e),
                        "quality_score": 0.0,
                    }
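                    # Design note: a failed benchmark is recorded with
                    # quality_score 0.0, so the remaining benchmarks still run
                    # and the combined score simply loses that contribution.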

        # Return combined results
        return {
            "quality_score": combined_score,
            "benchmark_results": all_results,
            "benchmark_weights": self.normalized_weights,
            "combined_score": combined_score,
        }
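

# --- Minimal usage sketch (illustrative; not part of the original module) ---
# The system_config keys below ("model", "search_tool") are assumptions made
# for demonstration only; the real keys depend on what SimpleQAEvaluator and
# BrowseCompEvaluator expect in this repository.
if __name__ == "__main__":
    evaluator = CompositeBenchmarkEvaluator(
        benchmark_weights={"simpleqa": 2.0, "browsecomp": 1.0}
    )
    results = evaluator.evaluate(
        system_config={"model": "example-model", "search_tool": "example-search"},
        num_examples=5,
        output_dir="benchmark_results",
    )
    print(f"Combined quality score: {results['quality_score']:.4f}")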