Coverage for src/local_deep_research/benchmarks/evaluators/composite.py: 19%

31 statements  

coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

1""" 

2Composite benchmark evaluator. 

3 

4This module provides a composite evaluator that can run multiple benchmarks 

5with weighted scores to provide a comprehensive evaluation. 

6""" 

7 

8from typing import Any, Dict, Optional 

9 

10from loguru import logger 

11 

12# Import specific evaluator implementations 

13from .browsecomp import BrowseCompEvaluator 

14from .simpleqa import SimpleQAEvaluator 

15 

16 

17class CompositeBenchmarkEvaluator: 

18 """ 

19 Evaluator that combines multiple benchmarks with weighted scores. 

20 

21 This evaluator runs multiple benchmark types and combines their scores 

22 according to specified weights, enabling comprehensive evaluation across 

23 different metrics and tasks. 

24 """ 

25 

26 def __init__(self, benchmark_weights: Optional[Dict[str, float]] = None): 

27 """ 

28 Initialize with benchmark weights. 

29 

30 Args: 

31 benchmark_weights: Dictionary mapping benchmark names to weights 

32 Default: {"simpleqa": 1.0} 

33 """ 

34 # Default to SimpleQA only if no weights provided 

35 self.benchmark_weights = benchmark_weights or {"simpleqa": 1.0} 

36 

37 # Create evaluators for available benchmarks 

38 self.evaluators = { 

39 "simpleqa": SimpleQAEvaluator(), 

40 "browsecomp": BrowseCompEvaluator(), 

41 } 

42 

43 # Normalize weights to sum to 1.0 

44 total_weight = sum(self.benchmark_weights.values()) 

45 if total_weight <= 0: 

46 logger.warning( 

47 "Total benchmark weight is zero or negative. Using default weights." 

48 ) 

49 self.normalized_weights = {"simpleqa": 1.0} 

50 else: 

51 self.normalized_weights = { 

52 k: w / total_weight for k, w in self.benchmark_weights.items() 

53 } 

54 
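        # For example, benchmark_weights={"simpleqa": 2.0, "browsecomp": 1.0}
        # normalizes to roughly {"simpleqa": 0.67, "browsecomp": 0.33}.
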

        # Log the weights being used
        logger.info(
            f"Using normalized benchmark weights: {self.normalized_weights}"
        )

    def evaluate(
        self,
        system_config: Dict[str, Any],
        num_examples: int,
        output_dir: str,
    ) -> Dict[str, Any]:
        """
        Run all requested benchmarks and compute weighted score.

        Args:
            system_config: Configuration parameters for the system under test
            num_examples: Number of benchmark examples to evaluate
            output_dir: Directory to save evaluation results

        Returns:
            Dictionary with combined metrics and individual benchmark results
        """
        all_results = {}
        combined_score = 0.0

        # Run each benchmark with weight > 0
        for benchmark_name, weight in self.normalized_weights.items():
            if weight > 0 and benchmark_name in self.evaluators:
                evaluator = self.evaluators[benchmark_name]

                try:
                    # Run benchmark evaluation
                    result = evaluator.evaluate(
                        system_config=system_config,
                        num_examples=num_examples,
                        output_dir=output_dir,
                    )

                    # Store individual results
                    all_results[benchmark_name] = result

                    # Calculate weighted contribution to combined score
                    quality_score = result.get("quality_score", 0.0)
                    weighted_contribution = quality_score * weight

                    logger.info(
                        f"Benchmark {benchmark_name}: score={quality_score:.4f}, "
                        f"weight={weight:.2f}, contribution={weighted_contribution:.4f}"
                    )

                    # Add to combined score
                    combined_score += weighted_contribution
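                    # e.g. two benchmarks with weights 0.67/0.33 and scores
                    # 0.8/0.5 combine to 0.8 * 0.67 + 0.5 * 0.33 ≈ 0.70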


                except Exception as e:
                    logger.exception(
                        f"Error running {benchmark_name} benchmark: {e!s}"
                    )
                    all_results[benchmark_name] = {
                        "benchmark_type": benchmark_name,
                        "error": str(e),
                        "quality_score": 0.0,
                    }

        # Return combined results
        return {
            "quality_score": combined_score,
            "benchmark_results": all_results,
            "benchmark_weights": self.normalized_weights,
            "combined_score": combined_score,
        }
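
A minimal usage sketch of the class above. The import path is inferred from the file path in the report header, and the system_config keys, example count, and output directory are placeholders; the underlying SimpleQA and BrowseComp evaluators are assumed to accept the keyword arguments shown in the source.

from local_deep_research.benchmarks.evaluators.composite import (
    CompositeBenchmarkEvaluator,
)

# Weight SimpleQA twice as heavily as BrowseComp; the constructor
# normalizes these to roughly 0.67 and 0.33.
evaluator = CompositeBenchmarkEvaluator(
    benchmark_weights={"simpleqa": 2.0, "browsecomp": 1.0}
)

# Placeholder configuration: the real keys depend on the system under test.
results = evaluator.evaluate(
    system_config={"model": "example-model"},
    num_examples=10,
    output_dir="benchmark_results",
)

# "quality_score" is the weighted combination of per-benchmark scores;
# "benchmark_results" holds each benchmark's raw result dictionary.
print(results["quality_score"])
print(results["benchmark_results"].keys())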