Coverage for src/local_deep_research/benchmarks/evaluators/composite.py: 19%

31 statements  

coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

1""" 

2Composite benchmark evaluator. 

3 

4This module provides a composite evaluator that can run multiple benchmarks 

5with weighted scores to provide a comprehensive evaluation. 

6""" 

7 

8from typing import Any, Dict, Optional 

9 

10from loguru import logger 

11 

12# Import specific evaluator implementations 

13from .browsecomp import BrowseCompEvaluator 

14from .simpleqa import SimpleQAEvaluator 

15 

16 

17class CompositeBenchmarkEvaluator: 

18 """ 

19 Evaluator that combines multiple benchmarks with weighted scores. 

20 

21 This evaluator runs multiple benchmark types and combines their scores 

22 according to specified weights, enabling comprehensive evaluation across 

23 different metrics and tasks. 

24 """ 

25 

26 def __init__(self, benchmark_weights: Optional[Dict[str, float]] = None): 

27 """ 

28 Initialize with benchmark weights. 

29 

30 Args: 

31 benchmark_weights: Dictionary mapping benchmark names to weights 

32 Default: {"simpleqa": 1.0} 

33 """ 

34 # Default to SimpleQA only if no weights provided 

35 self.benchmark_weights = benchmark_weights or {"simpleqa": 1.0} 

36 

37 # Create evaluators for available benchmarks 

38 self.evaluators = { 

39 "simpleqa": SimpleQAEvaluator(), 

40 "browsecomp": BrowseCompEvaluator(), 

41 } 

42 

43 # Normalize weights to sum to 1.0 

44 total_weight = sum(self.benchmark_weights.values()) 

45 if total_weight <= 0: 

46 logger.warning( 

47 "Total benchmark weight is zero or negative. Using default weights." 

48 ) 

49 self.normalized_weights = {"simpleqa": 1.0} 

50 else: 

51 self.normalized_weights = { 

52 k: w / total_weight for k, w in self.benchmark_weights.items() 

53 } 

54 
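        # For example, benchmark_weights={"simpleqa": 2.0, "browsecomp": 1.0}
        # normalizes to roughly {"simpleqa": 0.67, "browsecomp": 0.33}.
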

        # Log the weights being used
        logger.info(
            f"Using normalized benchmark weights: {self.normalized_weights}"
        )

    def evaluate(
        self,
        system_config: Dict[str, Any],
        num_examples: int,
        output_dir: str,
    ) -> Dict[str, Any]:
        """
        Run all requested benchmarks and compute weighted score.

        Args:
            system_config: Configuration parameters for the system under test
            num_examples: Number of benchmark examples to evaluate
            output_dir: Directory to save evaluation results

        Returns:
            Dictionary with combined metrics and individual benchmark results
        """
        all_results = {}
        combined_score = 0.0

        # Run each benchmark with weight > 0
        for benchmark_name, weight in self.normalized_weights.items():
            if weight > 0 and benchmark_name in self.evaluators:
                evaluator = self.evaluators[benchmark_name]

                try:
                    # Run benchmark evaluation
                    result = evaluator.evaluate(
                        system_config=system_config,
                        num_examples=num_examples,
                        output_dir=output_dir,
                    )

                    # Store individual results
                    all_results[benchmark_name] = result

                    # Calculate weighted contribution to combined score
                    quality_score = result.get("quality_score", 0.0)
                    weighted_contribution = quality_score * weight

                    logger.info(
                        f"Benchmark {benchmark_name}: score={quality_score:.4f}, "
                        f"weight={weight:.2f}, contribution={weighted_contribution:.4f}"
                    )

                    # Add to combined score
                    combined_score += weighted_contribution
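                    # e.g. two benchmarks with weights 0.67/0.33 and scores
                    # 0.8/0.5 combine to 0.8 * 0.67 + 0.5 * 0.33 ≈ 0.70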


                except Exception as e:
                    logger.exception(
                        f"Error running {benchmark_name} benchmark: {e!s}"
                    )
                    all_results[benchmark_name] = {
                        "benchmark_type": benchmark_name,
                        "error": str(e),
                        "quality_score": 0.0,
                    }

        # Return combined results
        return {
            "quality_score": combined_score,
            "benchmark_results": all_results,
            "benchmark_weights": self.normalized_weights,
            "combined_score": combined_score,
        }
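
A minimal usage sketch of the class above. The import path is inferred from the file path in the report header, and the system_config keys, example count, and output directory are placeholders; the underlying SimpleQA and BrowseComp evaluators are assumed to accept the keyword arguments shown in the source.

from local_deep_research.benchmarks.evaluators.composite import (
    CompositeBenchmarkEvaluator,
)

# Weight SimpleQA twice as heavily as BrowseComp; the constructor
# normalizes these to roughly 0.67 and 0.33.
evaluator = CompositeBenchmarkEvaluator(
    benchmark_weights={"simpleqa": 2.0, "browsecomp": 1.0}
)

# Placeholder configuration: the real keys depend on the system under test.
results = evaluator.evaluate(
    system_config={"model": "example-model"},
    num_examples=10,
    output_dir="benchmark_results",
)

# "quality_score" is the weighted combination of per-benchmark scores;
# "benchmark_results" holds each benchmark's raw result dictionary.
print(results["quality_score"])
print(results["benchmark_results"].keys())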