Coverage for src/local_deep_research/benchmarks/evaluators/browsecomp.py: 39%

18 statements  

coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

1""" 

2BrowseComp benchmark evaluator. 

3 

4This module provides a benchmark evaluator implementation for the BrowseComp 

5benchmark, which tests browsing comprehension capabilities. 

6""" 

7 

8from typing import Any, Dict 

9 

10from loguru import logger 

11 

12from ..runners import run_browsecomp_benchmark 

13from .base import BaseBenchmarkEvaluator 

14 

15 

16class BrowseCompEvaluator(BaseBenchmarkEvaluator): 

17 """ 

18 Evaluator for the BrowseComp benchmark. 

19 

20 This evaluator runs the BrowseComp benchmark, which tests a system's ability 

21 to accurately comprehend and answer questions from web browsing. 

22 """ 

23 

24 def __init__(self): 

25 """Initialize the BrowseComp evaluator.""" 

26 super().__init__("browsecomp") 

27 

28 def evaluate( 

29 self, 

30 system_config: Dict[str, Any], 

31 num_examples: int, 

32 output_dir: str, 

33 ) -> Dict[str, Any]: 

34 """ 

35 Run BrowseComp benchmark and return metrics. 

36 

37 Args: 

38 system_config: Search and LLM configuration parameters 

39 num_examples: Number of benchmark examples to run 

40 output_dir: Directory to save evaluation results 

41 

42 Returns: 

43 Dictionary with metrics including quality_score based on accuracy 

44 """ 

45 # Create benchmark-specific directory 

46 benchmark_dir = self._create_subdirectory(output_dir) 

47 

48 # Log benchmark execution 

49 logger.info( 

50 f"Running BrowseComp benchmark with {num_examples} examples" 

51 ) 

52 

53 try: 

54 # Run BrowseComp benchmark 

55 results = run_browsecomp_benchmark( 

56 num_examples=num_examples, 

57 output_dir=benchmark_dir, 

58 search_config=system_config, 

59 run_evaluation=True, 

60 ) 

61 

62 # Extract metrics 

63 metrics = results.get("metrics", {}) 

64 accuracy = metrics.get("accuracy", 0.0) 

65 

66 # Return evaluation results with quality score 

67 return { 

68 "benchmark_type": self.name, 

69 "accuracy": accuracy, 

70 "quality_score": accuracy, # Map accuracy directly to quality score 

71 "raw_results": results, 

72 "report_path": results.get("report_path"), 

73 } 

74 

75 except Exception as e: 

76 logger.exception(f"Error in BrowseComp evaluation: {e!s}") 

77 

78 # Return error information 

79 return { 

80 "benchmark_type": self.name, 

81 "error": str(e), 

82 "quality_score": 0.0, 

83 "accuracy": 0.0, 

84 }
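
A minimal usage sketch, assuming the package is importable under local_deep_research and that BaseBenchmarkEvaluator needs no extra constructor arguments; the system_config key and value shown are illustrative assumptions, not values confirmed by this module.

# Usage sketch (illustrative, not part of the module above).
# The system_config contents are a hypothetical example; only num_examples,
# output_dir, and the returned "accuracy"/"quality_score"/"error" fields
# come from the evaluate() method shown above.
from local_deep_research.benchmarks.evaluators.browsecomp import (
    BrowseCompEvaluator,
)

evaluator = BrowseCompEvaluator()
results = evaluator.evaluate(
    system_config={"search_tool": "wikipedia"},  # hypothetical config key
    num_examples=5,
    output_dir="./browsecomp_results",
)

if "error" in results:
    print("Benchmark failed:", results["error"])
else:
    print("Accuracy:", results["accuracy"])
    print("Quality score:", results["quality_score"])  # same value as accuracy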