Coverage for src/local_deep_research/benchmarks/evaluators/browsecomp.py: 39% (18 statements)
coverage.py v7.12.0, created at 2026-01-11 00:51 +0000
1"""
2BrowseComp benchmark evaluator.
4This module provides a benchmark evaluator implementation for the BrowseComp
5benchmark, which tests browsing comprehension capabilities.
6"""
8from typing import Any, Dict
10from loguru import logger
12from ..runners import run_browsecomp_benchmark
13from .base import BaseBenchmarkEvaluator
16class BrowseCompEvaluator(BaseBenchmarkEvaluator):
17 """
18 Evaluator for the BrowseComp benchmark.
20 This evaluator runs the BrowseComp benchmark, which tests a system's ability
21 to accurately comprehend and answer questions from web browsing.
22 """
24 def __init__(self):
25 """Initialize the BrowseComp evaluator."""
26 super().__init__("browsecomp")
28 def evaluate(
29 self,
30 system_config: Dict[str, Any],
31 num_examples: int,
32 output_dir: str,
33 ) -> Dict[str, Any]:
34 """
35 Run BrowseComp benchmark and return metrics.
37 Args:
38 system_config: Search and LLM configuration parameters
39 num_examples: Number of benchmark examples to run
40 output_dir: Directory to save evaluation results
42 Returns:
43 Dictionary with metrics including quality_score based on accuracy
44 """
45 # Create benchmark-specific directory
46 benchmark_dir = self._create_subdirectory(output_dir)
48 # Log benchmark execution
49 logger.info(
50 f"Running BrowseComp benchmark with {num_examples} examples"
51 )
53 try:
54 # Run BrowseComp benchmark
55 results = run_browsecomp_benchmark(
56 num_examples=num_examples,
57 output_dir=benchmark_dir,
58 search_config=system_config,
59 run_evaluation=True,
60 )
62 # Extract metrics
63 metrics = results.get("metrics", {})
64 accuracy = metrics.get("accuracy", 0.0)
66 # Return evaluation results with quality score
67 return {
68 "benchmark_type": self.name,
69 "accuracy": accuracy,
70 "quality_score": accuracy, # Map accuracy directly to quality score
71 "raw_results": results,
72 "report_path": results.get("report_path"),
73 }
75 except Exception as e:
76 logger.exception(f"Error in BrowseComp evaluation: {e!s}")
78 # Return error information
79 return {
80 "benchmark_type": self.name,
81 "error": str(e),
82 "quality_score": 0.0,
83 "accuracy": 0.0,
84 }
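

# --- Illustrative usage sketch (not part of the original module) ---
# A minimal example of how this evaluator might be driven, e.g. via
# ``python -m local_deep_research.benchmarks.evaluators.browsecomp``.
# The system_config keys shown here ("iterations", "search_tool") are
# assumptions for demonstration only; the keys actually accepted by
# run_browsecomp_benchmark may differ.
if __name__ == "__main__":
    evaluator = BrowseCompEvaluator()
    result = evaluator.evaluate(
        system_config={"iterations": 2, "search_tool": "searxng"},  # hypothetical keys
        num_examples=5,
        output_dir="benchmark_results",
    )
    # On success, quality_score mirrors the benchmark accuracy; on failure,
    # the returned dict carries an "error" key and a quality_score of 0.0.
    print(result["benchmark_type"], result.get("quality_score"))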