Coverage for src/local_deep_research/benchmarks/evaluators/browsecomp.py: 39%

18 statements  

coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

1""" 

2BrowseComp benchmark evaluator. 

3 

4This module provides a benchmark evaluator implementation for the BrowseComp 

5benchmark, which tests browsing comprehension capabilities. 

6""" 

7 

8from typing import Any, Dict 

9 

10from loguru import logger 

11 

12from ..runners import run_browsecomp_benchmark 

13from .base import BaseBenchmarkEvaluator 

14 

15 

16class BrowseCompEvaluator(BaseBenchmarkEvaluator): 

17 """ 

18 Evaluator for the BrowseComp benchmark. 

19 

20 This evaluator runs the BrowseComp benchmark, which tests a system's ability 

21 to accurately comprehend and answer questions from web browsing. 

22 """ 

23 

24 def __init__(self): 

25 """Initialize the BrowseComp evaluator.""" 

26 super().__init__("browsecomp") 

27 

28 def evaluate( 

29 self, 

30 system_config: Dict[str, Any], 

31 num_examples: int, 

32 output_dir: str, 

33 ) -> Dict[str, Any]: 

34 """ 

35 Run BrowseComp benchmark and return metrics. 

36 

37 Args: 

38 system_config: Search and LLM configuration parameters 

39 num_examples: Number of benchmark examples to run 

40 output_dir: Directory to save evaluation results 

41 

42 Returns: 

43 Dictionary with metrics including quality_score based on accuracy 

44 """ 

45 # Create benchmark-specific directory 

46 benchmark_dir = self._create_subdirectory(output_dir) 

47 

48 # Log benchmark execution 

49 logger.info( 

50 f"Running BrowseComp benchmark with {num_examples} examples" 

51 ) 

52 

53 try: 

54 # Run BrowseComp benchmark 

55 results = run_browsecomp_benchmark( 

56 num_examples=num_examples, 

57 output_dir=benchmark_dir, 

58 search_config=system_config, 

59 run_evaluation=True, 

60 ) 

61 

62 # Extract metrics 

63 metrics = results.get("metrics", {}) 

64 accuracy = metrics.get("accuracy", 0.0) 

65 

66 # Return evaluation results with quality score 

67 return { 

68 "benchmark_type": self.name, 

69 "accuracy": accuracy, 

70 "quality_score": accuracy, # Map accuracy directly to quality score 

71 "raw_results": results, 

72 "report_path": results.get("report_path"), 

73 } 

74 

75 except Exception as e: 

76 logger.exception(f"Error in BrowseComp evaluation: {e!s}") 

77 

78 # Return error information 

79 return { 

80 "benchmark_type": self.name, 

81 "error": str(e), 

82 "quality_score": 0.0, 

83 "accuracy": 0.0, 

84 }
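
A minimal usage sketch, assuming the package is importable under local_deep_research and that BaseBenchmarkEvaluator needs no extra constructor arguments; the system_config key and value shown are illustrative assumptions, not values confirmed by this module.

# Usage sketch (illustrative, not part of the module above).
# The system_config contents are a hypothetical example; only num_examples,
# output_dir, and the returned "accuracy"/"quality_score"/"error" fields
# come from the evaluate() method shown above.
from local_deep_research.benchmarks.evaluators.browsecomp import (
    BrowseCompEvaluator,
)

evaluator = BrowseCompEvaluator()
results = evaluator.evaluate(
    system_config={"search_tool": "wikipedia"},  # hypothetical config key
    num_examples=5,
    output_dir="./browsecomp_results",
)

if "error" in results:
    print("Benchmark failed:", results["error"])
else:
    print("Accuracy:", results["accuracy"])
    print("Quality score:", results["quality_score"])  # same value as accuracy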