Coverage for src / local_deep_research / benchmarks / evaluators / base.py: 60%

15 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

1""" 

2Base class for benchmark evaluators. 

3 

4This module defines the abstract base class that all benchmark evaluators 

5must implement, establishing a common interface for different benchmark types. 

6""" 

7 

8from abc import ABC, abstractmethod 

9from pathlib import Path 

10from typing import Any, Dict 

11 

12 

class BaseBenchmarkEvaluator(ABC):
    """
    Abstract base class for benchmark evaluators.

    All benchmark evaluator implementations must inherit from this class and
    implement the evaluate method to run their specific benchmark type.

    Attributes:
        name: Identifier of the benchmark type. It is also used as the name
            of the output subdirectory built by ``_create_subdirectory``.
    """

    def __init__(self, name: str):
        """
        Initialize benchmark evaluator with a name.

        Args:
            name: Unique identifier for this benchmark type
        """
        self.name = name

    def get_name(self) -> str:
        """
        Get the benchmark name.

        Returns:
            The benchmark identifier
        """
        return self.name

    @abstractmethod
    def evaluate(
        self,
        system_config: Dict[str, Any],
        num_examples: int,
        output_dir: str,
    ) -> Dict[str, Any]:
        """
        Run benchmark evaluation with given system configuration.

        Subclasses must override this method; the base class cannot be
        instantiated while it remains abstract.

        Args:
            system_config: Configuration parameters for the system under test
            num_examples: Number of benchmark examples to evaluate
            output_dir: Directory to save evaluation results

        Returns:
            Dictionary with evaluation metrics including quality_score (0-1)
        """

    def _create_subdirectory(self, output_dir: str) -> str:
        """
        Create a benchmark-specific subdirectory for output.

        The directory is ``output_dir / self.name``. Missing parent
        directories are created, and an already existing directory is reused
        without error, so the call can be repeated safely.

        Args:
            output_dir: Parent directory for output

        Returns:
            Path to the benchmark-specific directory

        Raises:
            ValueError: If ``self.name`` cannot name a directory strictly
                inside ``output_dir`` (it is empty or ``"."``, is anchored to
                a root or drive, or contains a ``..`` component).
        """
        relative = Path(self.name)
        # pathlib silently drops the left operand when the right one is
        # anchored, and an empty or "." name collapses to the parent itself;
        # a ".." component would place the directory outside the parent.
        # None of those yields a benchmark-specific subdirectory.
        if not relative.parts or relative.anchor or ".." in relative.parts:
            raise ValueError(
                f"Benchmark name {self.name!r} cannot be used as a "
                "subdirectory of the output directory"
            )

        benchmark_dir = Path(output_dir) / relative
        benchmark_dir.mkdir(parents=True, exist_ok=True)
        return str(benchmark_dir)