Coverage for src/local_deep_research/benchmarks/evaluators/simpleqa.py: 18%
69 statements
1"""
2SimpleQA benchmark evaluator.
4This module provides a benchmark evaluator implementation for the SimpleQA
5benchmark, which tests simple question-answering capabilities.
6"""
8import json
9from loguru import logger
10from pathlib import Path
11import time
12from typing import Any, Dict
14from ..datasets.base import DatasetRegistry
15from ..metrics import calculate_metrics, generate_report
16from ..runners import run_simpleqa_benchmark # Keep for backward compatibility
17from .base import BaseBenchmarkEvaluator
class SimpleQAEvaluator(BaseBenchmarkEvaluator):
    """
    Evaluator for the SimpleQA benchmark.

    This evaluator runs the SimpleQA benchmark, which tests a system's ability
    to accurately answer straightforward factual questions.
    """

    def __init__(self):
        """Initialize the SimpleQA evaluator."""
        super().__init__("simpleqa")

    def evaluate(
        self,
        system_config: Dict[str, Any],
        num_examples: int,
        output_dir: str,
        use_direct_dataset: bool = True,
    ) -> Dict[str, Any]:
        """
        Run the SimpleQA benchmark and return metrics.

        Args:
            system_config: Search and LLM configuration parameters
            num_examples: Number of benchmark examples to run
            output_dir: Directory to save evaluation results
            use_direct_dataset: Whether to use dataset classes directly
                (recommended) or fall back to the legacy runner functions

        Returns:
            Dictionary with metrics, including a quality_score based on accuracy
        """
        # Create benchmark-specific directory
        benchmark_dir = self._create_subdirectory(output_dir)

        # Log benchmark execution
        logger.info(f"Running SimpleQA benchmark with {num_examples} examples")

        try:
            if use_direct_dataset:
                # Use dataset classes directly (new approach)
                results = self._run_with_dataset_class(
                    system_config=system_config,
                    num_examples=num_examples,
                    output_dir=benchmark_dir,
                )
            else:
                # Fall back to legacy runner function
                results = run_simpleqa_benchmark(
                    num_examples=num_examples,
                    output_dir=benchmark_dir,
                    search_config=system_config,
                    run_evaluation=True,
                )

            # Extract metrics
            metrics = results.get("metrics", {})
            accuracy = metrics.get("accuracy", 0.0)

            # Return evaluation results with quality score
            return {
                "benchmark_type": self.name,
                "accuracy": accuracy,
                "quality_score": accuracy,  # Map accuracy directly to quality score
                "raw_results": results,
                "report_path": results.get("report_path"),
            }

        except Exception as e:
            logger.exception(f"Error in SimpleQA evaluation: {e!s}")

            # Return error information
            return {
                "benchmark_type": self.name,
                "error": str(e),
                "quality_score": 0.0,
                "accuracy": 0.0,
            }

    def _run_with_dataset_class(
        self,
        system_config: Dict[str, Any],
        num_examples: int,
        output_dir: str,
    ) -> Dict[str, Any]:
        """
        Run the SimpleQA benchmark using dataset classes directly.

        This implementation directly uses the dataset classes rather than
        going through the runner functions, allowing for more flexibility
        and better integration with the object-oriented architecture.

        Args:
            system_config: Search and LLM configuration parameters
            num_examples: Number of benchmark examples to run
            output_dir: Directory to save evaluation results

        Returns:
            Dictionary with benchmark results
        """
        # Create a dataset instance using the registry
        try:
            dataset_instance = DatasetRegistry.create_dataset(
                dataset_id="simpleqa",
                num_examples=num_examples,
                seed=system_config.get("seed", 42),
            )

            # Load dataset examples
            examples = dataset_instance.load()
            logger.info(f"Loaded {len(examples)} SimpleQA examples")

            # Set up output files
            timestamp = time.strftime("%Y%m%d_%H%M%S")
            results_file = str(
                Path(output_dir) / f"simpleqa_{timestamp}_results.jsonl"
            )
            evaluation_file = str(
                Path(output_dir) / f"simpleqa_{timestamp}_evaluation.jsonl"
            )
            report_file = str(
                Path(output_dir) / f"simpleqa_{timestamp}_report.md"
            )

            # Process each example
            results = []

            for i, example in enumerate(examples):
                # Extract question and answer using dataset methods
                question = dataset_instance.get_question(example)
                correct_answer = dataset_instance.get_answer(example)

                logger.info(
                    f"Processing {i + 1}/{len(examples)}: {question[:50]}..."
                )

                try:
                    # Format query based on dataset type
                    formatted_query = question  # Simple format for SimpleQA

                    # Time the search
                    start_time = time.time()

                    # Create search config from system_config
                    search_params = {
                        "iterations": system_config.get("iterations", 3),
                        "questions_per_iteration": system_config.get(
                            "questions_per_iteration", 3
                        ),
                        "search_tool": system_config.get(
                            "search_tool", "searxng"
                        ),
                        # Note: search_strategy is stored in the config but not
                        # passed to quick_summary, as it's not supported by the
                        # underlying API.
                    }
                    # Get response from LDR
                    from local_deep_research.api import quick_summary

                    search_result = quick_summary(
                        query=formatted_query,
                        iterations=search_params.get("iterations"),
                        questions_per_iteration=search_params.get(
                            "questions_per_iteration"
                        ),
                        search_tool=search_params.get("search_tool"),
                    )

                    end_time = time.time()
                    processing_time = end_time - start_time

                    # Extract response
                    response = search_result.get("summary", "")

                    # Extract structured answer
                    from ..graders import extract_answer_from_response

                    extracted = extract_answer_from_response(
                        response, "simpleqa"
                    )

                    # Format result
                    result = {
                        "id": example.get("id", f"example_{i}"),
                        "problem": question,
                        "correct_answer": correct_answer,
                        "response": response,
                        "extracted_answer": extracted["extracted_answer"],
                        "confidence": extracted["confidence"],
                        "processing_time": processing_time,
                        "sources": search_result.get("sources", []),
                        "search_config": search_params,
                    }

                    # Add to results list
                    results.append(result)

                    # Write result to file
                    with open(results_file, "a") as f:
                        f.write(json.dumps(result) + "\n")

                except Exception as e:
                    logger.exception(f"Error processing example {i + 1}: {e!s}")

                    # Create error result
                    error_result = {
                        "id": example.get("id", f"example_{i}"),
                        "problem": question,
                        "correct_answer": correct_answer,
                        "error": str(e),
                        "processing_time": 0,
                    }

                    # Add to results list
                    results.append(error_result)

                    # Write error result to file
                    with open(results_file, "a") as f:
                        f.write(json.dumps(error_result) + "\n")
            # Grade results
            from ..graders import grade_results

            grade_results(
                results_file=results_file,
                output_file=evaluation_file,
                dataset_type="simpleqa",
            )

            # Calculate metrics
            metrics = calculate_metrics(evaluation_file)

            # Generate report
            dataset_name = "SimpleQA"
            report_path = generate_report(
                metrics=metrics,
                results_file=evaluation_file,
                output_file=report_file,
                dataset_name=dataset_name,
                config_info={
                    "Dataset": "SimpleQA",
                    "Examples": len(examples),
                    "Iterations": search_params.get("iterations", 3),
                    "Questions per iteration": search_params.get(
                        "questions_per_iteration", 3
                    ),
                    "Search tool": search_params.get("search_tool", "searxng"),
                    "Search strategy": search_params.get(
                        "search_strategy", "source_based"
                    ),
                },
            )

            # Return results
            return {
                "status": "complete",
                "dataset_type": "simpleqa",
                "results_path": results_file,
                "evaluation_path": evaluation_file,
                "report_path": report_path,
                "metrics": metrics,
                "total_examples": len(examples),
                "accuracy": metrics.get("accuracy", 0),
            }

        except Exception as e:
            logger.exception(f"Error in direct dataset evaluation: {e!s}")
            return {
                "status": "error",
                "dataset_type": "simpleqa",
                "error": str(e),
            }
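

# Usage sketch (illustrative, not part of the measured module): a minimal
# example of driving the evaluator above, assuming the package is installed
# and the configured search backend (e.g. a local SearXNG instance) is
# reachable. The config keys mirror those read by _run_with_dataset_class
# (iterations, questions_per_iteration, search_tool, seed); "benchmark_output"
# is a hypothetical output directory created by the base evaluator.
if __name__ == "__main__":
    evaluator = SimpleQAEvaluator()
    outcome = evaluator.evaluate(
        system_config={
            "iterations": 1,
            "questions_per_iteration": 1,
            "search_tool": "searxng",
            "seed": 42,
        },
        num_examples=5,
        output_dir="benchmark_output",
        use_direct_dataset=True,
    )
    logger.info(
        f"SimpleQA accuracy: {outcome.get('accuracy', 0.0)} "
        f"(report: {outcome.get('report_path')})"
    )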