Coverage for src/local_deep_research/benchmarks/evaluators/simpleqa.py: 18%

69 statements  

coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

1""" 

2SimpleQA benchmark evaluator. 

3 

4This module provides a benchmark evaluator implementation for the SimpleQA 

5benchmark, which tests simple question-answering capabilities. 

6""" 

7 

8import json 

9from loguru import logger 

10from pathlib import Path 

11import time 

12from typing import Any, Dict 

13 

14from ..datasets.base import DatasetRegistry 

15from ..metrics import calculate_metrics, generate_report 

16from ..runners import run_simpleqa_benchmark # Keep for backward compatibility 

17from .base import BaseBenchmarkEvaluator 

18 

19 

20class SimpleQAEvaluator(BaseBenchmarkEvaluator): 

21 """ 

22 Evaluator for the SimpleQA benchmark. 

23 

24 This evaluator runs the SimpleQA benchmark, which tests a system's ability 

25 to accurately answer straightforward factual questions. 

26 """ 

27 

28 def __init__(self): 

29 """Initialize the SimpleQA evaluator.""" 

30 super().__init__("simpleqa") 

31 

32 def evaluate( 

33 self, 

34 system_config: Dict[str, Any], 

35 num_examples: int, 

36 output_dir: str, 

37 use_direct_dataset: bool = True, 

38 ) -> Dict[str, Any]: 

39 """ 

40 Run SimpleQA benchmark and return metrics. 

41 

42 Args: 

43 system_config: Search and LLM configuration parameters 

44 num_examples: Number of benchmark examples to run 

45 output_dir: Directory to save evaluation results 

46 use_direct_dataset: Whether to use dataset classes directly (recommended) 

47 or fall back to runner functions 

48 

49 Returns: 

50 Dictionary with metrics including quality_score based on accuracy 

51 """ 

52 # Create benchmark-specific directory 

53 benchmark_dir = self._create_subdirectory(output_dir) 

54 

55 # Log benchmark execution 

56 logger.info(f"Running SimpleQA benchmark with {num_examples} examples") 

57 

58 try: 

59 if use_direct_dataset: 

60 # Use dataset classes directly (new approach) 

61 results = self._run_with_dataset_class( 

62 system_config=system_config, 

63 num_examples=num_examples, 

64 output_dir=benchmark_dir, 

65 ) 

66 else: 

67 # Fall back to legacy runner function 

68 results = run_simpleqa_benchmark( 

69 num_examples=num_examples, 

70 output_dir=benchmark_dir, 

71 search_config=system_config, 

72 run_evaluation=True, 

73 ) 

74 

75 # Extract metrics 

76 metrics = results.get("metrics", {}) 

77 accuracy = metrics.get("accuracy", 0.0) 

78 

79 # Return evaluation results with quality score 

80 return { 

81 "benchmark_type": self.name, 

82 "accuracy": accuracy, 

83 "quality_score": accuracy, # Map accuracy directly to quality score 

84 "raw_results": results, 

85 "report_path": results.get("report_path"), 

86 } 

87 

88 except Exception as e: 

89 logger.exception(f"Error in SimpleQA evaluation: {e!s}") 

90 

91 # Return error information 

92 return { 

93 "benchmark_type": self.name, 

94 "error": str(e), 

95 "quality_score": 0.0, 

96 "accuracy": 0.0, 

97 } 

98 

99 def _run_with_dataset_class( 

100 self, 

101 system_config: Dict[str, Any], 

102 num_examples: int, 

103 output_dir: str, 

104 ) -> Dict[str, Any]: 

105 """ 

106 Run SimpleQA benchmark using dataset classes directly. 

107 

108 This implementation directly uses the dataset classes rather than 

109 going through the runner functions, allowing for more flexibility 

110 and better integration with the object-oriented architecture. 

111 

112 Args: 

113 system_config: Search and LLM configuration parameters 

114 num_examples: Number of benchmark examples to run 

115 output_dir: Directory to save evaluation results 

116 

117 Returns: 

118 Dictionary with benchmark results 

119 """ 

120 # Create a dataset instance using the registry 

121 try: 

122 dataset_instance = DatasetRegistry.create_dataset( 

123 dataset_id="simpleqa", 

124 num_examples=num_examples, 

125 seed=system_config.get("seed", 42), 

126 ) 

127 

128 # Load dataset examples 

129 examples = dataset_instance.load() 

130 logger.info(f"Loaded {len(examples)} SimpleQA examples") 

131 

132 # Set up output files 

133 timestamp = time.strftime("%Y%m%d_%H%M%S") 

134 results_file = str( 

135 Path(output_dir) / f"simpleqa_{timestamp}_results.jsonl" 

136 ) 

137 evaluation_file = str( 

138 Path(output_dir) / f"simpleqa_{timestamp}_evaluation.jsonl" 

139 ) 

140 report_file = str( 

141 Path(output_dir) / f"simpleqa_{timestamp}_report.md" 

142 ) 

143 

144 # Process each example 

145 results = [] 

146 

147 for i, example in enumerate(examples): 

148 # Extract question and answer using dataset methods 

149 question = dataset_instance.get_question(example) 

150 correct_answer = dataset_instance.get_answer(example) 

151 

152 logger.info( 

153 f"Processing {i + 1}/{len(examples)}: {question[:50]}..." 

154 ) 

155 

156 try: 

157 # Format query based on dataset type 

158 formatted_query = question # Simple format for SimpleQA 

159 

160 # Time the search 

161 start_time = time.time() 

162 

163 # Create search config from system_config 

164 search_params = { 

165 "iterations": system_config.get("iterations", 3), 

166 "questions_per_iteration": system_config.get( 

167 "questions_per_iteration", 3 

168 ), 

169 "search_tool": system_config.get( 

170 "search_tool", "searxng" 

171 ), 

172 # Note: search_strategy is stored in the config but not passed to quick_summary 

173 # as it's not supported by the underlying API 

174 } 

175 

176 # Get response from LDR 

177 from local_deep_research.api import quick_summary 

178 

179 search_result = quick_summary( 

180 query=formatted_query, 

181 iterations=search_params.get("iterations"), 

182 questions_per_iteration=search_params.get( 

183 "questions_per_iteration" 

184 ), 

185 search_tool=search_params.get("search_tool"), 

186 ) 

187 

188 end_time = time.time() 

189 processing_time = end_time - start_time 

190 

191 # Extract response 

192 response = search_result.get("summary", "") 

193 

194 # Extract structured answer 

195 from ..graders import extract_answer_from_response 

196 

197 extracted = extract_answer_from_response( 

198 response, "simpleqa" 

199 ) 

200 

201 # Format result 

202 result = { 

203 "id": example.get("id", f"example_{i}"), 

204 "problem": question, 

205 "correct_answer": correct_answer, 

206 "response": response, 

207 "extracted_answer": extracted["extracted_answer"], 

208 "confidence": extracted["confidence"], 

209 "processing_time": processing_time, 

210 "sources": search_result.get("sources", []), 

211 "search_config": search_params, 

212 } 

213 

214 # Add to results list 

215 results.append(result) 

216 

217 # Write result to file 

218 with open(results_file, "a") as f: 

219 f.write(json.dumps(result) + "\n") 

220 

221 except Exception as e: 

222 logger.exception(f"Error processing example {i + 1}: {e!s}") 

223 

224 # Create error result 

225 error_result = { 

226 "id": example.get("id", f"example_{i}"), 

227 "problem": question, 

228 "correct_answer": correct_answer, 

229 "error": str(e), 

230 "processing_time": 0, 

231 } 

232 

233 # Add to results list 

234 results.append(error_result) 

235 

236 # Write error result to file 

237 with open(results_file, "a") as f: 

238 f.write(json.dumps(error_result) + "\n") 

239 

240 # Grade results 

241 from ..graders import grade_results 

242 

243 grade_results( 

244 results_file=results_file, 

245 output_file=evaluation_file, 

246 dataset_type="simpleqa", 

247 ) 

248 

249 # Calculate metrics 

250 metrics = calculate_metrics(evaluation_file) 

251 

252 # Generate report 

253 dataset_name = "SimpleQA" 

254 report_path = generate_report( 

255 metrics=metrics, 

256 results_file=evaluation_file, 

257 output_file=report_file, 

258 dataset_name=dataset_name, 

259 config_info={ 

260 "Dataset": "SimpleQA", 

261 "Examples": len(examples), 

262 "Iterations": search_params.get("iterations", 3), 

263 "Questions per iteration": search_params.get( 

264 "questions_per_iteration", 3 

265 ), 

266 "Search tool": search_params.get("search_tool", "searxng"), 

267 "Search strategy": search_params.get( 

268 "search_strategy", "source_based" 

269 ), 

                },
            )

            # Return results
            return {
                "status": "complete",
                "dataset_type": "simpleqa",
                "results_path": results_file,
                "evaluation_path": evaluation_file,
                "report_path": report_path,
                "metrics": metrics,
                "total_examples": len(examples),
                "accuracy": metrics.get("accuracy", 0),
            }

        except Exception as e:
            logger.exception(f"Error in direct dataset evaluation: {e!s}")
            return {
                "status": "error",
                "dataset_type": "simpleqa",
                "error": str(e),
            }
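
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the original module): shows how
# SimpleQAEvaluator.evaluate might be invoked. The config keys mirror the ones
# read via system_config.get() above; the values and the output directory are
# hypothetical examples.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    evaluator = SimpleQAEvaluator()

    system_config = {
        "iterations": 2,
        "questions_per_iteration": 3,
        "search_tool": "searxng",
        "search_strategy": "source_based",
        "seed": 42,
    }

    results = evaluator.evaluate(
        system_config=system_config,
        num_examples=5,
        output_dir="benchmark_output",  # hypothetical output location
    )
    print(f"Quality score (accuracy): {results.get('quality_score', 0.0):.2%}")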