Coverage for src/local_deep_research/benchmarks/evaluators/simpleqa.py: 18%

69 statements  

coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

1""" 

2SimpleQA benchmark evaluator. 

3 

4This module provides a benchmark evaluator implementation for the SimpleQA 

5benchmark, which tests simple question-answering capabilities. 

6""" 

7 

8import json 

9from loguru import logger 

10from pathlib import Path 

11import time 

12from typing import Any, Dict 

13 

14from ..datasets.base import DatasetRegistry 

15from ..metrics import calculate_metrics, generate_report 

16from ..runners import run_simpleqa_benchmark # Keep for backward compatibility 

17from .base import BaseBenchmarkEvaluator 

18 

19 

20class SimpleQAEvaluator(BaseBenchmarkEvaluator): 

21 """ 

22 Evaluator for the SimpleQA benchmark. 

23 

24 This evaluator runs the SimpleQA benchmark, which tests a system's ability 

25 to accurately answer straightforward factual questions. 

26 """ 

27 

28 def __init__(self): 

29 """Initialize the SimpleQA evaluator.""" 

30 super().__init__("simpleqa") 

31 

32 def evaluate( 

33 self, 

34 system_config: Dict[str, Any], 

35 num_examples: int, 

36 output_dir: str, 

37 use_direct_dataset: bool = True, 

38 ) -> Dict[str, Any]: 

39 """ 

40 Run SimpleQA benchmark and return metrics. 

41 

42 Args: 

43 system_config: Search and LLM configuration parameters 

44 num_examples: Number of benchmark examples to run 

45 output_dir: Directory to save evaluation results 

46 use_direct_dataset: Whether to use dataset classes directly (recommended) 

47 or fall back to runner functions 

48 

49 Returns: 

50 Dictionary with metrics including quality_score based on accuracy 

51 """ 

52 # Create benchmark-specific directory 

53 benchmark_dir = self._create_subdirectory(output_dir) 

54 

55 # Log benchmark execution 

56 logger.info(f"Running SimpleQA benchmark with {num_examples} examples") 

57 

58 try: 

59 if use_direct_dataset: 

60 # Use dataset classes directly (new approach) 

61 results = self._run_with_dataset_class( 

62 system_config=system_config, 

63 num_examples=num_examples, 

64 output_dir=benchmark_dir, 

65 ) 

66 else: 

67 # Fall back to legacy runner function 

68 results = run_simpleqa_benchmark( 

69 num_examples=num_examples, 

70 output_dir=benchmark_dir, 

71 search_config=system_config, 

72 run_evaluation=True, 

73 ) 

74 

75 # Extract metrics 

76 metrics = results.get("metrics", {}) 

77 accuracy = metrics.get("accuracy", 0.0) 

78 

79 # Return evaluation results with quality score 

80 return { 

81 "benchmark_type": self.name, 

82 "accuracy": accuracy, 

83 "quality_score": accuracy, # Map accuracy directly to quality score 

84 "raw_results": results, 

85 "report_path": results.get("report_path"), 

86 } 

87 

88 except Exception as e: 

89 logger.exception(f"Error in SimpleQA evaluation: {e!s}") 

90 

91 # Return error information 

92 return { 

93 "benchmark_type": self.name, 

94 "error": str(e), 

95 "quality_score": 0.0, 

96 "accuracy": 0.0, 

97 } 

98 

99 def _run_with_dataset_class( 

100 self, 

101 system_config: Dict[str, Any], 

102 num_examples: int, 

103 output_dir: str, 

104 ) -> Dict[str, Any]: 

105 """ 

106 Run SimpleQA benchmark using dataset classes directly. 

107 

108 This implementation directly uses the dataset classes rather than 

109 going through the runner functions, allowing for more flexibility 

110 and better integration with the object-oriented architecture. 

111 

112 Args: 

113 system_config: Search and LLM configuration parameters 

114 num_examples: Number of benchmark examples to run 

115 output_dir: Directory to save evaluation results 

116 

117 Returns: 

118 Dictionary with benchmark results 

119 """ 

120 # Create a dataset instance using the registry 

121 try: 

122 dataset_instance = DatasetRegistry.create_dataset( 

123 dataset_id="simpleqa", 

124 num_examples=num_examples, 

125 seed=system_config.get("seed", 42), 

126 ) 

127 

128 # Load dataset examples 

129 examples = dataset_instance.load() 

130 logger.info(f"Loaded {len(examples)} SimpleQA examples") 

131 

132 # Set up output files 

133 timestamp = time.strftime("%Y%m%d_%H%M%S") 

134 results_file = str( 

135 Path(output_dir) / f"simpleqa_{timestamp}_results.jsonl" 

136 ) 

137 evaluation_file = str( 

138 Path(output_dir) / f"simpleqa_{timestamp}_evaluation.jsonl" 

139 ) 

140 report_file = str( 

141 Path(output_dir) / f"simpleqa_{timestamp}_report.md" 

142 ) 

143 

144 # Process each example 

145 results = [] 

146 

147 for i, example in enumerate(examples): 

148 # Extract question and answer using dataset methods 

149 question = dataset_instance.get_question(example) 

150 correct_answer = dataset_instance.get_answer(example) 

151 

152 logger.info( 

153 f"Processing {i + 1}/{len(examples)}: {question[:50]}..." 

154 ) 

155 

156 try: 

157 # Format query based on dataset type 

158 formatted_query = question # Simple format for SimpleQA 

159 

160 # Time the search 

161 start_time = time.time() 

162 

163 # Create search config from system_config 

164 search_params = { 

165 "iterations": system_config.get("iterations", 3), 

166 "questions_per_iteration": system_config.get( 

167 "questions_per_iteration", 3 

168 ), 

169 "search_tool": system_config.get( 

170 "search_tool", "searxng" 

171 ), 

172 # Note: search_strategy is stored in the config but not passed to quick_summary 

173 # as it's not supported by the underlying API 

174 } 

175 

176 # Get response from LDR 

177 from local_deep_research.api import quick_summary 

178 

179 search_result = quick_summary( 

180 query=formatted_query, 

181 iterations=search_params.get("iterations"), 

182 questions_per_iteration=search_params.get( 

183 "questions_per_iteration" 

184 ), 

185 search_tool=search_params.get("search_tool"), 

186 ) 

187 

188 end_time = time.time() 

189 processing_time = end_time - start_time 

190 

191 # Extract response 

192 response = search_result.get("summary", "") 

193 

194 # Extract structured answer 

195 from ..graders import extract_answer_from_response 

196 

197 extracted = extract_answer_from_response( 

198 response, "simpleqa" 

199 ) 

200 

201 # Format result 

202 result = { 

203 "id": example.get("id", f"example_{i}"), 

204 "problem": question, 

205 "correct_answer": correct_answer, 

206 "response": response, 

207 "extracted_answer": extracted["extracted_answer"], 

208 "confidence": extracted["confidence"], 

209 "processing_time": processing_time, 

210 "sources": search_result.get("sources", []), 

211 "search_config": search_params, 

212 } 

213 

214 # Add to results list 

215 results.append(result) 

216 

217 # Write result to file 

218 with open(results_file, "a") as f: 

219 f.write(json.dumps(result) + "\n") 

220 

221 except Exception as e: 

222 logger.exception(f"Error processing example {i + 1}: {e!s}") 

223 

224 # Create error result 

225 error_result = { 

226 "id": example.get("id", f"example_{i}"), 

227 "problem": question, 

228 "correct_answer": correct_answer, 

229 "error": str(e), 

230 "processing_time": 0, 

231 } 

232 

233 # Add to results list 

234 results.append(error_result) 

235 

236 # Write error result to file 

237 with open(results_file, "a") as f: 

238 f.write(json.dumps(error_result) + "\n") 

239 

240 # Grade results 

241 from ..graders import grade_results 

242 

243 grade_results( 

244 results_file=results_file, 

245 output_file=evaluation_file, 

246 dataset_type="simpleqa", 

247 ) 

248 

249 # Calculate metrics 

250 metrics = calculate_metrics(evaluation_file) 

251 

252 # Generate report 

253 dataset_name = "SimpleQA" 

254 report_path = generate_report( 

255 metrics=metrics, 

256 results_file=evaluation_file, 

257 output_file=report_file, 

258 dataset_name=dataset_name, 

259 config_info={ 

260 "Dataset": "SimpleQA", 

261 "Examples": len(examples), 

262 "Iterations": search_params.get("iterations", 3), 

263 "Questions per iteration": search_params.get( 

264 "questions_per_iteration", 3 

265 ), 

266 "Search tool": search_params.get("search_tool", "searxng"), 

267 "Search strategy": search_params.get( 

268 "search_strategy", "source_based" 

269 ), 

                },
            )

            # Return results
            return {
                "status": "complete",
                "dataset_type": "simpleqa",
                "results_path": results_file,
                "evaluation_path": evaluation_file,
                "report_path": report_path,
                "metrics": metrics,
                "total_examples": len(examples),
                "accuracy": metrics.get("accuracy", 0),
            }

        except Exception as e:
            logger.exception(f"Error in direct dataset evaluation: {e!s}")
            return {
                "status": "error",
                "dataset_type": "simpleqa",
                "error": str(e),
            }
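
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the original module): shows how
# SimpleQAEvaluator.evaluate might be invoked. The config keys mirror the ones
# read via system_config.get() above; the values and the output directory are
# hypothetical examples.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    evaluator = SimpleQAEvaluator()

    system_config = {
        "iterations": 2,
        "questions_per_iteration": 3,
        "search_tool": "searxng",
        "search_strategy": "source_based",
        "seed": 42,
    }

    results = evaluator.evaluate(
        system_config=system_config,
        num_examples=5,
        output_dir="benchmark_output",  # hypothetical output location
    )
    print(f"Quality score (accuracy): {results.get('quality_score', 0.0):.2%}")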