Coverage for src / local_deep_research / api / benchmark_functions.py: 100%

79 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1""" 

2API functions for benchmarking. 

3 

4This module provides functions for running benchmarks programmatically. 

5""" 

6 

7from loguru import logger 

8from pathlib import Path 

9from typing import Any 

10 

11from ..benchmarks import ( 

12 calculate_metrics, 

13 generate_report, 

14 run_benchmark, 

15 run_browsecomp_benchmark, 

16 run_simpleqa_benchmark, 

17 run_xbench_deepsearch_benchmark, 

18) 

19 

20 

def evaluate_simpleqa(
    num_examples: int = 100,
    search_iterations: int = 3,
    questions_per_iteration: int = 3,
    search_tool: str = "searxng",
    human_evaluation: bool = False,
    evaluation_model: str | None = None,
    evaluation_provider: str | None = None,
    output_dir: str = "benchmark_results",
) -> dict[str, Any]:
    """
    Run the SimpleQA benchmark and return its results.

    Args:
        num_examples: Number of examples to evaluate
        search_iterations: Number of search iterations per query
        questions_per_iteration: Number of questions per iteration
        search_tool: Search engine to use (e.g., 'searxng', 'wikipedia')
        human_evaluation: Whether to use human evaluation
        evaluation_model: Optional custom model for evaluation
        evaluation_provider: Optional custom provider for evaluation
        output_dir: Directory to save results

    Returns:
        Dictionary with benchmark results
    """
    logger.info(f"Starting SimpleQA benchmark with {num_examples} examples")

    # Search behaviour passed straight through to the benchmark runner.
    search_config = {
        "iterations": search_iterations,
        "questions_per_iteration": questions_per_iteration,
        "search_tool": search_tool,
    }

    # Only build an evaluation config when at least one override was given;
    # an empty dict collapses to None so the runner uses its defaults.
    overrides = {
        "model_name": evaluation_model,
        "provider": evaluation_provider,
    }
    evaluation_config = {k: v for k, v in overrides.items() if v} or None

    return run_simpleqa_benchmark(
        num_examples=num_examples,
        search_config=search_config,
        evaluation_config=evaluation_config,
        human_evaluation=human_evaluation,
        output_dir=output_dir,
    )

73 

74 

def evaluate_browsecomp(
    num_examples: int = 100,
    search_iterations: int = 3,
    questions_per_iteration: int = 3,
    search_tool: str = "searxng",
    human_evaluation: bool = False,
    evaluation_model: str | None = None,
    evaluation_provider: str | None = None,
    output_dir: str = "benchmark_results",
) -> dict[str, Any]:
    """
    Run the BrowseComp benchmark and return its results.

    Args:
        num_examples: Number of examples to evaluate
        search_iterations: Number of search iterations per query
        questions_per_iteration: Number of questions per iteration
        search_tool: Search engine to use (e.g., 'searxng', 'wikipedia')
        human_evaluation: Whether to use human evaluation
        evaluation_model: Optional custom model for evaluation
        evaluation_provider: Optional custom provider for evaluation
        output_dir: Directory to save results

    Returns:
        Dictionary with benchmark results
    """
    logger.info(f"Starting BrowseComp benchmark with {num_examples} examples")

    # Search behaviour forwarded to the benchmark runner.
    search_config = {
        "iterations": search_iterations,
        "questions_per_iteration": questions_per_iteration,
        "search_tool": search_tool,
    }

    # Custom evaluation settings are optional: stay None unless the caller
    # supplied a model and/or provider override.
    evaluation_config = None
    override_pairs = [
        ("model_name", evaluation_model),
        ("provider", evaluation_provider),
    ]
    if any(value for _, value in override_pairs):
        evaluation_config = {key: value for key, value in override_pairs if value}

    return run_browsecomp_benchmark(
        num_examples=num_examples,
        search_config=search_config,
        evaluation_config=evaluation_config,
        human_evaluation=human_evaluation,
        output_dir=output_dir,
    )

127 

128 

def evaluate_xbench_deepsearch(
    num_examples: int = 100,
    search_iterations: int = 4,
    questions_per_iteration: int = 3,
    search_tool: str = "searxng",
    human_evaluation: bool = False,
    evaluation_model: str | None = None,
    evaluation_provider: str | None = None,
    output_dir: str = "benchmark_results",
) -> dict[str, Any]:
    """
    Run the xbench-DeepSearch benchmark and return its results.

    Args:
        num_examples: Number of examples to evaluate (default 100 - full dataset)
        search_iterations: Number of search iterations per query
        questions_per_iteration: Number of questions per iteration
        search_tool: Search engine to use (e.g., 'searxng', 'wikipedia')
        human_evaluation: Whether to use human evaluation
        evaluation_model: Optional custom model for evaluation
        evaluation_provider: Optional custom provider for evaluation
        output_dir: Directory to save results

    Returns:
        Dictionary with benchmark results
    """
    logger.info(
        f"Starting xbench-DeepSearch benchmark with {num_examples} examples"
    )

    # Search behaviour forwarded to the benchmark runner.
    search_config = {
        "iterations": search_iterations,
        "questions_per_iteration": questions_per_iteration,
        "search_tool": search_tool,
    }

    # Assemble evaluation overrides only when something was actually set;
    # otherwise keep None so the runner falls back to its defaults.
    if evaluation_model or evaluation_provider:
        evaluation_config: dict[str, Any] | None = {}
        if evaluation_model:
            evaluation_config["model_name"] = evaluation_model
        if evaluation_provider:
            evaluation_config["provider"] = evaluation_provider
    else:
        evaluation_config = None

    return run_xbench_deepsearch_benchmark(
        num_examples=num_examples,
        search_config=search_config,
        evaluation_config=evaluation_config,
        human_evaluation=human_evaluation,
        output_dir=output_dir,
    )

183 

184 

def get_available_benchmarks() -> list[dict[str, Any]]:
    """
    Get information about available benchmarks.

    Returns:
        List of dictionaries with benchmark information. Each entry has
        string "id", "name" and "description" fields plus an integer
        "recommended_examples" field (hence the dict[str, Any] value type).
    """
    return [
        {
            "id": "simpleqa",
            "name": "SimpleQA",
            "description": "Benchmark for factual question answering",
            "recommended_examples": 100,
        },
        {
            "id": "browsecomp",
            "name": "BrowseComp",
            "description": "Benchmark for web browsing comprehension",
            "recommended_examples": 100,
        },
        {
            "id": "xbench_deepsearch",
            "name": "xbench-DeepSearch",
            "description": "Benchmark for deep search and investigation queries",
            "recommended_examples": 100,
        },
    ]

212 

213 

def compare_configurations(
    dataset_type: str = "simpleqa",
    num_examples: int = 20,
    configurations: list[dict[str, Any]] | None = None,
    output_dir: str = "benchmark_comparisons",
) -> dict[str, Any]:
    """
    Compare multiple search configurations on the same benchmark.

    Args:
        dataset_type: Type of dataset to use
        num_examples: Number of examples to evaluate
        configurations: List of search configurations to compare. Each dict
            may contain "name", "search_tool", "iterations" and
            "questions_per_iteration" plus arbitrary extra search settings;
            input dicts are not modified. When None or empty, a default
            trio of configurations is compared.
        output_dir: Directory to save results

    Returns:
        Dictionary with comparison results: status, dataset type, number of
        configurations tested, path to the markdown report, and the
        per-configuration benchmark results.
    """
    if not configurations:
        # Default configurations to compare
        configurations = [
            {
                "name": "Base Config",
                "search_tool": "searxng",
                "iterations": 1,
                "questions_per_iteration": 3,
            },
            {
                "name": "More Iterations",
                "search_tool": "searxng",
                "iterations": 3,
                "questions_per_iteration": 3,
            },
            {
                "name": "More Questions",
                "search_tool": "searxng",
                "iterations": 1,
                "questions_per_iteration": 5,
            },
        ]

    # Create output directory (pathlib instead of an ad-hoc os import).
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Run benchmarks for each configuration
    results = []
    for supplied_config in configurations:
        # Work on a copy: pop() would otherwise mutate the caller's dicts.
        config = dict(supplied_config)
        config_name = config.pop("name", f"Config-{len(results)}")

        logger.info(f"Running benchmark with configuration: {config_name}")

        search_config = {
            "iterations": config.pop("iterations", 1),
            "questions_per_iteration": config.pop("questions_per_iteration", 3),
            "search_tool": config.pop("search_tool", "searxng"),
        }

        # Add any remaining config items
        search_config.update(config)

        # Run benchmark with this configuration
        benchmark_result = run_benchmark(
            dataset_type=dataset_type,
            num_examples=num_examples,
            output_dir=str(Path(output_dir) / config_name.replace(" ", "_")),
            search_config=search_config,
            run_evaluation=True,
        )

        # Add configuration name to results
        benchmark_result["configuration_name"] = config_name
        benchmark_result["search_config"] = search_config

        results.append(benchmark_result)

    # Generate comparison report
    import time
    from ..security.file_write_verifier import write_file_verified

    timestamp = time.strftime("%Y%m%d_%H%M%S")
    report_file = str(
        Path(output_dir) / f"comparison_{dataset_type}_{timestamp}.md"
    )

    # Build report content
    content = f"# Configuration Comparison - {dataset_type.capitalize()}\n\n"

    # Write summary table
    content += "## Summary\n\n"
    content += "| Configuration | Accuracy | Avg. Time | Examples |\n"
    content += "|---------------|----------|-----------|----------|\n"

    for result in results:
        accuracy = result.get("metrics", {}).get("accuracy", 0)
        avg_time = result.get("metrics", {}).get("average_processing_time", 0)
        examples = result.get("total_examples", 0)

        content += f"| {result['configuration_name']} | {accuracy:.3f} | {avg_time:.2f}s | {examples} |\n"

    content += "\n## Configuration Details\n\n"

    for result in results:
        content += f"### {result['configuration_name']}\n\n"

        config = result.get("search_config", {})
        content += "```\n"
        for key, value in config.items():
            content += f"{key}: {value}\n"
        content += "```\n\n"

    # Writes go through the project's verified-write gate rather than open().
    write_file_verified(
        report_file,
        content,
        "benchmark.allow_file_output",
        context="benchmark comparison report",
    )

    logger.info(f"Comparison report saved to {report_file}")

    return {
        "status": "complete",
        "dataset_type": dataset_type,
        "configurations_tested": len(configurations),
        "report_path": report_file,
        "results": results,
    }

342 

343 

# Export the API functions.
# Note: calculate_metrics, generate_report and run_benchmark are re-exported
# from the ..benchmarks package for advanced/programmatic use.
__all__ = [
    "calculate_metrics",
    "compare_configurations",
    "evaluate_browsecomp",
    "evaluate_simpleqa",
    "evaluate_xbench_deepsearch",
    "generate_report",
    "get_available_benchmarks",
    "run_benchmark",  # For advanced users
]