Coverage for src/local_deep_research/benchmarks/benchmark_functions.py: 95%

131 statements  

coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1""" 

2API functions for benchmarking. 

3 

4This module provides functions for running benchmarks programmatically. 

5""" 

6 

7from pathlib import Path 

8from typing import Any, Dict, List, Optional 

9 

10from loguru import logger 

11 

12from ..config.thread_settings import get_setting_from_snapshot 

13from ..llm.providers.base import normalize_provider 

14 

15from ..benchmarks import ( 

16 calculate_metrics, 

17 generate_report, 

18 run_benchmark, 

19 run_browsecomp_benchmark, 

20 run_simpleqa_benchmark, 

21 run_xbench_deepsearch_benchmark, 

22) 

23 

24 

25def evaluate_simpleqa( 

26 num_examples: int = 100, 

27 search_iterations: int = 3, 

28 questions_per_iteration: int = 3, 

29 search_tool: str = "searxng", 

30 human_evaluation: bool = False, 

31 evaluation_model: Optional[str] = None, 

32 evaluation_provider: Optional[str] = None, 

33 output_dir: str = "benchmark_results", 

34 search_model: Optional[str] = None, 

35 search_provider: Optional[str] = None, 

36 endpoint_url: Optional[str] = None, 

37 search_strategy: str = "source_based", 

38) -> Dict[str, Any]: 

39 """ 

40 Run SimpleQA benchmark evaluation. 

41 

42 Args: 

43 num_examples: Number of examples to evaluate 

44 search_iterations: Number of search iterations per query 

45 questions_per_iteration: Number of questions per iteration 

46 search_tool: Search engine to use (e.g., 'searxng', 'wikipedia') 

47 human_evaluation: Whether to use human evaluation 

48 evaluation_model: Optional custom model for evaluation 

49 evaluation_provider: Optional custom provider for evaluation 

50 output_dir: Directory to save results 

51 search_model: Optional model to use for the search system 

52 search_provider: Optional provider to use for the search system 

53 endpoint_url: Optional endpoint URL for OpenRouter or other API services 

54 search_strategy: Search strategy to use (default: 'source_based') 

55 

56 Returns: 

57 Dictionary with benchmark results 

58 """ 

59 logger.info(f"Starting SimpleQA benchmark with {num_examples} examples") 

60 

61 # Set up search configuration 

62 search_config = { 

63 "iterations": search_iterations, 

64 "questions_per_iteration": questions_per_iteration, 

65 "search_tool": search_tool, 

66 "search_strategy": search_strategy, 

67 } 

68 

69 # Add model configurations if provided 

70 if search_model: 

71 search_config["model_name"] = search_model 

72 if search_provider: 

73 search_config["provider"] = search_provider 

74 if endpoint_url: 

75 search_config["openai_endpoint_url"] = endpoint_url 

76 

77 # Check settings for additional configuration 

78 if env_model := get_setting_from_snapshot("llm.model"): 

79 search_config["model_name"] = env_model 

80 if env_provider := get_setting_from_snapshot("llm.provider"): 

81 search_config["provider"] = env_provider 

82 if env_url := get_setting_from_snapshot("llm.openai_endpoint.url"): 

83 search_config["openai_endpoint_url"] = env_url 

84 

85 # Set up evaluation configuration if needed 

86 evaluation_config = None 

87 if evaluation_model or evaluation_provider: 

88 evaluation_config = { 

89 "temperature": 0 # Always use zero temperature for evaluation 

90 } 

91 if evaluation_model:    [branch 91 ↛ 93: line 91 didn't jump to line 93 because the condition on line 91 was always true]

92 evaluation_config["model_name"] = evaluation_model 

93 if evaluation_provider: 

94 evaluation_provider = normalize_provider(evaluation_provider) 

95 evaluation_config["provider"] = evaluation_provider 

96 # Add endpoint URL if using openai_endpoint 

97 if evaluation_provider == "openai_endpoint" and endpoint_url: 

98 evaluation_config["openai_endpoint_url"] = endpoint_url 

99 elif evaluation_provider == "openai_endpoint" and env_url: 

100 evaluation_config["openai_endpoint_url"] = env_url 

101 

102 # Run the benchmark 

103 return run_simpleqa_benchmark( 

104 num_examples=num_examples, 

105 output_dir=output_dir, 

106 search_config=search_config, 

107 evaluation_config=evaluation_config, 

108 human_evaluation=human_evaluation, 

109 ) 

110 
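As an illustration only, here is a minimal sketch of calling this function from user code. The module path, model name, and endpoint URL below are assumptions made for the example, not values taken from this file:

from local_deep_research.benchmarks.benchmark_functions import evaluate_simpleqa

# Small smoke-test run; all argument values here are illustrative.
results = evaluate_simpleqa(
    num_examples=10,
    search_iterations=2,
    questions_per_iteration=3,
    search_tool="searxng",
    evaluation_model="gpt-4o-mini",              # hypothetical evaluator model
    evaluation_provider="openai_endpoint",
    endpoint_url="https://example.invalid/v1",   # placeholder endpoint URL
    output_dir="benchmark_results/simpleqa_smoke",
)
# The exact result keys depend on run_simpleqa_benchmark, so printing the
# whole dictionary is the safest assumption here.
print(results)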

111 

112def evaluate_browsecomp( 

113 num_examples: int = 100, 

114 search_iterations: int = 3, 

115 questions_per_iteration: int = 3, 

116 search_tool: str = "searxng", 

117 human_evaluation: bool = False, 

118 evaluation_model: Optional[str] = None, 

119 evaluation_provider: Optional[str] = None, 

120 output_dir: str = "benchmark_results", 

121 search_model: Optional[str] = None, 

122 search_provider: Optional[str] = None, 

123 endpoint_url: Optional[str] = None, 

124 search_strategy: str = "source_based", 

125) -> Dict[str, Any]: 

126 """ 

127 Run BrowseComp benchmark evaluation. 

128 

129 Args: 

130 num_examples: Number of examples to evaluate 

131 search_iterations: Number of search iterations per query 

132 questions_per_iteration: Number of questions per iteration 

133 search_tool: Search engine to use (e.g., 'searxng', 'wikipedia') 

134 human_evaluation: Whether to use human evaluation 

135 evaluation_model: Optional custom model for evaluation 

136 evaluation_provider: Optional custom provider for evaluation 

137 output_dir: Directory to save results 

138 search_model: Optional model to use for the search system 

139 search_provider: Optional provider to use for the search system 

140 endpoint_url: Optional endpoint URL for OpenRouter or other API services 

141 search_strategy: Search strategy to use (default: 'source_based') 

142 

143 Returns: 

144 Dictionary with benchmark results 

145 """ 

146 logger.info(f"Starting BrowseComp benchmark with {num_examples} examples") 

147 

148 # Set up search configuration 

149 search_config = { 

150 "iterations": search_iterations, 

151 "questions_per_iteration": questions_per_iteration, 

152 "search_tool": search_tool, 

153 "search_strategy": search_strategy, 

154 } 

155 

156 # Add model configurations if provided 

157 if search_model: 

158 search_config["model_name"] = search_model 

159 if search_provider: 

160 search_config["provider"] = search_provider 

161 if endpoint_url: 

162 search_config["openai_endpoint_url"] = endpoint_url 

163 

164 # Check settings for additional configuration 

165 if env_model := get_setting_from_snapshot("llm.model"):    [branch 165 ↛ 166: line 165 didn't jump to line 166 because the condition on line 165 was never true]

166 search_config["model_name"] = env_model 

167 if env_provider := get_setting_from_snapshot("llm.provider"):    [branch 167 ↛ 168: line 167 didn't jump to line 168 because the condition on line 167 was never true]

168 search_config["provider"] = env_provider 

169 if env_url := get_setting_from_snapshot("llm.openai_endpoint.url"): 

170 search_config["openai_endpoint_url"] = env_url 

171 

172 # Set up evaluation configuration if needed 

173 evaluation_config = None 

174 if evaluation_model or evaluation_provider: 

175 evaluation_config = { 

176 "temperature": 0 # Always use zero temperature for evaluation 

177 } 

178 if evaluation_model: 

179 evaluation_config["model_name"] = evaluation_model 

180 if evaluation_provider:    [branch 180 ↛ 190: line 180 didn't jump to line 190 because the condition on line 180 was always true]

181 evaluation_provider = normalize_provider(evaluation_provider) 

182 evaluation_config["provider"] = evaluation_provider 

183 # Add endpoint URL if using openai_endpoint 

184 if evaluation_provider == "openai_endpoint" and endpoint_url: 

185 evaluation_config["openai_endpoint_url"] = endpoint_url 

186 elif evaluation_provider == "openai_endpoint" and env_url: 

187 evaluation_config["openai_endpoint_url"] = env_url 

188 

189 # Run the benchmark 

190 return run_browsecomp_benchmark( 

191 num_examples=num_examples, 

192 output_dir=output_dir, 

193 search_config=search_config, 

194 evaluation_config=evaluation_config, 

195 human_evaluation=human_evaluation, 

196 ) 

197 
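A second sketch, this time exercising the search-model overrides. The model and provider names are assumptions chosen for illustration; note that the llm.model and llm.provider settings, if present in the settings snapshot, take precedence over these arguments:

from local_deep_research.benchmarks.benchmark_functions import evaluate_browsecomp

results = evaluate_browsecomp(
    num_examples=25,
    search_tool="searxng",
    search_model="llama3",        # hypothetical local model for the search system
    search_provider="ollama",     # passed through into search_config["provider"]
    evaluation_model="gpt-4o",    # hypothetical grader; triggers evaluation_config
    evaluation_provider="openai",
    output_dir="benchmark_results/browsecomp",
)
print(results)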

198 

199def evaluate_xbench_deepsearch( 

200 num_examples: int = 100, 

201 search_iterations: int = 4, 

202 questions_per_iteration: int = 3, 

203 search_tool: str = "searxng", 

204 human_evaluation: bool = False, 

205 evaluation_model: Optional[str] = None, 

206 evaluation_provider: Optional[str] = None, 

207 output_dir: str = "benchmark_results", 

208 search_model: Optional[str] = None, 

209 search_provider: Optional[str] = None, 

210 endpoint_url: Optional[str] = None, 

211 search_strategy: str = "source_based", 

212) -> Dict[str, Any]: 

213 """ 

214 Run xbench-DeepSearch benchmark evaluation. 

215 

216 Args: 

217 num_examples: Number of examples to evaluate (default 100, the full dataset) 

218 search_iterations: Number of search iterations per query 

219 questions_per_iteration: Number of questions per iteration 

220 search_tool: Search engine to use 

221 human_evaluation: Whether to use human evaluation 

222 evaluation_model: Optional custom model for evaluation 

223 evaluation_provider: Optional custom provider for evaluation 

224 output_dir: Directory to save results 

225 search_model: Optional model to use for the search system 

226 search_provider: Optional provider to use for the search system 

227 endpoint_url: Optional endpoint URL for API services 

228 search_strategy: Search strategy to use 

229 

230 Returns: 

231 Dictionary with benchmark results 

232 """ 

233 logger.info( 

234 f"Starting xbench-DeepSearch benchmark with {num_examples} examples" 

235 ) 

236 

237 # Set up search configuration 

238 search_config = { 

239 "iterations": search_iterations, 

240 "questions_per_iteration": questions_per_iteration, 

241 "search_tool": search_tool, 

242 "search_strategy": search_strategy, 

243 } 

244 

245 # Add model configurations if provided 

246 if search_model: 

247 search_config["model_name"] = search_model 

248 if search_provider: 

249 search_config["provider"] = search_provider 

250 if endpoint_url: 

251 search_config["openai_endpoint_url"] = endpoint_url 

252 

253 # Check settings for additional configuration 

254 if env_model := get_setting_from_snapshot("llm.model"): 

255 search_config["model_name"] = env_model 

256 if env_provider := get_setting_from_snapshot("llm.provider"): 

257 search_config["provider"] = env_provider 

258 if env_url := get_setting_from_snapshot("llm.openai_endpoint.url"): 

259 search_config["openai_endpoint_url"] = env_url 

260 

261 # Set up evaluation configuration if needed 

262 evaluation_config = None 

263 if evaluation_model or evaluation_provider: 

264 evaluation_config = { 

265 "temperature": 0 # Always use zero temperature for evaluation 

266 } 

267 if evaluation_model:    [branch 267 ↛ 269: line 267 didn't jump to line 269 because the condition on line 267 was always true]

268 evaluation_config["model_name"] = evaluation_model 

269 if evaluation_provider:    [branch 269 ↛ 279: line 269 didn't jump to line 279 because the condition on line 269 was always true]

270 evaluation_provider = normalize_provider(evaluation_provider) 

271 evaluation_config["provider"] = evaluation_provider 

272 # Add endpoint URL if using openai_endpoint 

273 if evaluation_provider == "openai_endpoint" and endpoint_url:    [branch 273 ↛ 274: line 273 didn't jump to line 274 because the condition on line 273 was never true]

274 evaluation_config["openai_endpoint_url"] = endpoint_url 

275 elif evaluation_provider == "openai_endpoint" and env_url: 

276 evaluation_config["openai_endpoint_url"] = env_url 

277 

278 # Run the benchmark 

279 return run_xbench_deepsearch_benchmark( 

280 num_examples=num_examples, 

281 output_dir=output_dir, 

282 search_config=search_config, 

283 evaluation_config=evaluation_config, 

284 human_evaluation=human_evaluation, 

285 ) 

286 

287 

288def get_available_benchmarks() -> List[Dict[str, Any]]: 

289 """ 

290 Get information about available benchmarks. 

291 

292 Returns: 

293 List of dictionaries with benchmark information 

294 """ 

295 return [ 

296 { 

297 "id": "simpleqa", 

298 "name": "SimpleQA", 

299 "description": "Benchmark for factual question answering", 

300 "recommended_examples": 100, 

301 }, 

302 { 

303 "id": "browsecomp", 

304 "name": "BrowseComp", 

305 "description": "Benchmark for web browsing comprehension", 

306 "recommended_examples": 100, 

307 }, 

308 { 

309 "id": "xbench_deepsearch", 

310 "name": "xbench-DeepSearch", 

311 "description": "Deep research and search capability evaluation", 

312 "recommended_examples": 100, 

313 }, 

314 ] 

315 
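A small sketch of enumerating the registry above, for example to populate a CLI menu; nothing beyond the keys returned by this function is assumed:

from local_deep_research.benchmarks.benchmark_functions import get_available_benchmarks

for bench in get_available_benchmarks():
    print(
        f"{bench['id']}: {bench['name']} - {bench['description']} "
        f"(recommended examples: {bench['recommended_examples']})"
    )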

316 

317def compare_configurations( 

318 dataset_type: str = "simpleqa", 

319 num_examples: int = 20, 

320 configurations: Optional[List[Dict[str, Any]]] = None, 

321 output_dir: str = "benchmark_comparisons", 

322) -> Dict[str, Any]: 

323 """ 

324 Compare multiple search configurations on the same benchmark. 

325 

326 Args: 

327 dataset_type: Type of dataset to use 

328 num_examples: Number of examples to evaluate 

329 configurations: List of search configurations to compare (three defaults are used if omitted) 

330 output_dir: Directory to save results 

331 

332 Returns: 

333 Dictionary with comparison results 

334 """ 

335 if not configurations: 

336 # Default configurations to compare 

337 configurations = [ 

338 { 

339 "name": "Base Config", 

340 "search_tool": "searxng", 

341 "iterations": 1, 

342 "questions_per_iteration": 3, 

343 }, 

344 { 

345 "name": "More Iterations", 

346 "search_tool": "searxng", 

347 "iterations": 3, 

348 "questions_per_iteration": 3, 

349 }, 

350 { 

351 "name": "More Questions", 

352 "search_tool": "searxng", 

353 "iterations": 1, 

354 "questions_per_iteration": 5, 

355 }, 

356 ] 

357 

358 # Create output directory 

359 

360 Path(output_dir).mkdir(parents=True, exist_ok=True) 

361 

362 # Run benchmarks for each configuration 

363 results = [] 

364 for config in configurations: 

365 config_name = config.pop("name", f"Config-{len(results)}") 

366 

367 logger.info(f"Running benchmark with configuration: {config_name}") 

368 

369 search_config = { 

370 "iterations": config.pop("iterations", 1), 

371 "questions_per_iteration": config.pop("questions_per_iteration", 3), 

372 "search_tool": config.pop("search_tool", "searxng"), 

373 } 

374 

375 # Add any remaining config items 

376 search_config.update(config) 

377 

378 # Run benchmark with this configuration 

379 benchmark_result = run_benchmark( 

380 dataset_type=dataset_type, 

381 num_examples=num_examples, 

382 output_dir=str(Path(output_dir) / config_name.replace(" ", "_")), 

383 search_config=search_config, 

384 run_evaluation=True, 

385 ) 

386 

387 # Add configuration name to results 

388 benchmark_result["configuration_name"] = config_name 

389 benchmark_result["search_config"] = search_config 

390 

391 results.append(benchmark_result) 

392 

393 # Generate comparison report 

394 import time 

395 

396 from ..security.file_write_verifier import write_file_verified 

397 

398 timestamp = time.strftime("%Y%m%d_%H%M%S") 

399 report_file = str( 

400 Path(output_dir) / f"comparison_{dataset_type}_{timestamp}.md" 

401 ) 

402 

403 # Build report content 

404 content = f"# Configuration Comparison - {dataset_type.capitalize()}\n\n" 

405 

406 # Write summary table 

407 content += "## Summary\n\n" 

408 content += "| Configuration | Accuracy | Avg. Time | Examples |\n" 

409 content += "|---------------|----------|-----------|----------|\n" 

410 

411 for result in results: 

412 accuracy = result.get("metrics", {}).get("accuracy", 0) 

413 avg_time = result.get("metrics", {}).get("average_processing_time", 0) 

414 examples = result.get("total_examples", 0) 

415 

416 content += f"| {result['configuration_name']} | {accuracy:.3f} | {avg_time:.2f}s | {examples} |\n" 

417 

418 content += "\n## Configuration Details\n\n" 

419 

420 for result in results: 

421 content += f"### {result['configuration_name']}\n\n" 

422 

423 config = result.get("search_config", {}) 

424 content += "```\n" 

425 for key, value in config.items(): 

426 content += f"{key}: {value}\n" 

427 content += "```\n\n" 

428 

429 write_file_verified( 

430 report_file, 

431 content, 

432 "benchmark.allow_file_output", 

433 context="benchmark comparison report", 

434 ) 

435 

436 logger.info(f"Comparison report saved to {report_file}") 

437 

438 return { 

439 "status": "complete", 

440 "dataset_type": dataset_type, 

441 "configurations_tested": len(configurations), 

442 "report_path": report_file, 

443 "results": results, 

444 } 

445 
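A sketch of a custom comparison run. The configuration names and values are illustrative, but the dictionary keys ('name', 'search_tool', 'iterations', 'questions_per_iteration') match what the function pops above, and 'report_path' is part of its return value:

from local_deep_research.benchmarks.benchmark_functions import compare_configurations

comparison = compare_configurations(
    dataset_type="simpleqa",
    num_examples=10,
    configurations=[
        {"name": "Wikipedia only", "search_tool": "wikipedia", "iterations": 1},
        {"name": "SearXNG deep", "search_tool": "searxng", "iterations": 3,
         "questions_per_iteration": 5},
    ],
    output_dir="benchmark_comparisons",
)
print(comparison["report_path"])  # path to the generated markdown comparison report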

446 

447# Export the API functions 

448__all__ = [ 

449 "calculate_metrics", 

450 "compare_configurations", 

451 "evaluate_browsecomp", 

452 "evaluate_simpleqa", 

453 "generate_report", 

454 "get_available_benchmarks", 

455 "run_benchmark", # For advanced users 

456]