Coverage for src/local_deep_research/benchmarks/benchmark_functions.py: 95%

131 statements  

coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1""" 

2API functions for benchmarking. 

3 

4This module provides functions for running benchmarks programmatically. 

5""" 

6 

7from pathlib import Path 

8from typing import Any, Dict, List, Optional 

9 

10from loguru import logger 

11 

12from ..config.thread_settings import get_setting_from_snapshot 

13from ..llm.providers.base import normalize_provider 

14 

15from ..benchmarks import ( 

16 calculate_metrics, 

17 generate_report, 

18 run_benchmark, 

19 run_browsecomp_benchmark, 

20 run_simpleqa_benchmark, 

21 run_xbench_deepsearch_benchmark, 

22) 

23 

24 

25def evaluate_simpleqa( 

26 num_examples: int = 100, 

27 search_iterations: int = 3, 

28 questions_per_iteration: int = 3, 

29 search_tool: str = "searxng", 

30 human_evaluation: bool = False, 

31 evaluation_model: Optional[str] = None, 

32 evaluation_provider: Optional[str] = None, 

33 output_dir: str = "benchmark_results", 

34 search_model: Optional[str] = None, 

35 search_provider: Optional[str] = None, 

36 endpoint_url: Optional[str] = None, 

37 search_strategy: str = "source_based", 

38) -> Dict[str, Any]: 

39 """ 

40 Run SimpleQA benchmark evaluation. 

41 

42 Args: 

43 num_examples: Number of examples to evaluate 

44 search_iterations: Number of search iterations per query 

45 questions_per_iteration: Number of questions per iteration 

46 search_tool: Search engine to use (e.g., 'searxng', 'wikipedia') 

47 human_evaluation: Whether to use human evaluation 

48 evaluation_model: Optional custom model for evaluation 

49 evaluation_provider: Optional custom provider for evaluation 

50 output_dir: Directory to save results 

51 search_model: Optional model to use for the search system 

52 search_provider: Optional provider to use for the search system 

53 endpoint_url: Optional endpoint URL for OpenRouter or other API services 

54 search_strategy: Search strategy to use (default: 'source_based') 

55 

56 Returns: 

57 Dictionary with benchmark results 

58 """ 

59 logger.info(f"Starting SimpleQA benchmark with {num_examples} examples") 

60 

61 # Set up search configuration 

62 search_config = { 

63 "iterations": search_iterations, 

64 "questions_per_iteration": questions_per_iteration, 

65 "search_tool": search_tool, 

66 "search_strategy": search_strategy, 

67 } 

68 

69 # Add model configurations if provided 

70 if search_model: 

71 search_config["model_name"] = search_model 

72 if search_provider: 

73 search_config["provider"] = search_provider 

74 if endpoint_url: 

75 search_config["openai_endpoint_url"] = endpoint_url 

76 

77 # Check settings for additional configuration 

78 if env_model := get_setting_from_snapshot("llm.model"): 

79 search_config["model_name"] = env_model 

80 if env_provider := get_setting_from_snapshot("llm.provider"): 

81 search_config["provider"] = env_provider 

82 if env_url := get_setting_from_snapshot("llm.openai_endpoint.url"): 

83 search_config["openai_endpoint_url"] = env_url 

84 

85 # Set up evaluation configuration if needed 

86 evaluation_config = None 

87 if evaluation_model or evaluation_provider: 

88 evaluation_config = { 

89 "temperature": 0 # Always use zero temperature for evaluation 

90 } 

91 if evaluation_model:    [branch 91 ↛ 93: line 91 didn't jump to line 93 because the condition on line 91 was always true]

92 evaluation_config["model_name"] = evaluation_model 

93 if evaluation_provider: 

94 evaluation_provider = normalize_provider(evaluation_provider) 

95 evaluation_config["provider"] = evaluation_provider 

96 # Add endpoint URL if using openai_endpoint 

97 if evaluation_provider == "openai_endpoint" and endpoint_url: 

98 evaluation_config["openai_endpoint_url"] = endpoint_url 

99 elif evaluation_provider == "openai_endpoint" and env_url: 

100 evaluation_config["openai_endpoint_url"] = env_url 

101 

102 # Run the benchmark 

103 return run_simpleqa_benchmark( 

104 num_examples=num_examples, 

105 output_dir=output_dir, 

106 search_config=search_config, 

107 evaluation_config=evaluation_config, 

108 human_evaluation=human_evaluation, 

109 ) 

110 
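As an illustration only, here is a minimal sketch of calling this function from user code. The module path, model name, and endpoint URL below are assumptions made for the example, not values taken from this file:

from local_deep_research.benchmarks.benchmark_functions import evaluate_simpleqa

# Small smoke-test run; all argument values here are illustrative.
results = evaluate_simpleqa(
    num_examples=10,
    search_iterations=2,
    questions_per_iteration=3,
    search_tool="searxng",
    evaluation_model="gpt-4o-mini",              # hypothetical evaluator model
    evaluation_provider="openai_endpoint",
    endpoint_url="https://example.invalid/v1",   # placeholder endpoint URL
    output_dir="benchmark_results/simpleqa_smoke",
)
# The exact result keys depend on run_simpleqa_benchmark, so printing the
# whole dictionary is the safest assumption here.
print(results)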

111 

112def evaluate_browsecomp( 

113 num_examples: int = 100, 

114 search_iterations: int = 3, 

115 questions_per_iteration: int = 3, 

116 search_tool: str = "searxng", 

117 human_evaluation: bool = False, 

118 evaluation_model: Optional[str] = None, 

119 evaluation_provider: Optional[str] = None, 

120 output_dir: str = "benchmark_results", 

121 search_model: Optional[str] = None, 

122 search_provider: Optional[str] = None, 

123 endpoint_url: Optional[str] = None, 

124 search_strategy: str = "source_based", 

125) -> Dict[str, Any]: 

126 """ 

127 Run BrowseComp benchmark evaluation. 

128 

129 Args: 

130 num_examples: Number of examples to evaluate 

131 search_iterations: Number of search iterations per query 

132 questions_per_iteration: Number of questions per iteration 

133 search_tool: Search engine to use (e.g., 'searxng', 'wikipedia') 

134 human_evaluation: Whether to use human evaluation 

135 evaluation_model: Optional custom model for evaluation 

136 evaluation_provider: Optional custom provider for evaluation 

137 output_dir: Directory to save results 

138 search_model: Optional model to use for the search system 

139 search_provider: Optional provider to use for the search system 

140 endpoint_url: Optional endpoint URL for OpenRouter or other API services 

141 search_strategy: Search strategy to use (default: 'source_based') 

142 

143 Returns: 

144 Dictionary with benchmark results 

145 """ 

146 logger.info(f"Starting BrowseComp benchmark with {num_examples} examples") 

147 

148 # Set up search configuration 

149 search_config = { 

150 "iterations": search_iterations, 

151 "questions_per_iteration": questions_per_iteration, 

152 "search_tool": search_tool, 

153 "search_strategy": search_strategy, 

154 } 

155 

156 # Add model configurations if provided 

157 if search_model: 

158 search_config["model_name"] = search_model 

159 if search_provider: 

160 search_config["provider"] = search_provider 

161 if endpoint_url: 

162 search_config["openai_endpoint_url"] = endpoint_url 

163 

164 # Check settings for additional configuration 

165 if env_model := get_setting_from_snapshot("llm.model"):    [branch 165 ↛ 166: line 165 didn't jump to line 166 because the condition on line 165 was never true]

166 search_config["model_name"] = env_model 

167 if env_provider := get_setting_from_snapshot("llm.provider"):    [branch 167 ↛ 168: line 167 didn't jump to line 168 because the condition on line 167 was never true]

168 search_config["provider"] = env_provider 

169 if env_url := get_setting_from_snapshot("llm.openai_endpoint.url"): 

170 search_config["openai_endpoint_url"] = env_url 

171 

172 # Set up evaluation configuration if needed 

173 evaluation_config = None 

174 if evaluation_model or evaluation_provider: 

175 evaluation_config = { 

176 "temperature": 0 # Always use zero temperature for evaluation 

177 } 

178 if evaluation_model: 

179 evaluation_config["model_name"] = evaluation_model 

180 if evaluation_provider:    [branch 180 ↛ 190: line 180 didn't jump to line 190 because the condition on line 180 was always true]

181 evaluation_provider = normalize_provider(evaluation_provider) 

182 evaluation_config["provider"] = evaluation_provider 

183 # Add endpoint URL if using openai_endpoint 

184 if evaluation_provider == "openai_endpoint" and endpoint_url: 

185 evaluation_config["openai_endpoint_url"] = endpoint_url 

186 elif evaluation_provider == "openai_endpoint" and env_url: 

187 evaluation_config["openai_endpoint_url"] = env_url 

188 

189 # Run the benchmark 

190 return run_browsecomp_benchmark( 

191 num_examples=num_examples, 

192 output_dir=output_dir, 

193 search_config=search_config, 

194 evaluation_config=evaluation_config, 

195 human_evaluation=human_evaluation, 

196 ) 

197 
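A second sketch, this time exercising the search-model overrides. The model and provider names are assumptions chosen for illustration; note that the llm.model and llm.provider settings, if present in the settings snapshot, take precedence over these arguments:

from local_deep_research.benchmarks.benchmark_functions import evaluate_browsecomp

results = evaluate_browsecomp(
    num_examples=25,
    search_tool="searxng",
    search_model="llama3",        # hypothetical local model for the search system
    search_provider="ollama",     # passed through into search_config["provider"]
    evaluation_model="gpt-4o",    # hypothetical grader; triggers evaluation_config
    evaluation_provider="openai",
    output_dir="benchmark_results/browsecomp",
)
print(results)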

198 

199def evaluate_xbench_deepsearch( 

200 num_examples: int = 100, 

201 search_iterations: int = 4, 

202 questions_per_iteration: int = 3, 

203 search_tool: str = "searxng", 

204 human_evaluation: bool = False, 

205 evaluation_model: Optional[str] = None, 

206 evaluation_provider: Optional[str] = None, 

207 output_dir: str = "benchmark_results", 

208 search_model: Optional[str] = None, 

209 search_provider: Optional[str] = None, 

210 endpoint_url: Optional[str] = None, 

211 search_strategy: str = "source_based", 

212) -> Dict[str, Any]: 

213 """ 

214 Run xbench-DeepSearch benchmark evaluation. 

215 

216 Args: 

217 num_examples: Number of examples to evaluate (default 100, the full dataset) 

218 search_iterations: Number of search iterations per query 

219 questions_per_iteration: Number of questions per iteration 

220 search_tool: Search engine to use 

221 human_evaluation: Whether to use human evaluation 

222 evaluation_model: Optional custom model for evaluation 

223 evaluation_provider: Optional custom provider for evaluation 

224 output_dir: Directory to save results 

225 search_model: Optional model to use for the search system 

226 search_provider: Optional provider to use for the search system 

227 endpoint_url: Optional endpoint URL for API services 

228 search_strategy: Search strategy to use 

229 

230 Returns: 

231 Dictionary with benchmark results 

232 """ 

233 logger.info( 

234 f"Starting xbench-DeepSearch benchmark with {num_examples} examples" 

235 ) 

236 

237 # Set up search configuration 

238 search_config = { 

239 "iterations": search_iterations, 

240 "questions_per_iteration": questions_per_iteration, 

241 "search_tool": search_tool, 

242 "search_strategy": search_strategy, 

243 } 

244 

245 # Add model configurations if provided 

246 if search_model: 

247 search_config["model_name"] = search_model 

248 if search_provider: 

249 search_config["provider"] = search_provider 

250 if endpoint_url: 

251 search_config["openai_endpoint_url"] = endpoint_url 

252 

253 # Check settings for additional configuration 

254 if env_model := get_setting_from_snapshot("llm.model"): 

255 search_config["model_name"] = env_model 

256 if env_provider := get_setting_from_snapshot("llm.provider"): 

257 search_config["provider"] = env_provider 

258 if env_url := get_setting_from_snapshot("llm.openai_endpoint.url"): 

259 search_config["openai_endpoint_url"] = env_url 

260 

261 # Set up evaluation configuration if needed 

262 evaluation_config = None 

263 if evaluation_model or evaluation_provider: 

264 evaluation_config = { 

265 "temperature": 0 # Always use zero temperature for evaluation 

266 } 

267 if evaluation_model:    [branch 267 ↛ 269: line 267 didn't jump to line 269 because the condition on line 267 was always true]

268 evaluation_config["model_name"] = evaluation_model 

269 if evaluation_provider:    [branch 269 ↛ 279: line 269 didn't jump to line 279 because the condition on line 269 was always true]

270 evaluation_provider = normalize_provider(evaluation_provider) 

271 evaluation_config["provider"] = evaluation_provider 

272 # Add endpoint URL if using openai_endpoint 

273 if evaluation_provider == "openai_endpoint" and endpoint_url:    [branch 273 ↛ 274: line 273 didn't jump to line 274 because the condition on line 273 was never true]

274 evaluation_config["openai_endpoint_url"] = endpoint_url 

275 elif evaluation_provider == "openai_endpoint" and env_url: 

276 evaluation_config["openai_endpoint_url"] = env_url 

277 

278 # Run the benchmark 

279 return run_xbench_deepsearch_benchmark( 

280 num_examples=num_examples, 

281 output_dir=output_dir, 

282 search_config=search_config, 

283 evaluation_config=evaluation_config, 

284 human_evaluation=human_evaluation, 

285 ) 

286 

287 

288def get_available_benchmarks() -> List[Dict[str, Any]]: 

289 """ 

290 Get information about available benchmarks. 

291 

292 Returns: 

293 List of dictionaries with benchmark information 

294 """ 

295 return [ 

296 { 

297 "id": "simpleqa", 

298 "name": "SimpleQA", 

299 "description": "Benchmark for factual question answering", 

300 "recommended_examples": 100, 

301 }, 

302 { 

303 "id": "browsecomp", 

304 "name": "BrowseComp", 

305 "description": "Benchmark for web browsing comprehension", 

306 "recommended_examples": 100, 

307 }, 

308 { 

309 "id": "xbench_deepsearch", 

310 "name": "xbench-DeepSearch", 

311 "description": "Deep research and search capability evaluation", 

312 "recommended_examples": 100, 

313 }, 

314 ] 

315 
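A small sketch of enumerating the registry above, for example to populate a CLI menu; nothing beyond the keys returned by this function is assumed:

from local_deep_research.benchmarks.benchmark_functions import get_available_benchmarks

for bench in get_available_benchmarks():
    print(
        f"{bench['id']}: {bench['name']} - {bench['description']} "
        f"(recommended examples: {bench['recommended_examples']})"
    )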

316 

317def compare_configurations( 

318 dataset_type: str = "simpleqa", 

319 num_examples: int = 20, 

320 configurations: Optional[List[Dict[str, Any]]] = None, 

321 output_dir: str = "benchmark_comparisons", 

322) -> Dict[str, Any]: 

323 """ 

324 Compare multiple search configurations on the same benchmark. 

325 

326 Args: 

327 dataset_type: Type of dataset to use 

328 num_examples: Number of examples to evaluate 

329 configurations: List of search configurations to compare (three defaults are used if omitted) 

330 output_dir: Directory to save results 

331 

332 Returns: 

333 Dictionary with comparison results 

334 """ 

335 if not configurations: 

336 # Default configurations to compare 

337 configurations = [ 

338 { 

339 "name": "Base Config", 

340 "search_tool": "searxng", 

341 "iterations": 1, 

342 "questions_per_iteration": 3, 

343 }, 

344 { 

345 "name": "More Iterations", 

346 "search_tool": "searxng", 

347 "iterations": 3, 

348 "questions_per_iteration": 3, 

349 }, 

350 { 

351 "name": "More Questions", 

352 "search_tool": "searxng", 

353 "iterations": 1, 

354 "questions_per_iteration": 5, 

355 }, 

356 ] 

357 

358 # Create output directory 

359 

360 Path(output_dir).mkdir(parents=True, exist_ok=True) 

361 

362 # Run benchmarks for each configuration 

363 results = [] 

364 for config in configurations: 

365 config_name = config.pop("name", f"Config-{len(results)}") 

366 

367 logger.info(f"Running benchmark with configuration: {config_name}") 

368 

369 search_config = { 

370 "iterations": config.pop("iterations", 1), 

371 "questions_per_iteration": config.pop("questions_per_iteration", 3), 

372 "search_tool": config.pop("search_tool", "searxng"), 

373 } 

374 

375 # Add any remaining config items 

376 search_config.update(config) 

377 

378 # Run benchmark with this configuration 

379 benchmark_result = run_benchmark( 

380 dataset_type=dataset_type, 

381 num_examples=num_examples, 

382 output_dir=str(Path(output_dir) / config_name.replace(" ", "_")), 

383 search_config=search_config, 

384 run_evaluation=True, 

385 ) 

386 

387 # Add configuration name to results 

388 benchmark_result["configuration_name"] = config_name 

389 benchmark_result["search_config"] = search_config 

390 

391 results.append(benchmark_result) 

392 

393 # Generate comparison report 

394 import time 

395 

396 from ..security.file_write_verifier import write_file_verified 

397 

398 timestamp = time.strftime("%Y%m%d_%H%M%S") 

399 report_file = str( 

400 Path(output_dir) / f"comparison_{dataset_type}_{timestamp}.md" 

401 ) 

402 

403 # Build report content 

404 content = f"# Configuration Comparison - {dataset_type.capitalize()}\n\n" 

405 

406 # Write summary table 

407 content += "## Summary\n\n" 

408 content += "| Configuration | Accuracy | Avg. Time | Examples |\n" 

409 content += "|---------------|----------|-----------|----------|\n" 

410 

411 for result in results: 

412 accuracy = result.get("metrics", {}).get("accuracy", 0) 

413 avg_time = result.get("metrics", {}).get("average_processing_time", 0) 

414 examples = result.get("total_examples", 0) 

415 

416 content += f"| {result['configuration_name']} | {accuracy:.3f} | {avg_time:.2f}s | {examples} |\n" 

417 

418 content += "\n## Configuration Details\n\n" 

419 

420 for result in results: 

421 content += f"### {result['configuration_name']}\n\n" 

422 

423 config = result.get("search_config", {}) 

424 content += "```\n" 

425 for key, value in config.items(): 

426 content += f"{key}: {value}\n" 

427 content += "```\n\n" 

428 

429 write_file_verified( 

430 report_file, 

431 content, 

432 "benchmark.allow_file_output", 

433 context="benchmark comparison report", 

434 ) 

435 

436 logger.info(f"Comparison report saved to {report_file}") 

437 

438 return { 

439 "status": "complete", 

440 "dataset_type": dataset_type, 

441 "configurations_tested": len(configurations), 

442 "report_path": report_file, 

443 "results": results, 

444 } 

445 
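A sketch of a custom comparison run. The configuration names and values are illustrative, but the dictionary keys ('name', 'search_tool', 'iterations', 'questions_per_iteration') match what the function pops above, and 'report_path' is part of its return value:

from local_deep_research.benchmarks.benchmark_functions import compare_configurations

comparison = compare_configurations(
    dataset_type="simpleqa",
    num_examples=10,
    configurations=[
        {"name": "Wikipedia only", "search_tool": "wikipedia", "iterations": 1},
        {"name": "SearXNG deep", "search_tool": "searxng", "iterations": 3,
         "questions_per_iteration": 5},
    ],
    output_dir="benchmark_comparisons",
)
print(comparison["report_path"])  # path to the generated markdown comparison report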

446 

447# Export the API functions 

448__all__ = [ 

449 "calculate_metrics", 

450 "compare_configurations", 

451 "evaluate_browsecomp", 

452 "evaluate_simpleqa", 

453 "generate_report", 

454 "get_available_benchmarks", 

455 "run_benchmark", # For advanced users 

456]