Coverage for src/local_deep_research/benchmarks/benchmark_functions.py: 0%

131 statements  

coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

1""" 

2API functions for benchmarking. 

3 

4This module provides functions for running benchmarks programmatically. 

5""" 

6 

7from pathlib import Path 

8from typing import Any, Dict, List, Optional 

9 

10from loguru import logger 

11 

12from ..config.thread_settings import get_setting_from_snapshot 

13 

14from ..benchmarks import ( 

15 calculate_metrics, 

16 generate_report, 

17 run_benchmark, 

18 run_browsecomp_benchmark, 

19 run_simpleqa_benchmark, 

20 run_xbench_deepsearch_benchmark, 

21) 

22 

23 

def evaluate_simpleqa(
    num_examples: int = 100,
    search_iterations: int = 3,
    questions_per_iteration: int = 3,
    search_tool: str = "searxng",
    human_evaluation: bool = False,
    evaluation_model: Optional[str] = None,
    evaluation_provider: Optional[str] = None,
    output_dir: str = "benchmark_results",
    search_model: Optional[str] = None,
    search_provider: Optional[str] = None,
    endpoint_url: Optional[str] = None,
    search_strategy: str = "source_based",
) -> Dict[str, Any]:
    """
    Run SimpleQA benchmark evaluation.

    Args:
        num_examples: Number of examples to evaluate
        search_iterations: Number of search iterations per query
        questions_per_iteration: Number of questions per iteration
        search_tool: Search engine to use (e.g., 'searxng', 'wikipedia')
        human_evaluation: Whether to use human evaluation
        evaluation_model: Optional custom model for evaluation
        evaluation_provider: Optional custom provider for evaluation
        output_dir: Directory to save results
        search_model: Optional model to use for the search system
        search_provider: Optional provider to use for the search system
        endpoint_url: Optional endpoint URL for OpenRouter or other API services
        search_strategy: Search strategy to use (default: 'source_based')

    Returns:
        Dictionary with benchmark results
    """
    logger.info(f"Starting SimpleQA benchmark with {num_examples} examples")

    # Set up search configuration
    search_config = {
        "iterations": search_iterations,
        "questions_per_iteration": questions_per_iteration,
        "search_tool": search_tool,
        "search_strategy": search_strategy,
    }

    # Add model configurations if provided
    if search_model:
        search_config["model_name"] = search_model
    if search_provider:
        search_config["provider"] = search_provider
    if endpoint_url:
        search_config["openai_endpoint_url"] = endpoint_url

    # Check settings for additional configuration
    if env_model := get_setting_from_snapshot("llm.model"):
        search_config["model_name"] = env_model
    if env_provider := get_setting_from_snapshot("llm.provider"):
        search_config["provider"] = env_provider
    if env_url := get_setting_from_snapshot("llm.openai_endpoint.url"):
        search_config["openai_endpoint_url"] = env_url

    # Set up evaluation configuration if needed
    evaluation_config = None
    if evaluation_model or evaluation_provider:
        evaluation_config = {
            "temperature": 0  # Always use zero temperature for evaluation
        }
        if evaluation_model:
            evaluation_config["model_name"] = evaluation_model
        if evaluation_provider:
            evaluation_config["provider"] = evaluation_provider
            # Add endpoint URL if using openai_endpoint
            if evaluation_provider == "openai_endpoint" and endpoint_url:
                evaluation_config["openai_endpoint_url"] = endpoint_url
            elif evaluation_provider == "openai_endpoint" and env_url:
                evaluation_config["openai_endpoint_url"] = env_url

    # Run the benchmark
    results = run_simpleqa_benchmark(
        num_examples=num_examples,
        output_dir=output_dir,
        search_config=search_config,
        evaluation_config=evaluation_config,
        human_evaluation=human_evaluation,
    )

    return results

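# Usage sketch (illustrative, not part of the coverage-measured module):
# a minimal evaluate_simpleqa call on a small sample. The model and provider
# values are placeholders rather than project defaults, and reading "metrics"
# from the result assumes the same result shape that compare_configurations()
# relies on further below.
#
#     from local_deep_research.benchmarks.benchmark_functions import (
#         evaluate_simpleqa,
#     )
#
#     results = evaluate_simpleqa(
#         num_examples=5,
#         search_iterations=1,
#         search_tool="wikipedia",
#         search_model="gpt-4o-mini",  # placeholder model name
#         search_provider="openai",  # placeholder provider
#     )
#     print(results.get("metrics", {}).get("accuracy"))
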

def evaluate_browsecomp(
    num_examples: int = 100,
    search_iterations: int = 3,
    questions_per_iteration: int = 3,
    search_tool: str = "searxng",
    human_evaluation: bool = False,
    evaluation_model: Optional[str] = None,
    evaluation_provider: Optional[str] = None,
    output_dir: str = "benchmark_results",
    search_model: Optional[str] = None,
    search_provider: Optional[str] = None,
    endpoint_url: Optional[str] = None,
    search_strategy: str = "source_based",
) -> Dict[str, Any]:
    """
    Run BrowseComp benchmark evaluation.

    Args:
        num_examples: Number of examples to evaluate
        search_iterations: Number of search iterations per query
        questions_per_iteration: Number of questions per iteration
        search_tool: Search engine to use (e.g., 'searxng', 'wikipedia')
        human_evaluation: Whether to use human evaluation
        evaluation_model: Optional custom model for evaluation
        evaluation_provider: Optional custom provider for evaluation
        output_dir: Directory to save results
        search_model: Optional model to use for the search system
        search_provider: Optional provider to use for the search system
        endpoint_url: Optional endpoint URL for OpenRouter or other API services
        search_strategy: Search strategy to use (default: 'source_based')

    Returns:
        Dictionary with benchmark results
    """
    logger.info(f"Starting BrowseComp benchmark with {num_examples} examples")

    # Set up search configuration
    search_config = {
        "iterations": search_iterations,
        "questions_per_iteration": questions_per_iteration,
        "search_tool": search_tool,
        "search_strategy": search_strategy,
    }

    # Add model configurations if provided
    if search_model:
        search_config["model_name"] = search_model
    if search_provider:
        search_config["provider"] = search_provider
    if endpoint_url:
        search_config["openai_endpoint_url"] = endpoint_url

    # Check settings for additional configuration
    if env_model := get_setting_from_snapshot("llm.model"):
        search_config["model_name"] = env_model
    if env_provider := get_setting_from_snapshot("llm.provider"):
        search_config["provider"] = env_provider
    if env_url := get_setting_from_snapshot("llm.openai_endpoint.url"):
        search_config["openai_endpoint_url"] = env_url

    # Set up evaluation configuration if needed
    evaluation_config = None
    if evaluation_model or evaluation_provider:
        evaluation_config = {
            "temperature": 0  # Always use zero temperature for evaluation
        }
        if evaluation_model:
            evaluation_config["model_name"] = evaluation_model
        if evaluation_provider:
            evaluation_config["provider"] = evaluation_provider
            # Add endpoint URL if using openai_endpoint
            if evaluation_provider == "openai_endpoint" and endpoint_url:
                evaluation_config["openai_endpoint_url"] = endpoint_url
            elif evaluation_provider == "openai_endpoint" and env_url:
                evaluation_config["openai_endpoint_url"] = env_url

    # Run the benchmark
    results = run_browsecomp_benchmark(
        num_examples=num_examples,
        output_dir=output_dir,
        search_config=search_config,
        evaluation_config=evaluation_config,
        human_evaluation=human_evaluation,
    )

    return results

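# Usage sketch (illustrative only): evaluate_browsecomp with a dedicated
# grader model routed through an OpenAI-compatible endpoint, exercising the
# openai_endpoint branch above. The URL and model name are placeholders.
#
#     results = evaluate_browsecomp(
#         num_examples=10,
#         evaluation_model="gpt-4o",  # placeholder grader model
#         evaluation_provider="openai_endpoint",
#         endpoint_url="https://openrouter.ai/api/v1",  # placeholder URL
#     )
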

def evaluate_xbench_deepsearch(
    num_examples: int = 100,
    search_iterations: int = 4,
    questions_per_iteration: int = 3,
    search_tool: str = "searxng",
    human_evaluation: bool = False,
    evaluation_model: Optional[str] = None,
    evaluation_provider: Optional[str] = None,
    output_dir: str = "benchmark_results",
    search_model: Optional[str] = None,
    search_provider: Optional[str] = None,
    endpoint_url: Optional[str] = None,
    search_strategy: str = "source_based",
) -> Dict[str, Any]:
    """
    Run xbench-DeepSearch benchmark evaluation.

    Args:
        num_examples: Number of examples to evaluate (default 100 - full dataset)
        search_iterations: Number of search iterations per query
        questions_per_iteration: Number of questions per iteration
        search_tool: Search engine to use
        human_evaluation: Whether to use human evaluation
        evaluation_model: Optional custom model for evaluation
        evaluation_provider: Optional custom provider for evaluation
        output_dir: Directory to save results
        search_model: Optional model to use for the search system
        search_provider: Optional provider to use for the search system
        endpoint_url: Optional endpoint URL for API services
        search_strategy: Search strategy to use

    Returns:
        Dictionary with benchmark results
    """
    logger.info(
        f"Starting xbench-DeepSearch benchmark with {num_examples} examples"
    )

    # Set up search configuration
    search_config = {
        "iterations": search_iterations,
        "questions_per_iteration": questions_per_iteration,
        "search_tool": search_tool,
        "search_strategy": search_strategy,
    }

    # Add model configurations if provided
    if search_model:
        search_config["model_name"] = search_model
    if search_provider:
        search_config["provider"] = search_provider
    if endpoint_url:
        search_config["openai_endpoint_url"] = endpoint_url

    # Check settings for additional configuration
    if env_model := get_setting_from_snapshot("llm.model"):
        search_config["model_name"] = env_model
    if env_provider := get_setting_from_snapshot("llm.provider"):
        search_config["provider"] = env_provider
    if env_url := get_setting_from_snapshot("llm.openai_endpoint.url"):
        search_config["openai_endpoint_url"] = env_url

    # Set up evaluation configuration if needed
    evaluation_config = None
    if evaluation_model or evaluation_provider:
        evaluation_config = {
            "temperature": 0  # Always use zero temperature for evaluation
        }
        if evaluation_model:
            evaluation_config["model_name"] = evaluation_model
        if evaluation_provider:
            evaluation_config["provider"] = evaluation_provider
            # Add endpoint URL if using openai_endpoint
            if evaluation_provider == "openai_endpoint" and endpoint_url:
                evaluation_config["openai_endpoint_url"] = endpoint_url
            elif evaluation_provider == "openai_endpoint" and env_url:
                evaluation_config["openai_endpoint_url"] = env_url

    # Run the benchmark
    results = run_xbench_deepsearch_benchmark(
        num_examples=num_examples,
        output_dir=output_dir,
        search_config=search_config,
        evaluation_config=evaluation_config,
        human_evaluation=human_evaluation,
    )

    return results

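# Usage sketch (illustrative only): an xbench-DeepSearch run that leaves model
# selection to the settings snapshot (llm.model, llm.provider,
# llm.openai_endpoint.url) and uses human grading instead of an LLM judge.
# The output directory is a placeholder.
#
#     results = evaluate_xbench_deepsearch(
#         num_examples=25,
#         human_evaluation=True,
#         output_dir="xbench_results",
#     )
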

def get_available_benchmarks() -> List[Dict[str, Any]]:
    """
    Get information about available benchmarks.

    Returns:
        List of dictionaries with benchmark information
    """
    return [
        {
            "id": "simpleqa",
            "name": "SimpleQA",
            "description": "Benchmark for factual question answering",
            "recommended_examples": 100,
        },
        {
            "id": "browsecomp",
            "name": "BrowseComp",
            "description": "Benchmark for web browsing comprehension",
            "recommended_examples": 100,
        },
        {
            "id": "xbench_deepsearch",
            "name": "xbench-DeepSearch",
            "description": "Deep research and search capability evaluation",
            "recommended_examples": 100,
        },
    ]

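# Usage sketch (illustrative only): listing the registered benchmarks before
# choosing one to run.
#
#     for benchmark in get_available_benchmarks():
#         print(f"{benchmark['id']}: {benchmark['description']}")
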

def compare_configurations(
    dataset_type: str = "simpleqa",
    num_examples: int = 20,
    configurations: Optional[List[Dict[str, Any]]] = None,
    output_dir: str = "benchmark_comparisons",
) -> Dict[str, Any]:
    """
    Compare multiple search configurations on the same benchmark.

    Args:
        dataset_type: Type of dataset to use
        num_examples: Number of examples to evaluate
        configurations: List of search configurations to compare
        output_dir: Directory to save results

    Returns:
        Dictionary with comparison results
    """
    if not configurations:
        # Default configurations to compare
        configurations = [
            {
                "name": "Base Config",
                "search_tool": "searxng",
                "iterations": 1,
                "questions_per_iteration": 3,
            },
            {
                "name": "More Iterations",
                "search_tool": "searxng",
                "iterations": 3,
                "questions_per_iteration": 3,
            },
            {
                "name": "More Questions",
                "search_tool": "searxng",
                "iterations": 1,
                "questions_per_iteration": 5,
            },
        ]

    # Create output directory
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Run benchmarks for each configuration
    results = []
    for config in configurations:
        config_name = config.pop("name", f"Config-{len(results)}")

        logger.info(f"Running benchmark with configuration: {config_name}")

        search_config = {
            "iterations": config.pop("iterations", 1),
            "questions_per_iteration": config.pop("questions_per_iteration", 3),
            "search_tool": config.pop("search_tool", "searxng"),
        }

        # Add any remaining config items
        for key, value in config.items():
            search_config[key] = value

        # Run benchmark with this configuration
        benchmark_result = run_benchmark(
            dataset_type=dataset_type,
            num_examples=num_examples,
            output_dir=str(Path(output_dir) / config_name.replace(" ", "_")),
            search_config=search_config,
            run_evaluation=True,
        )

        # Add configuration name to results
        benchmark_result["configuration_name"] = config_name
        benchmark_result["search_config"] = search_config

        results.append(benchmark_result)

    # Generate comparison report
    import time

    from ..security.file_write_verifier import write_file_verified

    timestamp = time.strftime("%Y%m%d_%H%M%S")
    report_file = str(
        Path(output_dir) / f"comparison_{dataset_type}_{timestamp}.md"
    )

    # Build report content
    content = f"# Configuration Comparison - {dataset_type.capitalize()}\n\n"

    # Write summary table
    content += "## Summary\n\n"
    content += "| Configuration | Accuracy | Avg. Time | Examples |\n"
    content += "|---------------|----------|-----------|----------|\n"

    for result in results:
        accuracy = result.get("metrics", {}).get("accuracy", 0)
        avg_time = result.get("metrics", {}).get("average_processing_time", 0)
        examples = result.get("total_examples", 0)

        content += f"| {result['configuration_name']} | {accuracy:.3f} | {avg_time:.2f}s | {examples} |\n"

    content += "\n## Configuration Details\n\n"

    for result in results:
        content += f"### {result['configuration_name']}\n\n"

        config = result.get("search_config", {})
        content += "```\n"
        for key, value in config.items():
            content += f"{key}: {value}\n"
        content += "```\n\n"

    write_file_verified(
        report_file,
        content,
        "benchmark.allow_file_output",
        context="benchmark comparison report",
    )

    logger.info(f"Comparison report saved to {report_file}")

    return {
        "status": "complete",
        "dataset_type": dataset_type,
        "configurations_tested": len(configurations),
        "report_path": report_file,
        "results": results,
    }

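# Usage sketch (illustrative only): comparing two custom configurations on a
# small SimpleQA sample. The keys mirror the defaults above ("name",
# "search_tool", "iterations", "questions_per_iteration"); any extra keys are
# passed through into the search configuration.
#
#     comparison = compare_configurations(
#         dataset_type="simpleqa",
#         num_examples=5,
#         configurations=[
#             {"name": "Quick", "search_tool": "wikipedia", "iterations": 1},
#             {"name": "Thorough", "search_tool": "searxng", "iterations": 3},
#         ],
#     )
#     print(comparison["report_path"])
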

# Export the API functions
__all__ = [
    "calculate_metrics",
    "compare_configurations",
    "evaluate_browsecomp",
    "evaluate_simpleqa",
    "evaluate_xbench_deepsearch",
    "generate_report",
    "get_available_benchmarks",
    "run_benchmark",  # For advanced users
]