Coverage for src/local_deep_research/api/benchmark_functions.py: 0%

83 statements

coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

1""" 

2API functions for benchmarking. 

3 

4This module provides functions for running benchmarks programmatically. 

5""" 

6 

7from loguru import logger 

8from pathlib import Path 

9from typing import Any, Dict, List, Optional 

10 

11from ..benchmarks import ( 

12 calculate_metrics, 

13 generate_report, 

14 run_benchmark, 

15 run_browsecomp_benchmark, 

16 run_simpleqa_benchmark, 

17 run_xbench_deepsearch_benchmark, 

18) 

19 

20 

21def evaluate_simpleqa( 

22 num_examples: int = 100, 

23 search_iterations: int = 3, 

24 questions_per_iteration: int = 3, 

25 search_tool: str = "searxng", 

26 human_evaluation: bool = False, 

27 evaluation_model: Optional[str] = None, 

28 evaluation_provider: Optional[str] = None, 

29 output_dir: str = "benchmark_results", 

30) -> Dict[str, Any]: 

31 """ 

32 Run SimpleQA benchmark evaluation. 

33 

34 Args: 

35 num_examples: Number of examples to evaluate 

36 search_iterations: Number of search iterations per query 

37 questions_per_iteration: Number of questions per iteration 

38 search_tool: Search engine to use (e.g., 'searxng', 'wikipedia') 

39 human_evaluation: Whether to use human evaluation 

40 evaluation_model: Optional custom model for evaluation 

41 evaluation_provider: Optional custom provider for evaluation 

42 output_dir: Directory to save results 

43 

44 Returns: 

45 Dictionary with benchmark results 

46 """ 

47 logger.info(f"Starting SimpleQA benchmark with {num_examples} examples") 

48 

49 # Set up search configuration 

50 search_config = { 

51 "iterations": search_iterations, 

52 "questions_per_iteration": questions_per_iteration, 

53 "search_tool": search_tool, 

54 } 

55 

56 # Set up evaluation configuration if needed 

57 evaluation_config = None 

58 if evaluation_model or evaluation_provider: 

59 evaluation_config = {} 

60 if evaluation_model: 

61 evaluation_config["model_name"] = evaluation_model 

62 if evaluation_provider: 

63 evaluation_config["provider"] = evaluation_provider 

64 

65 # Run the benchmark 

66 results = run_simpleqa_benchmark( 

67 num_examples=num_examples, 

68 output_dir=output_dir, 

69 search_config=search_config, 

70 evaluation_config=evaluation_config, 

71 human_evaluation=human_evaluation, 

72 ) 

73 

74 return results 

75 

76 
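

# Usage sketch (illustrative, not part of the original module): the call below
# assumes a reachable SearXNG instance and a configured default LLM; the
# argument values are hypothetical. evaluate_browsecomp and
# evaluate_xbench_deepsearch follow the same calling pattern, and the returned
# dictionary is expected to carry a "metrics" entry, as read later by
# compare_configurations.
#
#   results = evaluate_simpleqa(
#       num_examples=10,
#       search_iterations=2,
#       questions_per_iteration=3,
#       search_tool="searxng",
#   )
#   print(results.get("metrics", {}).get("accuracy"))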


def evaluate_browsecomp(
    num_examples: int = 100,
    search_iterations: int = 3,
    questions_per_iteration: int = 3,
    search_tool: str = "searxng",
    human_evaluation: bool = False,
    evaluation_model: Optional[str] = None,
    evaluation_provider: Optional[str] = None,
    output_dir: str = "benchmark_results",
) -> Dict[str, Any]:
    """
    Run BrowseComp benchmark evaluation.

    Args:
        num_examples: Number of examples to evaluate
        search_iterations: Number of search iterations per query
        questions_per_iteration: Number of questions per iteration
        search_tool: Search engine to use (e.g., 'searxng', 'wikipedia')
        human_evaluation: Whether to use human evaluation
        evaluation_model: Optional custom model for evaluation
        evaluation_provider: Optional custom provider for evaluation
        output_dir: Directory to save results

    Returns:
        Dictionary with benchmark results
    """
    logger.info(f"Starting BrowseComp benchmark with {num_examples} examples")

    # Set up search configuration
    search_config = {
        "iterations": search_iterations,
        "questions_per_iteration": questions_per_iteration,
        "search_tool": search_tool,
    }

    # Set up evaluation configuration if needed
    evaluation_config = None
    if evaluation_model or evaluation_provider:
        evaluation_config = {}
        if evaluation_model:
            evaluation_config["model_name"] = evaluation_model
        if evaluation_provider:
            evaluation_config["provider"] = evaluation_provider

    # Run the benchmark
    results = run_browsecomp_benchmark(
        num_examples=num_examples,
        output_dir=output_dir,
        search_config=search_config,
        evaluation_config=evaluation_config,
        human_evaluation=human_evaluation,
    )

    return results


def evaluate_xbench_deepsearch(
    num_examples: int = 100,
    search_iterations: int = 4,
    questions_per_iteration: int = 3,
    search_tool: str = "searxng",
    human_evaluation: bool = False,
    evaluation_model: Optional[str] = None,
    evaluation_provider: Optional[str] = None,
    output_dir: str = "benchmark_results",
) -> Dict[str, Any]:
    """
    Run xbench-DeepSearch benchmark evaluation.

    Args:
        num_examples: Number of examples to evaluate (default 100 - full dataset)
        search_iterations: Number of search iterations per query
        questions_per_iteration: Number of questions per iteration
        search_tool: Search engine to use (e.g., 'searxng', 'wikipedia')
        human_evaluation: Whether to use human evaluation
        evaluation_model: Optional custom model for evaluation
        evaluation_provider: Optional custom provider for evaluation
        output_dir: Directory to save results

    Returns:
        Dictionary with benchmark results
    """
    logger.info(
        f"Starting xbench-DeepSearch benchmark with {num_examples} examples"
    )

    # Set up search configuration
    search_config = {
        "iterations": search_iterations,
        "questions_per_iteration": questions_per_iteration,
        "search_tool": search_tool,
    }

    # Set up evaluation configuration if needed
    evaluation_config = None
    if evaluation_model or evaluation_provider:
        evaluation_config = {}
        if evaluation_model:
            evaluation_config["model_name"] = evaluation_model
        if evaluation_provider:
            evaluation_config["provider"] = evaluation_provider

    # Run the benchmark
    results = run_xbench_deepsearch_benchmark(
        num_examples=num_examples,
        output_dir=output_dir,
        search_config=search_config,
        evaluation_config=evaluation_config,
        human_evaluation=human_evaluation,
    )

    return results


def get_available_benchmarks() -> List[Dict[str, Any]]:
    """
    Get information about available benchmarks.

    Returns:
        List of dictionaries with benchmark information
    """
    return [
        {
            "id": "simpleqa",
            "name": "SimpleQA",
            "description": "Benchmark for factual question answering",
            "recommended_examples": 100,
        },
        {
            "id": "browsecomp",
            "name": "BrowseComp",
            "description": "Benchmark for web browsing comprehension",
            "recommended_examples": 100,
        },
        {
            "id": "xbench_deepsearch",
            "name": "xbench-DeepSearch",
            "description": "Benchmark for deep search and investigation queries",
            "recommended_examples": 100,
        },
    ]
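

# Usage sketch (illustrative addition): get_available_benchmarks needs no
# search backend, so it can be used to enumerate benchmark ids before picking
# one of the evaluate_* functions above.
#
#   for benchmark in get_available_benchmarks():
#       print(benchmark["id"], benchmark["recommended_examples"])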


def compare_configurations(
    dataset_type: str = "simpleqa",
    num_examples: int = 20,
    configurations: Optional[List[Dict[str, Any]]] = None,
    output_dir: str = "benchmark_comparisons",
) -> Dict[str, Any]:
    """
    Compare multiple search configurations on the same benchmark.

    Args:
        dataset_type: Type of dataset to use
        num_examples: Number of examples to evaluate
        configurations: List of search configurations to compare
        output_dir: Directory to save results

    Returns:
        Dictionary with comparison results
    """
    if not configurations:
        # Default configurations to compare
        configurations = [
            {
                "name": "Base Config",
                "search_tool": "searxng",
                "iterations": 1,
                "questions_per_iteration": 3,
            },
            {
                "name": "More Iterations",
                "search_tool": "searxng",
                "iterations": 3,
                "questions_per_iteration": 3,
            },
            {
                "name": "More Questions",
                "search_tool": "searxng",
                "iterations": 1,
                "questions_per_iteration": 5,
            },
        ]

    # Create output directory
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Run benchmarks for each configuration
    results = []
    for config in configurations:
        # Work on a copy so the caller's configuration dicts are not mutated
        config = dict(config)
        config_name = config.pop("name", f"Config-{len(results)}")

        logger.info(f"Running benchmark with configuration: {config_name}")

        search_config = {
            "iterations": config.pop("iterations", 1),
            "questions_per_iteration": config.pop("questions_per_iteration", 3),
            "search_tool": config.pop("search_tool", "searxng"),
        }

        # Add any remaining config items
        for key, value in config.items():
            search_config[key] = value

        # Run benchmark with this configuration
        benchmark_result = run_benchmark(
            dataset_type=dataset_type,
            num_examples=num_examples,
            output_dir=str(Path(output_dir) / config_name.replace(" ", "_")),
            search_config=search_config,
            run_evaluation=True,
        )

        # Add configuration name to results
        benchmark_result["configuration_name"] = config_name
        benchmark_result["search_config"] = search_config

        results.append(benchmark_result)

    # Generate comparison report
    import time

    from ..security.file_write_verifier import write_file_verified

    timestamp = time.strftime("%Y%m%d_%H%M%S")
    report_file = str(
        Path(output_dir) / f"comparison_{dataset_type}_{timestamp}.md"
    )

    # Build report content
    content = f"# Configuration Comparison - {dataset_type.capitalize()}\n\n"

    # Write summary table
    content += "## Summary\n\n"
    content += "| Configuration | Accuracy | Avg. Time | Examples |\n"
    content += "|---------------|----------|-----------|----------|\n"

    for result in results:
        accuracy = result.get("metrics", {}).get("accuracy", 0)
        avg_time = result.get("metrics", {}).get("average_processing_time", 0)
        examples = result.get("total_examples", 0)

        content += f"| {result['configuration_name']} | {accuracy:.3f} | {avg_time:.2f}s | {examples} |\n"

    content += "\n## Configuration Details\n\n"

    for result in results:
        content += f"### {result['configuration_name']}\n\n"

        config = result.get("search_config", {})
        content += "```\n"
        for key, value in config.items():
            content += f"{key}: {value}\n"
        content += "```\n\n"

    write_file_verified(
        report_file,
        content,
        "benchmark.allow_file_output",
        context="benchmark comparison report",
    )

    logger.info(f"Comparison report saved to {report_file}")

    return {
        "status": "complete",
        "dataset_type": dataset_type,
        "configurations_tested": len(configurations),
        "report_path": report_file,
        "results": results,
    }
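

# Usage sketch (illustrative addition): the configuration dicts below mirror
# the defaults defined in compare_configurations; the names and values are
# hypothetical, and a working search backend is assumed.
#
#   comparison = compare_configurations(
#       dataset_type="simpleqa",
#       num_examples=10,
#       configurations=[
#           {"name": "Quick", "search_tool": "searxng", "iterations": 1},
#           {"name": "Thorough", "search_tool": "searxng", "iterations": 3},
#       ],
#   )
#   print(comparison["report_path"])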


# Export the API functions
__all__ = [
    "calculate_metrics",
    "compare_configurations",
    "evaluate_browsecomp",
    "evaluate_simpleqa",
    "evaluate_xbench_deepsearch",
    "generate_report",
    "get_available_benchmarks",
    "run_benchmark",  # For advanced users
]
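

# Illustrative import sketch (an assumption, not shown in this file): with the
# src/ layout implied by the coverage path, this API would typically be
# imported as:
#
#   from local_deep_research.api.benchmark_functions import (
#       compare_configurations,
#       evaluate_simpleqa,
#       get_available_benchmarks,
#   )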