Coverage for src / local_deep_research / api / benchmark_functions.py: 100%

79 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1""" 

2API functions for benchmarking. 

3 

4This module provides functions for running benchmarks programmatically. 

5""" 

6 

7from loguru import logger 

8from pathlib import Path 

9from typing import Any 

10 

11from ..benchmarks import ( 

12 calculate_metrics, 

13 generate_report, 

14 run_benchmark, 

15 run_browsecomp_benchmark, 

16 run_simpleqa_benchmark, 

17 run_xbench_deepsearch_benchmark, 

18) 

19 

20 

def evaluate_simpleqa(
    num_examples: int = 100,
    search_iterations: int = 3,
    questions_per_iteration: int = 3,
    search_tool: str = "searxng",
    human_evaluation: bool = False,
    evaluation_model: str | None = None,
    evaluation_provider: str | None = None,
    output_dir: str = "benchmark_results",
) -> dict[str, Any]:
    """
    Run the SimpleQA benchmark and return its results.

    Args:
        num_examples: Number of examples to evaluate
        search_iterations: Number of search iterations per query
        questions_per_iteration: Number of questions per iteration
        search_tool: Search engine to use (e.g., 'searxng', 'wikipedia')
        human_evaluation: Whether to use human evaluation
        evaluation_model: Optional custom model for evaluation
        evaluation_provider: Optional custom provider for evaluation
        output_dir: Directory to save results

    Returns:
        Dictionary with benchmark results
    """
    logger.info(f"Starting SimpleQA benchmark with {num_examples} examples")

    # Search behaviour passed straight through to the benchmark runner.
    search_config = {
        "iterations": search_iterations,
        "questions_per_iteration": questions_per_iteration,
        "search_tool": search_tool,
    }

    # Only build an evaluation config when at least one override was given;
    # an empty dict collapses to None so the runner uses its defaults.
    overrides = {
        "model_name": evaluation_model,
        "provider": evaluation_provider,
    }
    evaluation_config = {k: v for k, v in overrides.items() if v} or None

    return run_simpleqa_benchmark(
        num_examples=num_examples,
        search_config=search_config,
        evaluation_config=evaluation_config,
        human_evaluation=human_evaluation,
        output_dir=output_dir,
    )

73 

74 

def evaluate_browsecomp(
    num_examples: int = 100,
    search_iterations: int = 3,
    questions_per_iteration: int = 3,
    search_tool: str = "searxng",
    human_evaluation: bool = False,
    evaluation_model: str | None = None,
    evaluation_provider: str | None = None,
    output_dir: str = "benchmark_results",
) -> dict[str, Any]:
    """
    Run the BrowseComp benchmark and return its results.

    Args:
        num_examples: Number of examples to evaluate
        search_iterations: Number of search iterations per query
        questions_per_iteration: Number of questions per iteration
        search_tool: Search engine to use (e.g., 'searxng', 'wikipedia')
        human_evaluation: Whether to use human evaluation
        evaluation_model: Optional custom model for evaluation
        evaluation_provider: Optional custom provider for evaluation
        output_dir: Directory to save results

    Returns:
        Dictionary with benchmark results
    """
    logger.info(f"Starting BrowseComp benchmark with {num_examples} examples")

    # Search behaviour forwarded to the benchmark runner.
    search_config = {
        "iterations": search_iterations,
        "questions_per_iteration": questions_per_iteration,
        "search_tool": search_tool,
    }

    # Custom evaluation settings are optional: stay None unless the caller
    # supplied a model and/or provider override.
    evaluation_config = None
    override_pairs = [
        ("model_name", evaluation_model),
        ("provider", evaluation_provider),
    ]
    if any(value for _, value in override_pairs):
        evaluation_config = {key: value for key, value in override_pairs if value}

    return run_browsecomp_benchmark(
        num_examples=num_examples,
        search_config=search_config,
        evaluation_config=evaluation_config,
        human_evaluation=human_evaluation,
        output_dir=output_dir,
    )

127 

128 

def evaluate_xbench_deepsearch(
    num_examples: int = 100,
    search_iterations: int = 4,
    questions_per_iteration: int = 3,
    search_tool: str = "searxng",
    human_evaluation: bool = False,
    evaluation_model: str | None = None,
    evaluation_provider: str | None = None,
    output_dir: str = "benchmark_results",
) -> dict[str, Any]:
    """
    Run the xbench-DeepSearch benchmark and return its results.

    Args:
        num_examples: Number of examples to evaluate (default 100 - full dataset)
        search_iterations: Number of search iterations per query
        questions_per_iteration: Number of questions per iteration
        search_tool: Search engine to use (e.g., 'searxng', 'wikipedia')
        human_evaluation: Whether to use human evaluation
        evaluation_model: Optional custom model for evaluation
        evaluation_provider: Optional custom provider for evaluation
        output_dir: Directory to save results

    Returns:
        Dictionary with benchmark results
    """
    logger.info(
        f"Starting xbench-DeepSearch benchmark with {num_examples} examples"
    )

    # Search behaviour forwarded to the benchmark runner.
    search_config = {
        "iterations": search_iterations,
        "questions_per_iteration": questions_per_iteration,
        "search_tool": search_tool,
    }

    # Assemble evaluation overrides only when something was actually set;
    # otherwise keep None so the runner falls back to its defaults.
    if evaluation_model or evaluation_provider:
        evaluation_config: dict[str, Any] | None = {}
        if evaluation_model:
            evaluation_config["model_name"] = evaluation_model
        if evaluation_provider:
            evaluation_config["provider"] = evaluation_provider
    else:
        evaluation_config = None

    return run_xbench_deepsearch_benchmark(
        num_examples=num_examples,
        search_config=search_config,
        evaluation_config=evaluation_config,
        human_evaluation=human_evaluation,
        output_dir=output_dir,
    )

183 

184 

def get_available_benchmarks() -> list[dict[str, Any]]:
    """
    Get information about available benchmarks.

    Returns:
        List of dictionaries with benchmark information. Each entry has
        string "id", "name" and "description" fields plus an integer
        "recommended_examples" field (hence the dict[str, Any] value type).
    """
    return [
        {
            "id": "simpleqa",
            "name": "SimpleQA",
            "description": "Benchmark for factual question answering",
            "recommended_examples": 100,
        },
        {
            "id": "browsecomp",
            "name": "BrowseComp",
            "description": "Benchmark for web browsing comprehension",
            "recommended_examples": 100,
        },
        {
            "id": "xbench_deepsearch",
            "name": "xbench-DeepSearch",
            "description": "Benchmark for deep search and investigation queries",
            "recommended_examples": 100,
        },
    ]

212 

213 

def compare_configurations(
    dataset_type: str = "simpleqa",
    num_examples: int = 20,
    configurations: list[dict[str, Any]] | None = None,
    output_dir: str = "benchmark_comparisons",
) -> dict[str, Any]:
    """
    Compare multiple search configurations on the same benchmark.

    Args:
        dataset_type: Type of dataset to use
        num_examples: Number of examples to evaluate
        configurations: List of search configurations to compare. Each dict
            may contain "name", "search_tool", "iterations" and
            "questions_per_iteration" plus arbitrary extra search settings;
            input dicts are not modified. When None or empty, a default
            trio of configurations is compared.
        output_dir: Directory to save results

    Returns:
        Dictionary with comparison results: status, dataset type, number of
        configurations tested, path to the markdown report, and the
        per-configuration benchmark results.
    """
    if not configurations:
        # Default configurations to compare
        configurations = [
            {
                "name": "Base Config",
                "search_tool": "searxng",
                "iterations": 1,
                "questions_per_iteration": 3,
            },
            {
                "name": "More Iterations",
                "search_tool": "searxng",
                "iterations": 3,
                "questions_per_iteration": 3,
            },
            {
                "name": "More Questions",
                "search_tool": "searxng",
                "iterations": 1,
                "questions_per_iteration": 5,
            },
        ]

    # Create output directory (pathlib instead of an ad-hoc os import).
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Run benchmarks for each configuration
    results = []
    for supplied_config in configurations:
        # Work on a copy: pop() would otherwise mutate the caller's dicts.
        config = dict(supplied_config)
        config_name = config.pop("name", f"Config-{len(results)}")

        logger.info(f"Running benchmark with configuration: {config_name}")

        search_config = {
            "iterations": config.pop("iterations", 1),
            "questions_per_iteration": config.pop("questions_per_iteration", 3),
            "search_tool": config.pop("search_tool", "searxng"),
        }

        # Add any remaining config items
        search_config.update(config)

        # Run benchmark with this configuration
        benchmark_result = run_benchmark(
            dataset_type=dataset_type,
            num_examples=num_examples,
            output_dir=str(Path(output_dir) / config_name.replace(" ", "_")),
            search_config=search_config,
            run_evaluation=True,
        )

        # Add configuration name to results
        benchmark_result["configuration_name"] = config_name
        benchmark_result["search_config"] = search_config

        results.append(benchmark_result)

    # Generate comparison report
    import time
    from ..security.file_write_verifier import write_file_verified

    timestamp = time.strftime("%Y%m%d_%H%M%S")
    report_file = str(
        Path(output_dir) / f"comparison_{dataset_type}_{timestamp}.md"
    )

    # Build report content
    content = f"# Configuration Comparison - {dataset_type.capitalize()}\n\n"

    # Write summary table
    content += "## Summary\n\n"
    content += "| Configuration | Accuracy | Avg. Time | Examples |\n"
    content += "|---------------|----------|-----------|----------|\n"

    for result in results:
        accuracy = result.get("metrics", {}).get("accuracy", 0)
        avg_time = result.get("metrics", {}).get("average_processing_time", 0)
        examples = result.get("total_examples", 0)

        content += f"| {result['configuration_name']} | {accuracy:.3f} | {avg_time:.2f}s | {examples} |\n"

    content += "\n## Configuration Details\n\n"

    for result in results:
        content += f"### {result['configuration_name']}\n\n"

        config = result.get("search_config", {})
        content += "```\n"
        for key, value in config.items():
            content += f"{key}: {value}\n"
        content += "```\n\n"

    # Writes go through the project's verified-write gate rather than open().
    write_file_verified(
        report_file,
        content,
        "benchmark.allow_file_output",
        context="benchmark comparison report",
    )

    logger.info(f"Comparison report saved to {report_file}")

    return {
        "status": "complete",
        "dataset_type": dataset_type,
        "configurations_tested": len(configurations),
        "report_path": report_file,
        "results": results,
    }

342 

343 

# Export the API functions.
# Note: calculate_metrics, generate_report and run_benchmark are re-exported
# from the ..benchmarks package for advanced/programmatic use.
__all__ = [
    "calculate_metrics",
    "compare_configurations",
    "evaluate_browsecomp",
    "evaluate_simpleqa",
    "evaluate_xbench_deepsearch",
    "generate_report",
    "get_available_benchmarks",
    "run_benchmark",  # For advanced users
]