Coverage for src / local_deep_research / api / benchmark_functions.py: 100%
79 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
"""
API functions for benchmarking.

This module provides functions for running benchmarks programmatically.
"""
7from loguru import logger
8from pathlib import Path
9from typing import Any
11from ..benchmarks import (
12 calculate_metrics,
13 generate_report,
14 run_benchmark,
15 run_browsecomp_benchmark,
16 run_simpleqa_benchmark,
17 run_xbench_deepsearch_benchmark,
18)
def evaluate_simpleqa(
    num_examples: int = 100,
    search_iterations: int = 3,
    questions_per_iteration: int = 3,
    search_tool: str = "searxng",
    human_evaluation: bool = False,
    evaluation_model: str | None = None,
    evaluation_provider: str | None = None,
    output_dir: str = "benchmark_results",
) -> dict[str, Any]:
    """
    Run SimpleQA benchmark evaluation.

    Args:
        num_examples: Number of examples to evaluate
        search_iterations: Number of search iterations per query
        questions_per_iteration: Number of questions per iteration
        search_tool: Search engine to use (e.g., 'searxng', 'wikipedia')
        human_evaluation: Whether to use human evaluation
        evaluation_model: Optional custom model for evaluation
        evaluation_provider: Optional custom provider for evaluation
        output_dir: Directory to save results

    Returns:
        Dictionary with benchmark results
    """
    logger.info(f"Starting SimpleQA benchmark with {num_examples} examples")

    # How the underlying research pipeline should search.
    search_config = {
        "iterations": search_iterations,
        "questions_per_iteration": questions_per_iteration,
        "search_tool": search_tool,
    }

    # Keep only the evaluation overrides the caller actually supplied;
    # when none were given, pass None so the benchmark uses its defaults.
    candidate_overrides = {
        "model_name": evaluation_model,
        "provider": evaluation_provider,
    }
    evaluation_config = {
        key: value for key, value in candidate_overrides.items() if value
    } or None

    return run_simpleqa_benchmark(
        num_examples=num_examples,
        output_dir=output_dir,
        search_config=search_config,
        evaluation_config=evaluation_config,
        human_evaluation=human_evaluation,
    )
def evaluate_browsecomp(
    num_examples: int = 100,
    search_iterations: int = 3,
    questions_per_iteration: int = 3,
    search_tool: str = "searxng",
    human_evaluation: bool = False,
    evaluation_model: str | None = None,
    evaluation_provider: str | None = None,
    output_dir: str = "benchmark_results",
) -> dict[str, Any]:
    """
    Run BrowseComp benchmark evaluation.

    Args:
        num_examples: Number of examples to evaluate
        search_iterations: Number of search iterations per query
        questions_per_iteration: Number of questions per iteration
        search_tool: Search engine to use (e.g., 'searxng', 'wikipedia')
        human_evaluation: Whether to use human evaluation
        evaluation_model: Optional custom model for evaluation
        evaluation_provider: Optional custom provider for evaluation
        output_dir: Directory to save results

    Returns:
        Dictionary with benchmark results
    """
    logger.info(f"Starting BrowseComp benchmark with {num_examples} examples")

    # Search behaviour forwarded to the benchmark runner.
    search_config = dict(
        iterations=search_iterations,
        questions_per_iteration=questions_per_iteration,
        search_tool=search_tool,
    )

    # Build the evaluation override dict only from values that were given;
    # an empty selection collapses to None (use benchmark defaults).
    evaluation_config = {
        name: supplied
        for name, supplied in (
            ("model_name", evaluation_model),
            ("provider", evaluation_provider),
        )
        if supplied
    } or None

    return run_browsecomp_benchmark(
        num_examples=num_examples,
        output_dir=output_dir,
        search_config=search_config,
        evaluation_config=evaluation_config,
        human_evaluation=human_evaluation,
    )
def evaluate_xbench_deepsearch(
    num_examples: int = 100,
    search_iterations: int = 4,
    questions_per_iteration: int = 3,
    search_tool: str = "searxng",
    human_evaluation: bool = False,
    evaluation_model: str | None = None,
    evaluation_provider: str | None = None,
    output_dir: str = "benchmark_results",
) -> dict[str, Any]:
    """
    Run xbench-DeepSearch benchmark evaluation.

    Args:
        num_examples: Number of examples to evaluate (default 100 - full dataset)
        search_iterations: Number of search iterations per query
        questions_per_iteration: Number of questions per iteration
        search_tool: Search engine to use (e.g., 'searxng', 'wikipedia')
        human_evaluation: Whether to use human evaluation
        evaluation_model: Optional custom model for evaluation
        evaluation_provider: Optional custom provider for evaluation
        output_dir: Directory to save results

    Returns:
        Dictionary with benchmark results
    """
    logger.info(
        f"Starting xbench-DeepSearch benchmark with {num_examples} examples"
    )

    # Search settings handed to the benchmark runner.
    search_config = {
        "iterations": search_iterations,
        "questions_per_iteration": questions_per_iteration,
        "search_tool": search_tool,
    }

    # Assemble evaluation overrides; None means "use benchmark defaults".
    evaluation_config = None
    if evaluation_model or evaluation_provider:
        evaluation_config = {
            field: chosen
            for field, chosen in (
                ("model_name", evaluation_model),
                ("provider", evaluation_provider),
            )
            if chosen
        }

    return run_xbench_deepsearch_benchmark(
        num_examples=num_examples,
        output_dir=output_dir,
        search_config=search_config,
        evaluation_config=evaluation_config,
        human_evaluation=human_evaluation,
    )
def get_available_benchmarks() -> list[dict[str, Any]]:
    """
    Get information about available benchmarks.

    Returns:
        List of dictionaries, one per benchmark, each with string fields
        "id", "name" and "description", plus an int "recommended_examples".
    """
    # NOTE: the return annotation was previously list[dict[str, str]], which
    # was wrong — "recommended_examples" is an int, not a str.
    return [
        {
            "id": "simpleqa",
            "name": "SimpleQA",
            "description": "Benchmark for factual question answering",
            "recommended_examples": 100,
        },
        {
            "id": "browsecomp",
            "name": "BrowseComp",
            "description": "Benchmark for web browsing comprehension",
            "recommended_examples": 100,
        },
        {
            "id": "xbench_deepsearch",
            "name": "xbench-DeepSearch",
            "description": "Benchmark for deep search and investigation queries",
            "recommended_examples": 100,
        },
    ]
def compare_configurations(
    dataset_type: str = "simpleqa",
    num_examples: int = 20,
    configurations: list[dict[str, Any]] | None = None,
    output_dir: str = "benchmark_comparisons",
) -> dict[str, Any]:
    """
    Compare multiple search configurations on the same benchmark.

    Args:
        dataset_type: Type of dataset to use
        num_examples: Number of examples to evaluate
        configurations: List of search configurations to compare.  Each dict
            may contain "name", "iterations", "questions_per_iteration" and
            "search_tool"; any extra keys are passed through to the search
            config.  When None or empty, a built-in trio of configurations
            is compared.  Caller-supplied dicts are not modified.
        output_dir: Directory to save results

    Returns:
        Dictionary with comparison results: "status", "dataset_type",
        "configurations_tested", "report_path" and the per-config "results".
    """
    if not configurations:
        # Default configurations to compare
        configurations = [
            {
                "name": "Base Config",
                "search_tool": "searxng",
                "iterations": 1,
                "questions_per_iteration": 3,
            },
            {
                "name": "More Iterations",
                "search_tool": "searxng",
                "iterations": 3,
                "questions_per_iteration": 3,
            },
            {
                "name": "More Questions",
                "search_tool": "searxng",
                "iterations": 1,
                "questions_per_iteration": 5,
            },
        ]

    # Create output directory (pathlib instead of os.makedirs; Path is
    # already imported at module level).
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Run benchmarks for each configuration
    results = []
    for supplied in configurations:
        # Work on a shallow copy: the original code pop()ed keys out of the
        # caller's dicts, destructively emptying them as a side effect.
        config = dict(supplied)
        config_name = config.pop("name", f"Config-{len(results)}")

        logger.info(f"Running benchmark with configuration: {config_name}")

        search_config = {
            "iterations": config.pop("iterations", 1),
            "questions_per_iteration": config.pop("questions_per_iteration", 3),
            "search_tool": config.pop("search_tool", "searxng"),
        }

        # Add any remaining config items
        search_config.update(config)

        # Run benchmark with this configuration
        benchmark_result = run_benchmark(
            dataset_type=dataset_type,
            num_examples=num_examples,
            output_dir=str(Path(output_dir) / config_name.replace(" ", "_")),
            search_config=search_config,
            run_evaluation=True,
        )

        # Add configuration name to results
        benchmark_result["configuration_name"] = config_name
        benchmark_result["search_config"] = search_config

        results.append(benchmark_result)

    # Generate comparison report
    import time

    from ..security.file_write_verifier import write_file_verified

    timestamp = time.strftime("%Y%m%d_%H%M%S")
    report_file = str(
        Path(output_dir) / f"comparison_{dataset_type}_{timestamp}.md"
    )

    # Build report content
    content = f"# Configuration Comparison - {dataset_type.capitalize()}\n\n"

    # Write summary table
    content += "## Summary\n\n"
    content += "| Configuration | Accuracy | Avg. Time | Examples |\n"
    content += "|---------------|----------|-----------|----------|\n"

    for result in results:
        accuracy = result.get("metrics", {}).get("accuracy", 0)
        avg_time = result.get("metrics", {}).get("average_processing_time", 0)
        examples = result.get("total_examples", 0)

        content += f"| {result['configuration_name']} | {accuracy:.3f} | {avg_time:.2f}s | {examples} |\n"

    content += "\n## Configuration Details\n\n"

    for result in results:
        content += f"### {result['configuration_name']}\n\n"

        config = result.get("search_config", {})
        content += "```\n"
        for key, value in config.items():
            content += f"{key}: {value}\n"
        content += "```\n\n"

    write_file_verified(
        report_file,
        content,
        "benchmark.allow_file_output",
        context="benchmark comparison report",
    )

    logger.info(f"Comparison report saved to {report_file}")

    return {
        "status": "complete",
        "dataset_type": dataset_type,
        "configurations_tested": len(configurations),
        "report_path": report_file,
        "results": results,
    }
# Export the API functions.  calculate_metrics, generate_report and
# run_benchmark are re-exported unchanged from the ..benchmarks package;
# the evaluate_* helpers, compare_configurations and
# get_available_benchmarks are defined in this module.
__all__ = [
    "calculate_metrics",
    "compare_configurations",
    "evaluate_browsecomp",
    "evaluate_simpleqa",
    "evaluate_xbench_deepsearch",
    "generate_report",
    "get_available_benchmarks",
    "run_benchmark",  # For advanced users
]