Coverage for src/local_deep_research/api/benchmark_functions.py: 0% (83 statements)
coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

1"""
2API functions for benchmarking.
4This module provides functions for running benchmarks programmatically.
5"""
7from loguru import logger
8from pathlib import Path
9from typing import Any, Dict, List, Optional
11from ..benchmarks import (
12 calculate_metrics,
13 generate_report,
14 run_benchmark,
15 run_browsecomp_benchmark,
16 run_simpleqa_benchmark,
17 run_xbench_deepsearch_benchmark,
18)
21def evaluate_simpleqa(
22 num_examples: int = 100,
23 search_iterations: int = 3,
24 questions_per_iteration: int = 3,
25 search_tool: str = "searxng",
26 human_evaluation: bool = False,
27 evaluation_model: Optional[str] = None,
28 evaluation_provider: Optional[str] = None,
29 output_dir: str = "benchmark_results",
30) -> Dict[str, Any]:
31 """
32 Run SimpleQA benchmark evaluation.
34 Args:
35 num_examples: Number of examples to evaluate
36 search_iterations: Number of search iterations per query
37 questions_per_iteration: Number of questions per iteration
38 search_tool: Search engine to use (e.g., 'searxng', 'wikipedia')
39 human_evaluation: Whether to use human evaluation
40 evaluation_model: Optional custom model for evaluation
41 evaluation_provider: Optional custom provider for evaluation
42 output_dir: Directory to save results
44 Returns:
45 Dictionary with benchmark results
46 """
47 logger.info(f"Starting SimpleQA benchmark with {num_examples} examples")
49 # Set up search configuration
50 search_config = {
51 "iterations": search_iterations,
52 "questions_per_iteration": questions_per_iteration,
53 "search_tool": search_tool,
54 }
56 # Set up evaluation configuration if needed
57 evaluation_config = None
58 if evaluation_model or evaluation_provider:
59 evaluation_config = {}
60 if evaluation_model:
61 evaluation_config["model_name"] = evaluation_model
62 if evaluation_provider:
63 evaluation_config["provider"] = evaluation_provider
65 # Run the benchmark
66 results = run_simpleqa_benchmark(
67 num_examples=num_examples,
68 output_dir=output_dir,
69 search_config=search_config,
70 evaluation_config=evaluation_config,
71 human_evaluation=human_evaluation,
72 )
74 return results
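

# Usage sketch (kept as a comment so importing this module has no side
# effects). The model/provider values are illustrative, and the "metrics" /
# "accuracy" result keys are an assumption based on how compare_configurations()
# below reads run_benchmark() results; the actual schema comes from
# run_simpleqa_benchmark().
#
#     from local_deep_research.api.benchmark_functions import evaluate_simpleqa
#
#     results = evaluate_simpleqa(
#         num_examples=10,
#         search_tool="wikipedia",
#         evaluation_model="gpt-4o",      # illustrative model name
#         evaluation_provider="openai",   # illustrative provider name
#     )
#     print(results.get("metrics", {}).get("accuracy"))
#
# evaluate_browsecomp() and evaluate_xbench_deepsearch() below follow the same
# calling convention.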


def evaluate_browsecomp(
    num_examples: int = 100,
    search_iterations: int = 3,
    questions_per_iteration: int = 3,
    search_tool: str = "searxng",
    human_evaluation: bool = False,
    evaluation_model: Optional[str] = None,
    evaluation_provider: Optional[str] = None,
    output_dir: str = "benchmark_results",
) -> Dict[str, Any]:
    """
    Run BrowseComp benchmark evaluation.

    Args:
        num_examples: Number of examples to evaluate
        search_iterations: Number of search iterations per query
        questions_per_iteration: Number of questions per iteration
        search_tool: Search engine to use (e.g., 'searxng', 'wikipedia')
        human_evaluation: Whether to use human evaluation
        evaluation_model: Optional custom model for evaluation
        evaluation_provider: Optional custom provider for evaluation
        output_dir: Directory to save results

    Returns:
        Dictionary with benchmark results
    """
    logger.info(f"Starting BrowseComp benchmark with {num_examples} examples")

    # Set up search configuration
    search_config = {
        "iterations": search_iterations,
        "questions_per_iteration": questions_per_iteration,
        "search_tool": search_tool,
    }

    # Set up evaluation configuration if needed
    evaluation_config = None
    if evaluation_model or evaluation_provider:
        evaluation_config = {}
        if evaluation_model:
            evaluation_config["model_name"] = evaluation_model
        if evaluation_provider:
            evaluation_config["provider"] = evaluation_provider

    # Run the benchmark
    results = run_browsecomp_benchmark(
        num_examples=num_examples,
        output_dir=output_dir,
        search_config=search_config,
        evaluation_config=evaluation_config,
        human_evaluation=human_evaluation,
    )

    return results


def evaluate_xbench_deepsearch(
    num_examples: int = 100,
    search_iterations: int = 4,
    questions_per_iteration: int = 3,
    search_tool: str = "searxng",
    human_evaluation: bool = False,
    evaluation_model: Optional[str] = None,
    evaluation_provider: Optional[str] = None,
    output_dir: str = "benchmark_results",
) -> Dict[str, Any]:
    """
    Run xbench-DeepSearch benchmark evaluation.

    Args:
        num_examples: Number of examples to evaluate (default 100 - full dataset)
        search_iterations: Number of search iterations per query
        questions_per_iteration: Number of questions per iteration
        search_tool: Search engine to use (e.g., 'searxng', 'wikipedia')
        human_evaluation: Whether to use human evaluation
        evaluation_model: Optional custom model for evaluation
        evaluation_provider: Optional custom provider for evaluation
        output_dir: Directory to save results

    Returns:
        Dictionary with benchmark results
    """
    logger.info(
        f"Starting xbench-DeepSearch benchmark with {num_examples} examples"
    )

    # Set up search configuration
    search_config = {
        "iterations": search_iterations,
        "questions_per_iteration": questions_per_iteration,
        "search_tool": search_tool,
    }

    # Set up evaluation configuration if needed
    evaluation_config = None
    if evaluation_model or evaluation_provider:
        evaluation_config = {}
        if evaluation_model:
            evaluation_config["model_name"] = evaluation_model
        if evaluation_provider:
            evaluation_config["provider"] = evaluation_provider

    # Run the benchmark
    results = run_xbench_deepsearch_benchmark(
        num_examples=num_examples,
        output_dir=output_dir,
        search_config=search_config,
        evaluation_config=evaluation_config,
        human_evaluation=human_evaluation,
    )

    return results


def get_available_benchmarks() -> List[Dict[str, Any]]:
    """
    Get information about available benchmarks.

    Returns:
        List of dictionaries with benchmark information
    """
    return [
        {
            "id": "simpleqa",
            "name": "SimpleQA",
            "description": "Benchmark for factual question answering",
            "recommended_examples": 100,
        },
        {
            "id": "browsecomp",
            "name": "BrowseComp",
            "description": "Benchmark for web browsing comprehension",
            "recommended_examples": 100,
        },
        {
            "id": "xbench_deepsearch",
            "name": "xbench-DeepSearch",
            "description": "Benchmark for deep search and investigation queries",
            "recommended_examples": 100,
        },
    ]
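

# Usage sketch (comment only): enumerate the registry and dispatch by "id".
# The _RUNNERS mapping is illustrative glue code, not part of this module.
#
#     _RUNNERS = {
#         "simpleqa": evaluate_simpleqa,
#         "browsecomp": evaluate_browsecomp,
#         "xbench_deepsearch": evaluate_xbench_deepsearch,
#     }
#     for bench in get_available_benchmarks():
#         print(f"{bench['name']}: {bench['description']}")
#         _RUNNERS[bench["id"]](num_examples=bench["recommended_examples"])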


def compare_configurations(
    dataset_type: str = "simpleqa",
    num_examples: int = 20,
    configurations: Optional[List[Dict[str, Any]]] = None,
    output_dir: str = "benchmark_comparisons",
) -> Dict[str, Any]:
    """
    Compare multiple search configurations on the same benchmark.

    Args:
        dataset_type: Type of dataset to use
        num_examples: Number of examples to evaluate
        configurations: List of search configurations to compare
            (uses a built-in default set if None)
        output_dir: Directory to save results

    Returns:
        Dictionary with comparison results
    """
    if not configurations:
        # Default configurations to compare
        configurations = [
            {
                "name": "Base Config",
                "search_tool": "searxng",
                "iterations": 1,
                "questions_per_iteration": 3,
            },
            {
                "name": "More Iterations",
                "search_tool": "searxng",
                "iterations": 3,
                "questions_per_iteration": 3,
            },
            {
                "name": "More Questions",
                "search_tool": "searxng",
                "iterations": 1,
                "questions_per_iteration": 5,
            },
        ]

    # Create output directory
    import os

    os.makedirs(output_dir, exist_ok=True)

    # Run benchmarks for each configuration
    results = []
    for config in configurations:
        config_name = config.pop("name", f"Config-{len(results)}")

        logger.info(f"Running benchmark with configuration: {config_name}")

        search_config = {
            "iterations": config.pop("iterations", 1),
            "questions_per_iteration": config.pop("questions_per_iteration", 3),
            "search_tool": config.pop("search_tool", "searxng"),
        }

        # Add any remaining config items
        for key, value in config.items():
            search_config[key] = value

        # Run benchmark with this configuration
        benchmark_result = run_benchmark(
            dataset_type=dataset_type,
            num_examples=num_examples,
            output_dir=str(Path(output_dir) / config_name.replace(" ", "_")),
            search_config=search_config,
            run_evaluation=True,
        )

        # Add configuration name to results
        benchmark_result["configuration_name"] = config_name
        benchmark_result["search_config"] = search_config

        results.append(benchmark_result)

    # Generate comparison report
    import time

    from ..security.file_write_verifier import write_file_verified

    timestamp = time.strftime("%Y%m%d_%H%M%S")
    report_file = str(
        Path(output_dir) / f"comparison_{dataset_type}_{timestamp}.md"
    )

    # Build report content
    content = f"# Configuration Comparison - {dataset_type.capitalize()}\n\n"

    # Write summary table
    content += "## Summary\n\n"
    content += "| Configuration | Accuracy | Avg. Time | Examples |\n"
    content += "|---------------|----------|-----------|----------|\n"

    for result in results:
        accuracy = result.get("metrics", {}).get("accuracy", 0)
        avg_time = result.get("metrics", {}).get("average_processing_time", 0)
        examples = result.get("total_examples", 0)

        content += f"| {result['configuration_name']} | {accuracy:.3f} | {avg_time:.2f}s | {examples} |\n"

    content += "\n## Configuration Details\n\n"

    for result in results:
        content += f"### {result['configuration_name']}\n\n"

        config = result.get("search_config", {})
        content += "```\n"
        for key, value in config.items():
            content += f"{key}: {value}\n"
        content += "```\n\n"

    write_file_verified(
        report_file,
        content,
        "benchmark.allow_file_output",
        context="benchmark comparison report",
    )

    logger.info(f"Comparison report saved to {report_file}")

    return {
        "status": "complete",
        "dataset_type": dataset_type,
        "configurations_tested": len(configurations),
        "report_path": report_file,
        "results": results,
    }
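

# Usage sketch (comment only): comparing a custom set of configurations.
# Note that compare_configurations() pops keys out of the supplied dicts, so
# pass copies if you need to reuse them; any keys beyond name/iterations/
# questions_per_iteration/search_tool are forwarded into search_config.
#
#     comparison = compare_configurations(
#         dataset_type="simpleqa",
#         num_examples=5,
#         configurations=[
#             {"name": "SearXNG x1", "search_tool": "searxng", "iterations": 1},
#             {"name": "Wikipedia x2", "search_tool": "wikipedia", "iterations": 2},
#         ],
#     )
#     print(comparison["report_path"])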


# Export the API functions
__all__ = [
    "calculate_metrics",
    "compare_configurations",
    "evaluate_browsecomp",
    "evaluate_simpleqa",
    "evaluate_xbench_deepsearch",
    "generate_report",
    "get_available_benchmarks",
    "run_benchmark",  # For advanced users
]