# src/local_deep_research/api/benchmark_functions.py
1"""
2API functions for benchmarking.
4This module provides functions for running benchmarks programmatically.
5"""
7from loguru import logger
8from pathlib import Path
9from typing import Any
11from ..benchmarks import (
12 calculate_metrics,
13 generate_report,
14 run_benchmark,
15 run_browsecomp_benchmark,
16 run_simpleqa_benchmark,
17 run_xbench_deepsearch_benchmark,
18)


def evaluate_simpleqa(
    num_examples: int = 100,
    search_iterations: int = 3,
    questions_per_iteration: int = 3,
    search_tool: str = "searxng",
    human_evaluation: bool = False,
    evaluation_model: str | None = None,
    evaluation_provider: str | None = None,
    output_dir: str = "benchmark_results",
) -> dict[str, Any]:
    """
    Run SimpleQA benchmark evaluation.

    Args:
        num_examples: Number of examples to evaluate
        search_iterations: Number of search iterations per query
        questions_per_iteration: Number of questions per iteration
        search_tool: Search engine to use (e.g., 'searxng', 'wikipedia')
        human_evaluation: Whether to use human evaluation
        evaluation_model: Optional custom model for evaluation
        evaluation_provider: Optional custom provider for evaluation
        output_dir: Directory to save results

    Returns:
        Dictionary with benchmark results
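
    Example:
        An illustrative sketch, not a guaranteed contract: it assumes a
        reachable backend for the chosen search_tool and an LLM configured
        for grading, and the exact result keys depend on the benchmark
        runner.

        >>> results = evaluate_simpleqa(  # doctest: +SKIP
        ...     num_examples=10,
        ...     search_tool="wikipedia",
        ... )
        >>> results.get("metrics", {}).get("accuracy")  # doctest: +SKIP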
46 """
47 logger.info(f"Starting SimpleQA benchmark with {num_examples} examples")
49 # Set up search configuration
50 search_config = {
51 "iterations": search_iterations,
52 "questions_per_iteration": questions_per_iteration,
53 "search_tool": search_tool,
54 }
56 # Set up evaluation configuration if needed
57 evaluation_config = None
58 if evaluation_model or evaluation_provider:
59 evaluation_config = {}
60 if evaluation_model: 60 ↛ 62line 60 didn't jump to line 62 because the condition on line 60 was always true
61 evaluation_config["model_name"] = evaluation_model
62 if evaluation_provider: 62 ↛ 66line 62 didn't jump to line 66 because the condition on line 62 was always true
63 evaluation_config["provider"] = evaluation_provider
65 # Run the benchmark
66 results = run_simpleqa_benchmark(
67 num_examples=num_examples,
68 output_dir=output_dir,
69 search_config=search_config,
70 evaluation_config=evaluation_config,
71 human_evaluation=human_evaluation,
72 )
74 return results


def evaluate_browsecomp(
    num_examples: int = 100,
    search_iterations: int = 3,
    questions_per_iteration: int = 3,
    search_tool: str = "searxng",
    human_evaluation: bool = False,
    evaluation_model: str | None = None,
    evaluation_provider: str | None = None,
    output_dir: str = "benchmark_results",
) -> dict[str, Any]:
    """
    Run BrowseComp benchmark evaluation.

    Args:
        num_examples: Number of examples to evaluate
        search_iterations: Number of search iterations per query
        questions_per_iteration: Number of questions per iteration
        search_tool: Search engine to use (e.g., 'searxng', 'wikipedia')
        human_evaluation: Whether to use human evaluation
        evaluation_model: Optional custom model for evaluation
        evaluation_provider: Optional custom provider for evaluation
        output_dir: Directory to save results

    Returns:
        Dictionary with benchmark results
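
    Example:
        Illustrative sketch; the grading model and provider names below
        are placeholders, not values the library ships with.

        >>> results = evaluate_browsecomp(  # doctest: +SKIP
        ...     num_examples=10,
        ...     evaluation_model="my-grader-model",  # placeholder
        ...     evaluation_provider="my-provider",  # placeholder
        ... )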
102 """
103 logger.info(f"Starting BrowseComp benchmark with {num_examples} examples")
105 # Set up search configuration
106 search_config = {
107 "iterations": search_iterations,
108 "questions_per_iteration": questions_per_iteration,
109 "search_tool": search_tool,
110 }
112 # Set up evaluation configuration if needed
113 evaluation_config = None
114 if evaluation_model or evaluation_provider: 114 ↛ 115line 114 didn't jump to line 115 because the condition on line 114 was never true
115 evaluation_config = {}
116 if evaluation_model:
117 evaluation_config["model_name"] = evaluation_model
118 if evaluation_provider:
119 evaluation_config["provider"] = evaluation_provider
121 # Run the benchmark
122 results = run_browsecomp_benchmark(
123 num_examples=num_examples,
124 output_dir=output_dir,
125 search_config=search_config,
126 evaluation_config=evaluation_config,
127 human_evaluation=human_evaluation,
128 )
130 return results


def evaluate_xbench_deepsearch(
    num_examples: int = 100,
    search_iterations: int = 4,
    questions_per_iteration: int = 3,
    search_tool: str = "searxng",
    human_evaluation: bool = False,
    evaluation_model: str | None = None,
    evaluation_provider: str | None = None,
    output_dir: str = "benchmark_results",
) -> dict[str, Any]:
    """
    Run xbench-DeepSearch benchmark evaluation.

    Args:
        num_examples: Number of examples to evaluate (default 100, the full dataset)
        search_iterations: Number of search iterations per query
        questions_per_iteration: Number of questions per iteration
        search_tool: Search engine to use (e.g., 'searxng', 'wikipedia')
        human_evaluation: Whether to use human evaluation
        evaluation_model: Optional custom model for evaluation
        evaluation_provider: Optional custom provider for evaluation
        output_dir: Directory to save results

    Returns:
        Dictionary with benchmark results
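
    Example:
        Illustrative sketch; this benchmark defaults to 4 search iterations,
        one more than the other benchmarks in this module.

        >>> results = evaluate_xbench_deepsearch(  # doctest: +SKIP
        ...     num_examples=10,
        ... )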
158 """
159 logger.info(
160 f"Starting xbench-DeepSearch benchmark with {num_examples} examples"
161 )
163 # Set up search configuration
164 search_config = {
165 "iterations": search_iterations,
166 "questions_per_iteration": questions_per_iteration,
167 "search_tool": search_tool,
168 }
170 # Set up evaluation configuration if needed
171 evaluation_config = None
172 if evaluation_model or evaluation_provider: 172 ↛ 173line 172 didn't jump to line 173 because the condition on line 172 was never true
173 evaluation_config = {}
174 if evaluation_model:
175 evaluation_config["model_name"] = evaluation_model
176 if evaluation_provider:
177 evaluation_config["provider"] = evaluation_provider
179 # Run the benchmark
180 results = run_xbench_deepsearch_benchmark(
181 num_examples=num_examples,
182 output_dir=output_dir,
183 search_config=search_config,
184 evaluation_config=evaluation_config,
185 human_evaluation=human_evaluation,
186 )
188 return results


def get_available_benchmarks() -> list[dict[str, Any]]:
    """
    Get information about available benchmarks.

    Returns:
        List of dictionaries with benchmark information
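
    Example:
        >>> [b["id"] for b in get_available_benchmarks()]
        ['simpleqa', 'browsecomp', 'xbench_deepsearch']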
197 """
198 return [
199 {
200 "id": "simpleqa",
201 "name": "SimpleQA",
202 "description": "Benchmark for factual question answering",
203 "recommended_examples": 100,
204 },
205 {
206 "id": "browsecomp",
207 "name": "BrowseComp",
208 "description": "Benchmark for web browsing comprehension",
209 "recommended_examples": 100,
210 },
211 {
212 "id": "xbench_deepsearch",
213 "name": "xbench-DeepSearch",
214 "description": "Benchmark for deep search and investigation queries",
215 "recommended_examples": 100,
216 },
217 ]


def compare_configurations(
    dataset_type: str = "simpleqa",
    num_examples: int = 20,
    configurations: list[dict[str, Any]] | None = None,
    output_dir: str = "benchmark_comparisons",
) -> dict[str, Any]:
    """
    Compare multiple search configurations on the same benchmark.

    Args:
        dataset_type: Type of dataset to use
        num_examples: Number of examples to evaluate
        configurations: List of search configurations to compare
        output_dir: Directory to save results

    Returns:
        Dictionary with comparison results
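
    Example:
        Illustrative sketch; omitting configurations runs the three
        built-in defaults, and report_path points at the generated
        Markdown comparison report.

        >>> comparison = compare_configurations(  # doctest: +SKIP
        ...     dataset_type="simpleqa",
        ...     num_examples=10,
        ... )
        >>> comparison["report_path"]  # doctest: +SKIP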
237 """
238 if not configurations:
239 # Default configurations to compare
240 configurations = [
241 {
242 "name": "Base Config",
243 "search_tool": "searxng",
244 "iterations": 1,
245 "questions_per_iteration": 3,
246 },
247 {
248 "name": "More Iterations",
249 "search_tool": "searxng",
250 "iterations": 3,
251 "questions_per_iteration": 3,
252 },
253 {
254 "name": "More Questions",
255 "search_tool": "searxng",
256 "iterations": 1,
257 "questions_per_iteration": 5,
258 },
259 ]
261 # Create output directory
262 import os
264 os.makedirs(output_dir, exist_ok=True)
    # Run benchmarks for each configuration
    results = []
    for config in configurations:
        # Work on a copy so the caller's configuration dicts are not mutated
        config = dict(config)
        config_name = config.pop("name", f"Config-{len(results)}")

        logger.info(f"Running benchmark with configuration: {config_name}")

        search_config = {
            "iterations": config.pop("iterations", 1),
            "questions_per_iteration": config.pop("questions_per_iteration", 3),
            "search_tool": config.pop("search_tool", "searxng"),
        }

        # Add any remaining config items
        for key, value in config.items():
            search_config[key] = value

        # Run benchmark with this configuration
        benchmark_result = run_benchmark(
            dataset_type=dataset_type,
            num_examples=num_examples,
            output_dir=str(Path(output_dir) / config_name.replace(" ", "_")),
            search_config=search_config,
            run_evaluation=True,
        )

        # Add configuration name to results
        benchmark_result["configuration_name"] = config_name
        benchmark_result["search_config"] = search_config

        results.append(benchmark_result)
    # Generate comparison report
    import time

    from ..security.file_write_verifier import write_file_verified

    timestamp = time.strftime("%Y%m%d_%H%M%S")
    report_file = str(
        Path(output_dir) / f"comparison_{dataset_type}_{timestamp}.md"
    )

    # Build report content
    content = f"# Configuration Comparison - {dataset_type.capitalize()}\n\n"

    # Write summary table
    content += "## Summary\n\n"
    content += "| Configuration | Accuracy | Avg. Time | Examples |\n"
    content += "|---------------|----------|-----------|----------|\n"

    for result in results:
        accuracy = result.get("metrics", {}).get("accuracy", 0)
        avg_time = result.get("metrics", {}).get("average_processing_time", 0)
        examples = result.get("total_examples", 0)

        content += f"| {result['configuration_name']} | {accuracy:.3f} | {avg_time:.2f}s | {examples} |\n"

    content += "\n## Configuration Details\n\n"

    for result in results:
        content += f"### {result['configuration_name']}\n\n"

        config = result.get("search_config", {})
        content += "```\n"
        for key, value in config.items():
            content += f"{key}: {value}\n"
        content += "```\n\n"

    write_file_verified(
        report_file,
        content,
        "benchmark.allow_file_output",
        context="benchmark comparison report",
    )

    logger.info(f"Comparison report saved to {report_file}")

    return {
        "status": "complete",
        "dataset_type": dataset_type,
        "configurations_tested": len(configurations),
        "report_path": report_file,
        "results": results,
    }


# Export the API functions
__all__ = [
    "calculate_metrics",
    "compare_configurations",
    "evaluate_browsecomp",
    "evaluate_simpleqa",
    "evaluate_xbench_deepsearch",
    "generate_report",
    "get_available_benchmarks",
    "run_benchmark",  # For advanced users
]