Coverage for src/local_deep_research/benchmarks/benchmark_functions.py: 0%
131 statements
1"""
2API functions for benchmarking.
4This module provides functions for running benchmarks programmatically.
5"""
7from pathlib import Path
8from typing import Any, Dict, List, Optional
10from loguru import logger
12from ..config.thread_settings import get_setting_from_snapshot
14from ..benchmarks import (
15 calculate_metrics,
16 generate_report,
17 run_benchmark,
18 run_browsecomp_benchmark,
19 run_simpleqa_benchmark,
20 run_xbench_deepsearch_benchmark,
21)
24def evaluate_simpleqa(
25 num_examples: int = 100,
26 search_iterations: int = 3,
27 questions_per_iteration: int = 3,
28 search_tool: str = "searxng",
29 human_evaluation: bool = False,
30 evaluation_model: Optional[str] = None,
31 evaluation_provider: Optional[str] = None,
32 output_dir: str = "benchmark_results",
33 search_model: Optional[str] = None,
34 search_provider: Optional[str] = None,
35 endpoint_url: Optional[str] = None,
36 search_strategy: str = "source_based",
37) -> Dict[str, Any]:
38 """
39 Run SimpleQA benchmark evaluation.
41 Args:
42 num_examples: Number of examples to evaluate
43 search_iterations: Number of search iterations per query
44 questions_per_iteration: Number of questions per iteration
45 search_tool: Search engine to use (e.g., 'searxng', 'wikipedia')
46 human_evaluation: Whether to use human evaluation
47 evaluation_model: Optional custom model for evaluation
48 evaluation_provider: Optional custom provider for evaluation
49 output_dir: Directory to save results
50 search_model: Optional model to use for the search system
51 search_provider: Optional provider to use for the search system
52 endpoint_url: Optional endpoint URL for OpenRouter or other API services
53 search_strategy: Search strategy to use (default: 'source_based')
55 Returns:
56 Dictionary with benchmark results
57 """
58 logger.info(f"Starting SimpleQA benchmark with {num_examples} examples")
60 # Set up search configuration
61 search_config = {
62 "iterations": search_iterations,
63 "questions_per_iteration": questions_per_iteration,
64 "search_tool": search_tool,
65 "search_strategy": search_strategy,
66 }
68 # Add model configurations if provided
69 if search_model:
70 search_config["model_name"] = search_model
71 if search_provider:
72 search_config["provider"] = search_provider
73 if endpoint_url:
74 search_config["openai_endpoint_url"] = endpoint_url
76 # Check settings for additional configuration
77 if env_model := get_setting_from_snapshot("llm.model"):
78 search_config["model_name"] = env_model
79 if env_provider := get_setting_from_snapshot("llm.provider"):
80 search_config["provider"] = env_provider
81 if env_url := get_setting_from_snapshot("llm.openai_endpoint.url"):
82 search_config["openai_endpoint_url"] = env_url
84 # Set up evaluation configuration if needed
85 evaluation_config = None
86 if evaluation_model or evaluation_provider:
87 evaluation_config = {
88 "temperature": 0 # Always use zero temperature for evaluation
89 }
90 if evaluation_model:
91 evaluation_config["model_name"] = evaluation_model
92 if evaluation_provider:
93 evaluation_config["provider"] = evaluation_provider
94 # Add endpoint URL if using openai_endpoint
95 if evaluation_provider == "openai_endpoint" and endpoint_url:
96 evaluation_config["openai_endpoint_url"] = endpoint_url
97 elif evaluation_provider == "openai_endpoint" and env_url:
98 evaluation_config["openai_endpoint_url"] = env_url
100 # Run the benchmark
101 results = run_simpleqa_benchmark(
102 num_examples=num_examples,
103 output_dir=output_dir,
104 search_config=search_config,
105 evaluation_config=evaluation_config,
106 human_evaluation=human_evaluation,
107 )
109 return results
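

# --- Usage sketch (illustration only; not part of the measured module) ---
# A minimal way to drive evaluate_simpleqa programmatically. The import path is
# assumed from the file location shown in the header above; the evaluation
# model/provider/endpoint values are placeholders, not project defaults.
from local_deep_research.benchmarks.benchmark_functions import (  # assumed path
    evaluate_simpleqa,
)

results = evaluate_simpleqa(
    num_examples=10,  # small run for a quick smoke test
    search_tool="searxng",
    evaluation_model="gpt-4o",  # hypothetical grader model
    evaluation_provider="openai_endpoint",
    endpoint_url="https://openrouter.ai/api/v1",  # example OpenRouter-style endpoint
)
print(results.get("metrics", {}))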


def evaluate_browsecomp(
    num_examples: int = 100,
    search_iterations: int = 3,
    questions_per_iteration: int = 3,
    search_tool: str = "searxng",
    human_evaluation: bool = False,
    evaluation_model: Optional[str] = None,
    evaluation_provider: Optional[str] = None,
    output_dir: str = "benchmark_results",
    search_model: Optional[str] = None,
    search_provider: Optional[str] = None,
    endpoint_url: Optional[str] = None,
    search_strategy: str = "source_based",
) -> Dict[str, Any]:
    """
    Run BrowseComp benchmark evaluation.

    Args:
        num_examples: Number of examples to evaluate
        search_iterations: Number of search iterations per query
        questions_per_iteration: Number of questions per iteration
        search_tool: Search engine to use (e.g., 'searxng', 'wikipedia')
        human_evaluation: Whether to use human evaluation
        evaluation_model: Optional custom model for evaluation
        evaluation_provider: Optional custom provider for evaluation
        output_dir: Directory to save results
        search_model: Optional model to use for the search system
        search_provider: Optional provider to use for the search system
        endpoint_url: Optional endpoint URL for OpenRouter or other API services
        search_strategy: Search strategy to use (default: 'source_based')

    Returns:
        Dictionary with benchmark results
    """
    logger.info(f"Starting BrowseComp benchmark with {num_examples} examples")

    # Set up search configuration
    search_config = {
        "iterations": search_iterations,
        "questions_per_iteration": questions_per_iteration,
        "search_tool": search_tool,
        "search_strategy": search_strategy,
    }

    # Add model configurations if provided
    if search_model:
        search_config["model_name"] = search_model
    if search_provider:
        search_config["provider"] = search_provider
    if endpoint_url:
        search_config["openai_endpoint_url"] = endpoint_url

    # Check settings for additional configuration
    if env_model := get_setting_from_snapshot("llm.model"):
        search_config["model_name"] = env_model
    if env_provider := get_setting_from_snapshot("llm.provider"):
        search_config["provider"] = env_provider
    if env_url := get_setting_from_snapshot("llm.openai_endpoint.url"):
        search_config["openai_endpoint_url"] = env_url

    # Set up evaluation configuration if needed
    evaluation_config = None
    if evaluation_model or evaluation_provider:
        evaluation_config = {
            "temperature": 0  # Always use zero temperature for evaluation
        }
        if evaluation_model:
            evaluation_config["model_name"] = evaluation_model
        if evaluation_provider:
            evaluation_config["provider"] = evaluation_provider
        # Add endpoint URL if using openai_endpoint
        if evaluation_provider == "openai_endpoint" and endpoint_url:
            evaluation_config["openai_endpoint_url"] = endpoint_url
        elif evaluation_provider == "openai_endpoint" and env_url:
            evaluation_config["openai_endpoint_url"] = env_url

    # Run the benchmark
    results = run_browsecomp_benchmark(
        num_examples=num_examples,
        output_dir=output_dir,
        search_config=search_config,
        evaluation_config=evaluation_config,
        human_evaluation=human_evaluation,
    )

    return results


def evaluate_xbench_deepsearch(
    num_examples: int = 100,
    search_iterations: int = 4,
    questions_per_iteration: int = 3,
    search_tool: str = "searxng",
    human_evaluation: bool = False,
    evaluation_model: Optional[str] = None,
    evaluation_provider: Optional[str] = None,
    output_dir: str = "benchmark_results",
    search_model: Optional[str] = None,
    search_provider: Optional[str] = None,
    endpoint_url: Optional[str] = None,
    search_strategy: str = "source_based",
) -> Dict[str, Any]:
    """
    Run xbench-DeepSearch benchmark evaluation.

    Args:
        num_examples: Number of examples to evaluate (default 100 - full dataset)
        search_iterations: Number of search iterations per query
        questions_per_iteration: Number of questions per iteration
        search_tool: Search engine to use
        human_evaluation: Whether to use human evaluation
        evaluation_model: Optional custom model for evaluation
        evaluation_provider: Optional custom provider for evaluation
        output_dir: Directory to save results
        search_model: Optional model to use for the search system
        search_provider: Optional provider to use for the search system
        endpoint_url: Optional endpoint URL for API services
        search_strategy: Search strategy to use

    Returns:
        Dictionary with benchmark results
    """
    logger.info(
        f"Starting xbench-DeepSearch benchmark with {num_examples} examples"
    )

    # Set up search configuration
    search_config = {
        "iterations": search_iterations,
        "questions_per_iteration": questions_per_iteration,
        "search_tool": search_tool,
        "search_strategy": search_strategy,
    }

    # Add model configurations if provided
    if search_model:
        search_config["model_name"] = search_model
    if search_provider:
        search_config["provider"] = search_provider
    if endpoint_url:
        search_config["openai_endpoint_url"] = endpoint_url

    # Check settings for additional configuration
    if env_model := get_setting_from_snapshot("llm.model"):
        search_config["model_name"] = env_model
    if env_provider := get_setting_from_snapshot("llm.provider"):
        search_config["provider"] = env_provider
    if env_url := get_setting_from_snapshot("llm.openai_endpoint.url"):
        search_config["openai_endpoint_url"] = env_url

    # Set up evaluation configuration if needed
    evaluation_config = None
    if evaluation_model or evaluation_provider:
        evaluation_config = {
            "temperature": 0  # Always use zero temperature for evaluation
        }
        if evaluation_model:
            evaluation_config["model_name"] = evaluation_model
        if evaluation_provider:
            evaluation_config["provider"] = evaluation_provider
        # Add endpoint URL if using openai_endpoint
        if evaluation_provider == "openai_endpoint" and endpoint_url:
            evaluation_config["openai_endpoint_url"] = endpoint_url
        elif evaluation_provider == "openai_endpoint" and env_url:
            evaluation_config["openai_endpoint_url"] = env_url

    # Run the benchmark
    results = run_xbench_deepsearch_benchmark(
        num_examples=num_examples,
        output_dir=output_dir,
        search_config=search_config,
        evaluation_config=evaluation_config,
        human_evaluation=human_evaluation,
    )

    return results
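

# --- Usage sketch (illustration only; not part of the measured module) ---
# evaluate_browsecomp and evaluate_xbench_deepsearch take the same keyword
# arguments as evaluate_simpleqa, so both can be exercised from one loop. The
# import path is assumed from the file location in the header above, and the
# argument values are illustrative, not recommended settings.
from local_deep_research.benchmarks.benchmark_functions import (  # assumed path
    evaluate_browsecomp,
    evaluate_xbench_deepsearch,
)

for benchmark_fn in (evaluate_browsecomp, evaluate_xbench_deepsearch):
    summary = benchmark_fn(
        num_examples=5,  # small illustrative run
        search_iterations=2,
        output_dir="benchmark_results",
    )
    print(benchmark_fn.__name__, summary.get("metrics", {}))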


def get_available_benchmarks() -> List[Dict[str, Any]]:
    """
    Get information about available benchmarks.

    Returns:
        List of dictionaries with benchmark information
    """
    return [
        {
            "id": "simpleqa",
            "name": "SimpleQA",
            "description": "Benchmark for factual question answering",
            "recommended_examples": 100,
        },
        {
            "id": "browsecomp",
            "name": "BrowseComp",
            "description": "Benchmark for web browsing comprehension",
            "recommended_examples": 100,
        },
        {
            "id": "xbench_deepsearch",
            "name": "xbench-DeepSearch",
            "description": "Deep research and search capability evaluation",
            "recommended_examples": 100,
        },
    ]
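

# --- Usage sketch (illustration only; not part of the measured module) ---
# get_available_benchmarks returns plain dictionaries, so callers can build a
# simple listing or CLI menu from it without touching the benchmark runners.
# Import path assumed from the file location in the header above.
from local_deep_research.benchmarks.benchmark_functions import (  # assumed path
    get_available_benchmarks,
)

for bench in get_available_benchmarks():
    print(
        f"{bench['id']:<20} {bench['name']:<20} "
        f"(recommended examples: {bench['recommended_examples']})"
    )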


def compare_configurations(
    dataset_type: str = "simpleqa",
    num_examples: int = 20,
    configurations: Optional[List[Dict[str, Any]]] = None,
    output_dir: str = "benchmark_comparisons",
) -> Dict[str, Any]:
    """
    Compare multiple search configurations on the same benchmark.

    Args:
        dataset_type: Type of dataset to use
        num_examples: Number of examples to evaluate
        configurations: List of search configurations to compare
        output_dir: Directory to save results

    Returns:
        Dictionary with comparison results
    """
    if not configurations:
        # Default configurations to compare
        configurations = [
            {
                "name": "Base Config",
                "search_tool": "searxng",
                "iterations": 1,
                "questions_per_iteration": 3,
            },
            {
                "name": "More Iterations",
                "search_tool": "searxng",
                "iterations": 3,
                "questions_per_iteration": 3,
            },
            {
                "name": "More Questions",
                "search_tool": "searxng",
                "iterations": 1,
                "questions_per_iteration": 5,
            },
        ]

    # Create output directory
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Run benchmarks for each configuration
    results = []
    for config in configurations:
        config_name = config.pop("name", f"Config-{len(results)}")

        logger.info(f"Running benchmark with configuration: {config_name}")

        search_config = {
            "iterations": config.pop("iterations", 1),
            "questions_per_iteration": config.pop("questions_per_iteration", 3),
            "search_tool": config.pop("search_tool", "searxng"),
        }

        # Add any remaining config items
        for key, value in config.items():
            search_config[key] = value

        # Run benchmark with this configuration
        benchmark_result = run_benchmark(
            dataset_type=dataset_type,
            num_examples=num_examples,
            output_dir=str(Path(output_dir) / config_name.replace(" ", "_")),
            search_config=search_config,
            run_evaluation=True,
        )

        # Add configuration name to results
        benchmark_result["configuration_name"] = config_name
        benchmark_result["search_config"] = search_config

        results.append(benchmark_result)

    # Generate comparison report
    import time

    from ..security.file_write_verifier import write_file_verified

    timestamp = time.strftime("%Y%m%d_%H%M%S")
    report_file = str(
        Path(output_dir) / f"comparison_{dataset_type}_{timestamp}.md"
    )

    # Build report content
    content = f"# Configuration Comparison - {dataset_type.capitalize()}\n\n"

    # Write summary table
    content += "## Summary\n\n"
    content += "| Configuration | Accuracy | Avg. Time | Examples |\n"
    content += "|---------------|----------|-----------|----------|\n"

    for result in results:
        accuracy = result.get("metrics", {}).get("accuracy", 0)
        avg_time = result.get("metrics", {}).get("average_processing_time", 0)
        examples = result.get("total_examples", 0)

        content += f"| {result['configuration_name']} | {accuracy:.3f} | {avg_time:.2f}s | {examples} |\n"

    content += "\n## Configuration Details\n\n"

    for result in results:
        content += f"### {result['configuration_name']}\n\n"

        config = result.get("search_config", {})
        content += "```\n"
        for key, value in config.items():
            content += f"{key}: {value}\n"
        content += "```\n\n"

    write_file_verified(
        report_file,
        content,
        "benchmark.allow_file_output",
        context="benchmark comparison report",
    )

    logger.info(f"Comparison report saved to {report_file}")

    return {
        "status": "complete",
        "dataset_type": dataset_type,
        "configurations_tested": len(configurations),
        "report_path": report_file,
        "results": results,
    }
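

# --- Usage sketch (illustration only; not part of the measured module) ---
# compare_configurations pops "name", "iterations", "questions_per_iteration",
# and "search_tool" from each configuration dict and forwards anything left
# over (e.g. "search_strategy") into search_config. The configurations below
# are illustrative, not recommended settings; import path assumed as above.
from local_deep_research.benchmarks.benchmark_functions import (  # assumed path
    compare_configurations,
)

comparison = compare_configurations(
    dataset_type="simpleqa",
    num_examples=10,
    configurations=[
        {"name": "Wikipedia Only", "search_tool": "wikipedia", "iterations": 1},
        {
            "name": "SearXNG Focused",
            "search_tool": "searxng",
            "iterations": 2,
            "search_strategy": "source_based",  # forwarded into search_config
        },
    ],
)
print(comparison["report_path"])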


# Export the API functions
__all__ = [
    "calculate_metrics",
    "compare_configurations",
    "evaluate_browsecomp",
    "evaluate_simpleqa",
    "generate_report",
    "get_available_benchmarks",
    "run_benchmark",  # For advanced users
]