Coverage for src/local_deep_research/benchmarks/benchmark_functions.py: 95% (131 statements)
1"""
2API functions for benchmarking.
4This module provides functions for running benchmarks programmatically.
5"""
7from pathlib import Path
8from typing import Any, Dict, List, Optional
10from loguru import logger
12from ..config.thread_settings import get_setting_from_snapshot
13from ..llm.providers.base import normalize_provider
15from ..benchmarks import (
16 calculate_metrics,
17 generate_report,
18 run_benchmark,
19 run_browsecomp_benchmark,
20 run_simpleqa_benchmark,
21 run_xbench_deepsearch_benchmark,
22)


def evaluate_simpleqa(
    num_examples: int = 100,
    search_iterations: int = 3,
    questions_per_iteration: int = 3,
    search_tool: str = "searxng",
    human_evaluation: bool = False,
    evaluation_model: Optional[str] = None,
    evaluation_provider: Optional[str] = None,
    output_dir: str = "benchmark_results",
    search_model: Optional[str] = None,
    search_provider: Optional[str] = None,
    endpoint_url: Optional[str] = None,
    search_strategy: str = "source_based",
) -> Dict[str, Any]:
    """
    Run SimpleQA benchmark evaluation.

    Args:
        num_examples: Number of examples to evaluate
        search_iterations: Number of search iterations per query
        questions_per_iteration: Number of questions per iteration
        search_tool: Search engine to use (e.g., 'searxng', 'wikipedia')
        human_evaluation: Whether to use human evaluation
        evaluation_model: Optional custom model for evaluation
        evaluation_provider: Optional custom provider for evaluation
        output_dir: Directory to save results
        search_model: Optional model to use for the search system
        search_provider: Optional provider to use for the search system
        endpoint_url: Optional endpoint URL for OpenRouter or other API services
        search_strategy: Search strategy to use (default: 'source_based')

    Returns:
        Dictionary with benchmark results
    """
    logger.info(f"Starting SimpleQA benchmark with {num_examples} examples")

    # Set up the search configuration
    search_config = {
        "iterations": search_iterations,
        "questions_per_iteration": questions_per_iteration,
        "search_tool": search_tool,
        "search_strategy": search_strategy,
    }

    # Add model configuration if provided explicitly
    if search_model:
        search_config["model_name"] = search_model
    if search_provider:
        search_config["provider"] = search_provider
    if endpoint_url:
        search_config["openai_endpoint_url"] = endpoint_url

    # Settings from the snapshot, when present, take precedence over the
    # explicit arguments
    if env_model := get_setting_from_snapshot("llm.model"):
        search_config["model_name"] = env_model
    if env_provider := get_setting_from_snapshot("llm.provider"):
        search_config["provider"] = env_provider
    if env_url := get_setting_from_snapshot("llm.openai_endpoint.url"):
        search_config["openai_endpoint_url"] = env_url

    # Set up the evaluation configuration if needed
    evaluation_config = None
    if evaluation_model or evaluation_provider:
        evaluation_config = {
            "temperature": 0  # Always use zero temperature for evaluation
        }
        if evaluation_model:
            evaluation_config["model_name"] = evaluation_model
        if evaluation_provider:
            evaluation_provider = normalize_provider(evaluation_provider)
            evaluation_config["provider"] = evaluation_provider
            # Add the endpoint URL if using openai_endpoint
            if evaluation_provider == "openai_endpoint" and endpoint_url:
                evaluation_config["openai_endpoint_url"] = endpoint_url
            elif evaluation_provider == "openai_endpoint" and env_url:
                evaluation_config["openai_endpoint_url"] = env_url

    # Run the benchmark
    return run_simpleqa_benchmark(
        num_examples=num_examples,
        output_dir=output_dir,
        search_config=search_config,
        evaluation_config=evaluation_config,
        human_evaluation=human_evaluation,
    )
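

# A minimal usage sketch for evaluate_simpleqa. The model and provider names
# below are illustrative assumptions, not project defaults:
#
#     results = evaluate_simpleqa(
#         num_examples=10,
#         search_iterations=2,
#         search_tool="searxng",
#         search_model="llama3",      # hypothetical model name
#         search_provider="ollama",   # hypothetical provider
#     )
#     accuracy = results.get("metrics", {}).get("accuracy")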


def evaluate_browsecomp(
    num_examples: int = 100,
    search_iterations: int = 3,
    questions_per_iteration: int = 3,
    search_tool: str = "searxng",
    human_evaluation: bool = False,
    evaluation_model: Optional[str] = None,
    evaluation_provider: Optional[str] = None,
    output_dir: str = "benchmark_results",
    search_model: Optional[str] = None,
    search_provider: Optional[str] = None,
    endpoint_url: Optional[str] = None,
    search_strategy: str = "source_based",
) -> Dict[str, Any]:
    """
    Run BrowseComp benchmark evaluation.

    Args:
        num_examples: Number of examples to evaluate
        search_iterations: Number of search iterations per query
        questions_per_iteration: Number of questions per iteration
        search_tool: Search engine to use (e.g., 'searxng', 'wikipedia')
        human_evaluation: Whether to use human evaluation
        evaluation_model: Optional custom model for evaluation
        evaluation_provider: Optional custom provider for evaluation
        output_dir: Directory to save results
        search_model: Optional model to use for the search system
        search_provider: Optional provider to use for the search system
        endpoint_url: Optional endpoint URL for OpenRouter or other API services
        search_strategy: Search strategy to use (default: 'source_based')

    Returns:
        Dictionary with benchmark results
    """
    logger.info(f"Starting BrowseComp benchmark with {num_examples} examples")

    # Set up the search configuration
    search_config = {
        "iterations": search_iterations,
        "questions_per_iteration": questions_per_iteration,
        "search_tool": search_tool,
        "search_strategy": search_strategy,
    }

    # Add model configuration if provided explicitly
    if search_model:
        search_config["model_name"] = search_model
    if search_provider:
        search_config["provider"] = search_provider
    if endpoint_url:
        search_config["openai_endpoint_url"] = endpoint_url

    # Settings from the snapshot, when present, take precedence over the
    # explicit arguments
    if env_model := get_setting_from_snapshot("llm.model"):
        search_config["model_name"] = env_model
    if env_provider := get_setting_from_snapshot("llm.provider"):
        search_config["provider"] = env_provider
    if env_url := get_setting_from_snapshot("llm.openai_endpoint.url"):
        search_config["openai_endpoint_url"] = env_url

    # Set up the evaluation configuration if needed
    evaluation_config = None
    if evaluation_model or evaluation_provider:
        evaluation_config = {
            "temperature": 0  # Always use zero temperature for evaluation
        }
        if evaluation_model:
            evaluation_config["model_name"] = evaluation_model
        if evaluation_provider:
            evaluation_provider = normalize_provider(evaluation_provider)
            evaluation_config["provider"] = evaluation_provider
            # Add the endpoint URL if using openai_endpoint
            if evaluation_provider == "openai_endpoint" and endpoint_url:
                evaluation_config["openai_endpoint_url"] = endpoint_url
            elif evaluation_provider == "openai_endpoint" and env_url:
                evaluation_config["openai_endpoint_url"] = env_url

    # Run the benchmark
    return run_browsecomp_benchmark(
        num_examples=num_examples,
        output_dir=output_dir,
        search_config=search_config,
        evaluation_config=evaluation_config,
        human_evaluation=human_evaluation,
    )
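

# A sketch of overriding the grader for evaluate_browsecomp: passing
# evaluation_model/evaluation_provider builds an evaluation_config with
# temperature 0 and a normalized provider name. The names below are
# illustrative assumptions:
#
#     results = evaluate_browsecomp(
#         num_examples=20,
#         evaluation_model="gpt-4o",      # hypothetical grader model
#         evaluation_provider="openai",   # normalized via normalize_provider()
#     )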


def evaluate_xbench_deepsearch(
    num_examples: int = 100,
    search_iterations: int = 4,
    questions_per_iteration: int = 3,
    search_tool: str = "searxng",
    human_evaluation: bool = False,
    evaluation_model: Optional[str] = None,
    evaluation_provider: Optional[str] = None,
    output_dir: str = "benchmark_results",
    search_model: Optional[str] = None,
    search_provider: Optional[str] = None,
    endpoint_url: Optional[str] = None,
    search_strategy: str = "source_based",
) -> Dict[str, Any]:
    """
    Run xbench-DeepSearch benchmark evaluation.

    Args:
        num_examples: Number of examples to evaluate (default 100 - full dataset)
        search_iterations: Number of search iterations per query
        questions_per_iteration: Number of questions per iteration
        search_tool: Search engine to use
        human_evaluation: Whether to use human evaluation
        evaluation_model: Optional custom model for evaluation
        evaluation_provider: Optional custom provider for evaluation
        output_dir: Directory to save results
        search_model: Optional model to use for the search system
        search_provider: Optional provider to use for the search system
        endpoint_url: Optional endpoint URL for API services
        search_strategy: Search strategy to use

    Returns:
        Dictionary with benchmark results
    """
    logger.info(
        f"Starting xbench-DeepSearch benchmark with {num_examples} examples"
    )

    # Set up the search configuration
    search_config = {
        "iterations": search_iterations,
        "questions_per_iteration": questions_per_iteration,
        "search_tool": search_tool,
        "search_strategy": search_strategy,
    }

    # Add model configuration if provided explicitly
    if search_model:
        search_config["model_name"] = search_model
    if search_provider:
        search_config["provider"] = search_provider
    if endpoint_url:
        search_config["openai_endpoint_url"] = endpoint_url

    # Settings from the snapshot, when present, take precedence over the
    # explicit arguments
    if env_model := get_setting_from_snapshot("llm.model"):
        search_config["model_name"] = env_model
    if env_provider := get_setting_from_snapshot("llm.provider"):
        search_config["provider"] = env_provider
    if env_url := get_setting_from_snapshot("llm.openai_endpoint.url"):
        search_config["openai_endpoint_url"] = env_url

    # Set up the evaluation configuration if needed
    evaluation_config = None
    if evaluation_model or evaluation_provider:
        evaluation_config = {
            "temperature": 0  # Always use zero temperature for evaluation
        }
        if evaluation_model:
            evaluation_config["model_name"] = evaluation_model
        if evaluation_provider:
            evaluation_provider = normalize_provider(evaluation_provider)
            evaluation_config["provider"] = evaluation_provider
            # Add the endpoint URL if using openai_endpoint
            if evaluation_provider == "openai_endpoint" and endpoint_url:
                evaluation_config["openai_endpoint_url"] = endpoint_url
            elif evaluation_provider == "openai_endpoint" and env_url:
                evaluation_config["openai_endpoint_url"] = env_url

    # Run the benchmark
    return run_xbench_deepsearch_benchmark(
        num_examples=num_examples,
        output_dir=output_dir,
        search_config=search_config,
        evaluation_config=evaluation_config,
        human_evaluation=human_evaluation,
    )
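

# When the normalized evaluation provider is "openai_endpoint", the endpoint
# URL (explicit argument or snapshot setting) is forwarded to the grader as
# well. A sketch with placeholder values:
#
#     results = evaluate_xbench_deepsearch(
#         num_examples=5,
#         evaluation_model="some-model",                # hypothetical
#         evaluation_provider="openai_endpoint",
#         endpoint_url="https://openrouter.ai/api/v1",  # example endpoint
#     )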


def get_available_benchmarks() -> List[Dict[str, Any]]:
    """
    Get information about available benchmarks.

    Returns:
        List of dictionaries with benchmark information
    """
    return [
        {
            "id": "simpleqa",
            "name": "SimpleQA",
            "description": "Benchmark for factual question answering",
            "recommended_examples": 100,
        },
        {
            "id": "browsecomp",
            "name": "BrowseComp",
            "description": "Benchmark for web browsing comprehension",
            "recommended_examples": 100,
        },
        {
            "id": "xbench_deepsearch",
            "name": "xbench-DeepSearch",
            "description": "Deep research and search capability evaluation",
            "recommended_examples": 100,
        },
    ]
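

# A small dispatch sketch built on get_available_benchmarks(); the
# id-to-function mapping is an assumption for illustration:
#
#     runners = {
#         "simpleqa": evaluate_simpleqa,
#         "browsecomp": evaluate_browsecomp,
#         "xbench_deepsearch": evaluate_xbench_deepsearch,
#     }
#     for info in get_available_benchmarks():
#         print(f"{info['name']}: {info['description']}")
#         results = runners[info["id"]](num_examples=info["recommended_examples"])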


def compare_configurations(
    dataset_type: str = "simpleqa",
    num_examples: int = 20,
    configurations: Optional[List[Dict[str, Any]]] = None,
    output_dir: str = "benchmark_comparisons",
) -> Dict[str, Any]:
    """
    Compare multiple search configurations on the same benchmark.

    Args:
        dataset_type: Type of dataset to use
        num_examples: Number of examples to evaluate
        configurations: List of search configurations to compare
        output_dir: Directory to save results

    Returns:
        Dictionary with comparison results
    """
    if not configurations:
        # Default configurations to compare
        configurations = [
            {
                "name": "Base Config",
                "search_tool": "searxng",
                "iterations": 1,
                "questions_per_iteration": 3,
            },
            {
                "name": "More Iterations",
                "search_tool": "searxng",
                "iterations": 3,
                "questions_per_iteration": 3,
            },
            {
                "name": "More Questions",
                "search_tool": "searxng",
                "iterations": 1,
                "questions_per_iteration": 5,
            },
        ]

    # Create the output directory
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Run the benchmark for each configuration
    results = []
    for config in configurations:
        # Work on a copy so the pop() calls below do not mutate the caller's dicts
        config = dict(config)
        config_name = config.pop("name", f"Config-{len(results)}")

        logger.info(f"Running benchmark with configuration: {config_name}")

        search_config = {
            "iterations": config.pop("iterations", 1),
            "questions_per_iteration": config.pop("questions_per_iteration", 3),
            "search_tool": config.pop("search_tool", "searxng"),
        }

        # Add any remaining config items
        search_config.update(config)

        # Run the benchmark with this configuration
        benchmark_result = run_benchmark(
            dataset_type=dataset_type,
            num_examples=num_examples,
            output_dir=str(Path(output_dir) / config_name.replace(" ", "_")),
            search_config=search_config,
            run_evaluation=True,
        )

        # Add the configuration name to the results
        benchmark_result["configuration_name"] = config_name
        benchmark_result["search_config"] = search_config

        results.append(benchmark_result)

    # Generate the comparison report
    import time

    from ..security.file_write_verifier import write_file_verified

    timestamp = time.strftime("%Y%m%d_%H%M%S")
    report_file = str(
        Path(output_dir) / f"comparison_{dataset_type}_{timestamp}.md"
    )

    # Build the report content
    content = f"# Configuration Comparison - {dataset_type.capitalize()}\n\n"

    # Write the summary table
    content += "## Summary\n\n"
    content += "| Configuration | Accuracy | Avg. Time | Examples |\n"
    content += "|---------------|----------|-----------|----------|\n"

    for result in results:
        accuracy = result.get("metrics", {}).get("accuracy", 0)
        avg_time = result.get("metrics", {}).get("average_processing_time", 0)
        examples = result.get("total_examples", 0)

        content += f"| {result['configuration_name']} | {accuracy:.3f} | {avg_time:.2f}s | {examples} |\n"

    content += "\n## Configuration Details\n\n"

    for result in results:
        content += f"### {result['configuration_name']}\n\n"

        config = result.get("search_config", {})
        content += "```\n"
        for key, value in config.items():
            content += f"{key}: {value}\n"
        content += "```\n\n"

    write_file_verified(
        report_file,
        content,
        "benchmark.allow_file_output",
        context="benchmark comparison report",
    )

    logger.info(f"Comparison report saved to {report_file}")

    return {
        "status": "complete",
        "dataset_type": dataset_type,
        "configurations_tested": len(configurations),
        "report_path": report_file,
        "results": results,
    }
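

# A usage sketch for compare_configurations. Keys other than "name",
# "iterations", "questions_per_iteration", and "search_tool" are passed
# through into the search configuration unchanged:
#
#     comparison = compare_configurations(
#         dataset_type="simpleqa",
#         num_examples=10,
#         configurations=[
#             {"name": "Wikipedia", "search_tool": "wikipedia", "iterations": 1},
#             {"name": "SearXNG", "search_tool": "searxng", "iterations": 2},
#         ],
#     )
#     print(comparison["report_path"])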


# Export the API functions
__all__ = [
    "calculate_metrics",
    "compare_configurations",
    "evaluate_browsecomp",
    "evaluate_simpleqa",
    "generate_report",
    "get_available_benchmarks",
    "run_benchmark",  # For advanced users
]