Coverage for src/local_deep_research/benchmarks/runners.py: 10%
122 statements
1"""
2Benchmark runners for Local Deep Research.
4This module provides the main functions for running benchmarks using LDR.
5"""
7import json
8from loguru import logger
9import os
10from pathlib import Path
11import time
12from typing import Any, Callable, Dict, Optional
14from ..api import quick_summary
15from .datasets import DEFAULT_DATASET_URLS, load_dataset
16from .datasets.base import DatasetRegistry
17from .graders import extract_answer_from_response, grade_results
18from .metrics import calculate_metrics, generate_report
19from .templates import BROWSECOMP_QUERY_TEMPLATE
def format_query(question: str, dataset_type: str = "simpleqa") -> str:
    """
    Format query based on dataset type.

    Args:
        question: Original question
        dataset_type: Type of dataset

    Returns:
        Formatted query for LDR
    """
    if dataset_type.lower() == "browsecomp":
        # BrowseComp requires specific formatting
        return BROWSECOMP_QUERY_TEMPLATE.format(question=question)

    # Simple format for SimpleQA
    return question

def run_benchmark(
    dataset_type: str,
    dataset_path: Optional[str] = None,
    num_examples: Optional[int] = None,
    output_dir: str = "benchmark_results",
    run_evaluation: bool = True,
    evaluation_config: Optional[Dict[str, Any]] = None,
    search_config: Optional[Dict[str, Any]] = None,
    human_evaluation: bool = False,
    progress_callback: Optional[Callable[[str, int, Dict], None]] = None,
    seed: int = 42,
) -> Dict[str, Any]:
    """
    Run a benchmark on the specified dataset.

    Args:
        dataset_type: Type of dataset ("simpleqa" or "browsecomp")
        dataset_path: Optional custom dataset path
        num_examples: Number of examples to use
        output_dir: Directory to save results
        run_evaluation: Whether to evaluate results
        evaluation_config: Custom LLM config for evaluation
        search_config: Custom search parameters
        human_evaluation: Whether to use human evaluation
        progress_callback: Optional callback for progress updates
        seed: Random seed for reproducibility

    Returns:
        Dictionary with benchmark results and metrics
    """
    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Default search configuration
    if not search_config:
        search_config = {
            "iterations": 3,
            "questions_per_iteration": 3,
            "search_tool": "searxng",
        }

    # Load dataset using the class-based approach
    try:
        # Create the dataset instance from registry
        dataset_instance = DatasetRegistry.create_dataset(
            dataset_id=dataset_type.lower(),
            dataset_path=dataset_path,
            num_examples=num_examples,
            seed=seed,
        )
        # Load the examples
        dataset = dataset_instance.load()

        logger.info(
            f"Loaded {len(dataset)} examples using dataset class {type(dataset_instance).__name__}"
        )
    except Exception as e:
        # Fall back to the legacy function if there's any issue
        logger.warning(
            f"Error using dataset class: {e}. Falling back to legacy function."
        )
        dataset = load_dataset(
            dataset_type=dataset_type,
            dataset_path=dataset_path,
            num_examples=num_examples,
            seed=seed,
        )
    # Set up output files
    timestamp = time.strftime("%Y%m%d_%H%M%S")
    results_file = str(
        Path(output_dir) / f"{dataset_type}_{timestamp}_results.jsonl"
    )
    evaluation_file = str(
        Path(output_dir) / f"{dataset_type}_{timestamp}_evaluation.jsonl"
    )
    report_file = str(
        Path(output_dir) / f"{dataset_type}_{timestamp}_report.md"
    )

    # Make sure output files don't exist
    for file in [results_file, evaluation_file, report_file]:
        file_path = Path(file)
        if file_path.exists():
            file_path.unlink()

    # Progress tracking
    total_examples = len(dataset)

    if progress_callback:
        progress_callback(
            "Starting benchmark",
            0,
            {
                "status": "started",
                "dataset_type": dataset_type,
                "total_examples": total_examples,
            },
        )
    # Process each example
    results = []

    for i, example in enumerate(dataset):
        # Extract question and answer, preferring the dataset class when available
        if "dataset_instance" in locals() and isinstance(
            dataset_instance,
            DatasetRegistry.get_dataset_class(dataset_type.lower()),
        ):
            # Use the dataset class methods to extract question and answer
            question = dataset_instance.get_question(example)
            correct_answer = dataset_instance.get_answer(example)
            logger.debug(
                "Using dataset class methods to extract question and answer"
            )
        else:
            # Fall back to the legacy approach
            if dataset_type.lower() == "simpleqa":
                question = example.get("problem", "")
                correct_answer = example.get("answer", "")
            else:  # browsecomp
                question = example.get("problem", "")
                # For BrowseComp, the answer should be in "correct_answer" after decryption
                correct_answer = example.get("correct_answer", "")
                if not correct_answer and "answer" in example:
                    # Fall back to the "answer" field if "correct_answer" is not available
                    correct_answer = example.get("answer", "")

        # Update progress
        if progress_callback:
            progress_callback(
                f"Processing example {i + 1}/{total_examples}",
                int(i / total_examples * 50),
                {
                    "status": "processing",
                    "current": i + 1,
                    "total": total_examples,
                    "question": (
                        question[:50] + "..."
                        if len(question) > 50
                        else question
                    ),
                },
            )
        logger.info(f"Processing {i + 1}/{total_examples}: {question[:50]}...")

        try:
            # Format query based on dataset type
            formatted_query = format_query(question, dataset_type)

            # Time the search
            start_time = time.time()

            # Get response from LDR
            search_result = quick_summary(
                query=formatted_query,
                iterations=search_config.get("iterations", 3),
                questions_per_iteration=search_config.get(
                    "questions_per_iteration", 3
                ),
                search_tool=search_config.get("search_tool", "searxng"),
            )

            end_time = time.time()
            processing_time = end_time - start_time

            # Extract response and search info
            response = search_result.get("summary", "")

            # Extract structured information
            extracted = extract_answer_from_response(response, dataset_type)

            # Format result
            result = {
                "id": example.get("id", f"example_{i}"),
                "problem": question,
                "correct_answer": correct_answer,
                "response": response,
                "extracted_answer": extracted["extracted_answer"],
                "confidence": extracted["confidence"],
                "processing_time": processing_time,
                "sources": search_result.get("sources", []),
                "search_config": search_config,
            }

            # Add to results list
            results.append(result)

            # Write result to file
            with open(results_file, "a") as f:
                f.write(json.dumps(result) + "\n")

            # Update progress
            if progress_callback:
                progress_callback(
                    f"Completed example {i + 1}/{total_examples}",
                    int((i + 0.5) / total_examples * 50),
                    {
                        "status": "completed_example",
                        "current": i + 1,
                        "total": total_examples,
                        "result": result,
                    },
                )
        except Exception as e:
            logger.exception(f"Error processing example {i + 1}: {e!s}")

            # Create error result
            error_result = {
                "id": example.get("id", f"example_{i}"),
                "problem": question,
                "correct_answer": correct_answer,
                "error": str(e),
                "processing_time": (
                    time.time() - start_time if "start_time" in locals() else 0
                ),
            }

            # Add to results list
            results.append(error_result)

            # Write error result to file
            with open(results_file, "a") as f:
                f.write(json.dumps(error_result) + "\n")

            # Update progress
            if progress_callback:
                progress_callback(
                    f"Error processing example {i + 1}/{total_examples}",
                    int((i + 0.5) / total_examples * 50),
                    {
                        "status": "error",
                        "current": i + 1,
                        "total": total_examples,
                        "error": str(e),
                        "result": error_result,
                    },
                )
    logger.info(f"Completed processing {total_examples} examples")

    # Run evaluation if requested
    if run_evaluation:
        if progress_callback:
            progress_callback(
                "Starting evaluation",
                50,
                {"status": "evaluating", "results_file": results_file},
            )

        if human_evaluation:
            from .graders import human_evaluation as evaluate

            logger.info("Running human evaluation...")
            evaluation_results = evaluate(
                results_file=results_file,
                output_file=evaluation_file,
                interactive=True,
            )
        else:
            logger.info("Running automated evaluation...")
            try:
                evaluation_results = grade_results(
                    results_file=results_file,
                    output_file=evaluation_file,
                    dataset_type=dataset_type,
                    evaluation_config=evaluation_config,
                    progress_callback=lambda current, total, meta: (
                        progress_callback(
                            f"Evaluating {current + 1}/{total}",
                            50 + int((current + 0.5) / total * 40),
                            {**meta, "status": "evaluating"},
                        )
                        if progress_callback
                        else None
                    ),
                )
            except Exception as e:
                logger.exception(f"Automated evaluation failed: {e!s}")

                if progress_callback:
                    progress_callback(
                        "Automated evaluation failed. Falling back to human evaluation.",
                        60,
                        {"status": "evaluation_fallback", "error": str(e)},
                    )

                # Ask if user wants to fall back to human evaluation
                fallback_to_human = False
                print("\nAutomated evaluation failed with error:", str(e))
                response = input(
                    "Do you want to fall back to human evaluation? (y/n): "
                )
                fallback_to_human = response.strip().lower() == "y"

                if fallback_to_human:
                    logger.info("Falling back to human evaluation...")
                    from .graders import human_evaluation as evaluate

                    evaluation_results = evaluate(
                        results_file=results_file,
                        output_file=evaluation_file,
                        interactive=True,
                    )
                else:
                    from ..security.file_write_verifier import (
                        write_file_verified,
                    )

                    logger.info("Skipping evaluation due to error.")
                    # Create an empty evaluation file to prevent issues
                    write_file_verified(
                        evaluation_file,
                        "",
                        "benchmark.allow_file_output",
                        context="empty evaluation placeholder",
                    )

                    return {
                        "status": "evaluation_error",
                        "dataset_type": dataset_type,
                        "results_path": results_file,
                        "evaluation_error": str(e),
                        "total_examples": total_examples,
                    }
        # Calculate metrics
        if progress_callback:
            progress_callback(
                "Calculating metrics", 90, {"status": "calculating_metrics"}
            )

        metrics = calculate_metrics(evaluation_file)

        # Generate report
        if progress_callback:
            progress_callback(
                "Generating report", 95, {"status": "generating_report"}
            )

        dataset_name = dataset_type.capitalize()
        report_path = generate_report(
            metrics=metrics,
            results_file=evaluation_file,
            output_file=report_file,
            dataset_name=dataset_name,
            config_info={
                "Dataset": dataset_path
                or DEFAULT_DATASET_URLS.get(dataset_type, "Unknown"),
                "Examples": total_examples,
                "Iterations": search_config.get("iterations", 3),
                "Questions per iteration": search_config.get(
                    "questions_per_iteration", 3
                ),
                "Search tool": search_config.get("search_tool", "searxng"),
                "Evaluation method": "Human"
                if human_evaluation
                else "Automated",
            },
        )

        # Mark as complete
        if progress_callback:
            progress_callback(
                "Benchmark complete",
                100,
                {
                    "status": "complete",
                    "metrics": metrics,
                    "report_path": report_path,
                },
            )

        return {
            "status": "complete",
            "dataset_type": dataset_type,
            "results_path": results_file,
            "evaluation_path": evaluation_file,
            "report_path": report_path,
            "metrics": metrics,
            "total_examples": total_examples,
            "accuracy": metrics.get("accuracy", 0),
        }
    else:
        # No evaluation, just return results
        if progress_callback:
            progress_callback(
                "Benchmark complete (no evaluation)",
                100,
                {"status": "complete_no_eval", "results_path": results_file},
            )

        return {
            "status": "complete_no_eval",
            "dataset_type": dataset_type,
            "results_path": results_file,
            "total_examples": total_examples,
        }

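# Illustrative sketch, not part of the original module: a minimal progress callback
# matching the Callable[[str, int, Dict], None] contract that run_benchmark expects
# (a message, a percentage from 0 to 100, and a metadata dict carrying a "status" key).
# The function name and log format below are assumptions.
def _example_progress_callback(message: str, progress: int, metadata: Dict) -> None:
    """Log progress updates emitted by run_benchmark."""
    logger.info(f"[{progress:3d}%] {message} (status={metadata.get('status')})")
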
def run_simpleqa_benchmark(num_examples: int = 100, **kwargs) -> Dict[str, Any]:
    """
    Run SimpleQA benchmark with default settings.

    Args:
        num_examples: Number of examples to process
        **kwargs: Additional arguments to pass to run_benchmark

    Returns:
        Dictionary with benchmark results
    """
    return run_benchmark(
        dataset_type="simpleqa", num_examples=num_examples, **kwargs
    )

def run_browsecomp_benchmark(
    num_examples: int = 100, **kwargs
) -> Dict[str, Any]:
    """
    Run BrowseComp benchmark with default settings.

    Args:
        num_examples: Number of examples to process
        **kwargs: Additional arguments to pass to run_benchmark

    Returns:
        Dictionary with benchmark results
    """
    return run_benchmark(
        dataset_type="browsecomp", num_examples=num_examples, **kwargs
    )

def run_xbench_deepsearch_benchmark(
    num_examples: int = 100, **kwargs
) -> Dict[str, Any]:
    """
    Run xbench-DeepSearch benchmark with default settings.

    Args:
        num_examples: Number of examples to process
        **kwargs: Additional arguments to pass to run_benchmark

    Returns:
        Dictionary with benchmark results
    """
    return run_benchmark(
        dataset_type="xbench_deepsearch", num_examples=num_examples, **kwargs
    )

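# Usage sketch (an assumption, not part of the original module): run the SimpleQA
# benchmark on a handful of examples with evaluation disabled. This assumes a working
# LDR setup with a reachable search backend and configured LLM; the example count and
# the search_config values below are arbitrary, though the keys mirror those used above.
if __name__ == "__main__":
    outcome = run_simpleqa_benchmark(
        num_examples=5,
        output_dir="benchmark_results",
        run_evaluation=False,
        search_config={
            "iterations": 1,
            "questions_per_iteration": 1,
            "search_tool": "searxng",
        },
        progress_callback=_example_progress_callback,
    )
    logger.info(f"Results written to {outcome['results_path']}")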