Coverage for src/local_deep_research/benchmarks/runners.py: 10%

122 statements  


"""
Benchmark runners for Local Deep Research.

This module provides the main functions for running benchmarks using LDR.
"""

import json
import os
import time
from pathlib import Path
from typing import Any, Callable, Dict, Optional

from loguru import logger

from ..api import quick_summary
from .datasets import DEFAULT_DATASET_URLS, load_dataset
from .datasets.base import DatasetRegistry
from .graders import extract_answer_from_response, grade_results
from .metrics import calculate_metrics, generate_report
from .templates import BROWSECOMP_QUERY_TEMPLATE


def format_query(question: str, dataset_type: str = "simpleqa") -> str:
    """
    Format query based on dataset type.

    Args:
        question: Original question
        dataset_type: Type of dataset

    Returns:
        Formatted query for LDR
    """
    if dataset_type.lower() == "browsecomp":
        # BrowseComp requires specific formatting
        return BROWSECOMP_QUERY_TEMPLATE.format(question=question)

    # Simple format for SimpleQA
    return question
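

# Example usage (illustrative sketch; the exact BrowseComp wording depends on
# the template text defined in .templates, which is not shown here):
#
#   format_query("Who wrote the novel?")                # -> returned unchanged
#   format_query("Who wrote the novel?", "browsecomp")  # -> BROWSECOMP_QUERY_TEMPLATE.format(
#                                                       #        question="Who wrote the novel?")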


def run_benchmark(
    dataset_type: str,
    dataset_path: Optional[str] = None,
    num_examples: Optional[int] = None,
    output_dir: str = "benchmark_results",
    run_evaluation: bool = True,
    evaluation_config: Optional[Dict[str, Any]] = None,
    search_config: Optional[Dict[str, Any]] = None,
    human_evaluation: bool = False,
    progress_callback: Optional[Callable[[str, int, Dict], None]] = None,
    seed: int = 42,
) -> Dict[str, Any]:
    """
    Run a benchmark on the specified dataset.

    Args:
        dataset_type: Type of dataset ("simpleqa" or "browsecomp")
        dataset_path: Optional custom dataset path
        num_examples: Number of examples to use
        output_dir: Directory to save results
        run_evaluation: Whether to evaluate results
        evaluation_config: Custom LLM config for evaluation
        search_config: Custom search parameters
        human_evaluation: Whether to use human evaluation
        progress_callback: Optional callback for progress updates
        seed: Random seed for reproducibility

    Returns:
        Dictionary with benchmark results and metrics
    """
    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Default search configuration
    if not search_config:
        search_config = {
            "iterations": 3,
            "questions_per_iteration": 3,
            "search_tool": "searxng",
        }
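
    # A caller-supplied search_config overrides the defaults above; a minimal
    # sketch, mirroring the keys read via .get() below (any omitted key falls
    # back to its default):
    #
    #   search_config = {
    #       "iterations": 2,
    #       "questions_per_iteration": 5,
    #       "search_tool": "searxng",
    #   }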

    # Load dataset using the class-based approach
    try:
        # Create the dataset instance from registry
        dataset_instance = DatasetRegistry.create_dataset(
            dataset_id=dataset_type.lower(),
            dataset_path=dataset_path,
            num_examples=num_examples,
            seed=seed,
        )
        # Load the examples
        dataset = dataset_instance.load()

        logger.info(
            f"Loaded {len(dataset)} examples using dataset class {type(dataset_instance).__name__}"
        )
    except Exception as e:
        # Fall back to the legacy function if there's any issue
        logger.warning(
            f"Error using dataset class: {e}. Falling back to legacy function."
        )
        dataset = load_dataset(
            dataset_type=dataset_type,
            dataset_path=dataset_path,
            num_examples=num_examples,
            seed=seed,
        )

    # Set up output files
    timestamp = time.strftime("%Y%m%d_%H%M%S")
    results_file = str(
        Path(output_dir) / f"{dataset_type}_{timestamp}_results.jsonl"
    )
    evaluation_file = str(
        Path(output_dir) / f"{dataset_type}_{timestamp}_evaluation.jsonl"
    )
    report_file = str(
        Path(output_dir) / f"{dataset_type}_{timestamp}_report.md"
    )

    # Make sure the output files don't already exist
    for file in [results_file, evaluation_file, report_file]:
        file_path = Path(file)
        if file_path.exists():
            file_path.unlink()

    # Progress tracking
    total_examples = len(dataset)

    if progress_callback:
        progress_callback(
            "Starting benchmark",
            0,
            {
                "status": "started",
                "dataset_type": dataset_type,
                "total_examples": total_examples,
            },
        )
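
    # Progress scale used for callbacks: roughly 0-50 while processing
    # examples, 50-90 during evaluation, 90/95 for metrics and report
    # generation, and 100 on completion.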

    # Process each example
    results = []

    for i, example in enumerate(dataset):
        # Extract question and answer in a way that uses the dataset class when available
        if "dataset_instance" in locals() and isinstance(
            dataset_instance,
            DatasetRegistry.get_dataset_class(dataset_type.lower()),
        ):
            # Use the dataset class methods to extract question and answer
            question = dataset_instance.get_question(example)
            correct_answer = dataset_instance.get_answer(example)
            logger.debug(
                "Using dataset class methods to extract question and answer"
            )
        else:
            # Fall back to the legacy approach
            if dataset_type.lower() == "simpleqa":
                question = example.get("problem", "")
                correct_answer = example.get("answer", "")
            else:  # browsecomp
                question = example.get("problem", "")
                # For BrowseComp, the answer should be in "correct_answer" after decryption
                correct_answer = example.get("correct_answer", "")
                if not correct_answer and "answer" in example:
                    # Fall back to the "answer" field if "correct_answer" is not available
                    correct_answer = example.get("answer", "")

        # Update progress
        if progress_callback:
            progress_callback(
                f"Processing example {i + 1}/{total_examples}",
                int(i / total_examples * 50),
                {
                    "status": "processing",
                    "current": i + 1,
                    "total": total_examples,
                    "question": (
                        question[:50] + "..."
                        if len(question) > 50
                        else question
                    ),
                },
            )

        logger.info(f"Processing {i + 1}/{total_examples}: {question[:50]}...")

        try:
            # Format query based on dataset type
            formatted_query = format_query(question, dataset_type)

            # Time the search
            start_time = time.time()

            # Get response from LDR
            search_result = quick_summary(
                query=formatted_query,
                iterations=search_config.get("iterations", 3),
                questions_per_iteration=search_config.get(
                    "questions_per_iteration", 3
                ),
                search_tool=search_config.get("search_tool", "searxng"),
            )

            end_time = time.time()
            processing_time = end_time - start_time

            # Extract response and search info
            response = search_result.get("summary", "")

            # Extract structured information
            extracted = extract_answer_from_response(response, dataset_type)

            # Format result
            result = {
                "id": example.get("id", f"example_{i}"),
                "problem": question,
                "correct_answer": correct_answer,
                "response": response,
                "extracted_answer": extracted["extracted_answer"],
                "confidence": extracted["confidence"],
                "processing_time": processing_time,
                "sources": search_result.get("sources", []),
                "search_config": search_config,
            }

            # Add to results list
            results.append(result)

            # Write result to file
            with open(results_file, "a") as f:
                f.write(json.dumps(result) + "\n")

            # Update progress
            if progress_callback:
                progress_callback(
                    f"Completed example {i + 1}/{total_examples}",
                    int((i + 0.5) / total_examples * 50),
                    {
                        "status": "completed_example",
                        "current": i + 1,
                        "total": total_examples,
                        "result": result,
                    },
                )

        except Exception as e:
            logger.exception(f"Error processing example {i + 1}: {e!s}")

            # Create error result
            error_result = {
                "id": example.get("id", f"example_{i}"),
                "problem": question,
                "correct_answer": correct_answer,
                "error": str(e),
                "processing_time": (
                    time.time() - start_time if "start_time" in locals() else 0
                ),
            }

            # Add to results list
            results.append(error_result)

            # Write error result to file
            with open(results_file, "a") as f:
                f.write(json.dumps(error_result) + "\n")

            # Update progress
            if progress_callback:
                progress_callback(
                    f"Error processing example {i + 1}/{total_examples}",
                    int((i + 0.5) / total_examples * 50),
                    {
                        "status": "error",
                        "current": i + 1,
                        "total": total_examples,
                        "error": str(e),
                        "result": error_result,
                    },
                )

    logger.info(f"Completed processing {total_examples} examples")

    # Run evaluation if requested
    if run_evaluation:
        if progress_callback:
            progress_callback(
                "Starting evaluation",
                50,
                {"status": "evaluating", "results_file": results_file},
            )

        if human_evaluation:
            from .graders import human_evaluation as evaluate

            logger.info("Running human evaluation...")
            evaluation_results = evaluate(
                results_file=results_file,
                output_file=evaluation_file,
                interactive=True,
            )
        else:
            logger.info("Running automated evaluation...")
            try:
                evaluation_results = grade_results(
                    results_file=results_file,
                    output_file=evaluation_file,
                    dataset_type=dataset_type,
                    evaluation_config=evaluation_config,
                    progress_callback=lambda current, total, meta: (
                        progress_callback(
                            f"Evaluating {current + 1}/{total}",
                            50 + int((current + 0.5) / total * 40),
                            {**meta, "status": "evaluating"},
                        )
                        if progress_callback
                        else None
                    ),
                )
            except Exception as e:
                logger.exception(f"Automated evaluation failed: {e!s}")

                if progress_callback:
                    progress_callback(
                        "Automated evaluation failed. Falling back to human evaluation.",
                        60,
                        {"status": "evaluation_fallback", "error": str(e)},
                    )

                # Ask if user wants to fall back to human evaluation
                fallback_to_human = False
                print("\nAutomated evaluation failed with error:", str(e))
                response = input(
                    "Do you want to fall back to human evaluation? (y/n): "
                )
                fallback_to_human = response.strip().lower() == "y"

                if fallback_to_human:
                    logger.info("Falling back to human evaluation...")
                    from .graders import human_evaluation as evaluate

                    evaluation_results = evaluate(
                        results_file=results_file,
                        output_file=evaluation_file,
                        interactive=True,
                    )
                else:
                    from ..security.file_write_verifier import (
                        write_file_verified,
                    )

                    logger.info("Skipping evaluation due to error.")
                    # Create an empty evaluation file to prevent issues
                    write_file_verified(
                        evaluation_file,
                        "",
                        "benchmark.allow_file_output",
                        context="empty evaluation placeholder",
                    )

                    return {
                        "status": "evaluation_error",
                        "dataset_type": dataset_type,
                        "results_path": results_file,
                        "evaluation_error": str(e),
                        "total_examples": total_examples,
                    }

        # Calculate metrics
        if progress_callback:
            progress_callback(
                "Calculating metrics", 90, {"status": "calculating_metrics"}
            )

        metrics = calculate_metrics(evaluation_file)

        # Generate report
        if progress_callback:
            progress_callback(
                "Generating report", 95, {"status": "generating_report"}
            )

        dataset_name = dataset_type.capitalize()
        report_path = generate_report(
            metrics=metrics,
            results_file=evaluation_file,
            output_file=report_file,
            dataset_name=dataset_name,
            config_info={
                "Dataset": dataset_path
                or DEFAULT_DATASET_URLS.get(dataset_type, "Unknown"),
                "Examples": total_examples,
                "Iterations": search_config.get("iterations", 3),
                "Questions per iteration": search_config.get(
                    "questions_per_iteration", 3
                ),
                "Search tool": search_config.get("search_tool", "searxng"),
                "Evaluation method": "Human"
                if human_evaluation
                else "Automated",
            },
        )

        # Mark as complete
        if progress_callback:
            progress_callback(
                "Benchmark complete",
                100,
                {
                    "status": "complete",
                    "metrics": metrics,
                    "report_path": report_path,
                },
            )

        return {
            "status": "complete",
            "dataset_type": dataset_type,
            "results_path": results_file,
            "evaluation_path": evaluation_file,
            "report_path": report_path,
            "metrics": metrics,
            "total_examples": total_examples,
            "accuracy": metrics.get("accuracy", 0),
        }

    else:
        # No evaluation, just return results
        if progress_callback:
            progress_callback(
                "Benchmark complete (no evaluation)",
                100,
                {"status": "complete_no_eval", "results_path": results_file},
            )

        return {
            "status": "complete_no_eval",
            "dataset_type": dataset_type,
            "results_path": results_file,
            "total_examples": total_examples,
        }
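

# Example invocation (a minimal sketch; assumes the named dataset and search
# tool are available in the running LDR installation, and uses illustrative
# values for the keyword arguments):
#
#   outcome = run_benchmark(
#       dataset_type="simpleqa",
#       num_examples=5,
#       output_dir="benchmark_results",
#       search_config={"iterations": 1, "questions_per_iteration": 2},
#   )
#   # When evaluation completes, outcome["status"] is "complete" and the dict
#   # carries "results_path", "evaluation_path", "report_path", "metrics",
#   # and "accuracy".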


def run_simpleqa_benchmark(num_examples: int = 100, **kwargs) -> Dict[str, Any]:
    """
    Run SimpleQA benchmark with default settings.

    Args:
        num_examples: Number of examples to process
        **kwargs: Additional arguments to pass to run_benchmark

    Returns:
        Dictionary with benchmark results
    """
    return run_benchmark(
        dataset_type="simpleqa", num_examples=num_examples, **kwargs
    )


def run_browsecomp_benchmark(
    num_examples: int = 100, **kwargs
) -> Dict[str, Any]:
    """
    Run BrowseComp benchmark with default settings.

    Args:
        num_examples: Number of examples to process
        **kwargs: Additional arguments to pass to run_benchmark

    Returns:
        Dictionary with benchmark results
    """
    return run_benchmark(
        dataset_type="browsecomp", num_examples=num_examples, **kwargs
    )


def run_xbench_deepsearch_benchmark(
    num_examples: int = 100, **kwargs
) -> Dict[str, Any]:
    """
    Run xbench-DeepSearch benchmark with default settings.

    Args:
        num_examples: Number of examples to process
        **kwargs: Additional arguments to pass to run_benchmark

    Returns:
        Dictionary with benchmark results
    """
    return run_benchmark(
        dataset_type="xbench_deepsearch", num_examples=num_examples, **kwargs
    )
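

# Progress reporting sketch (print_progress is a hypothetical callback; the
# expected signature is Callable[[str, int, Dict], None], as declared on
# run_benchmark, and every payload includes a "status" key):
#
#   def print_progress(message: str, progress: int, data: Dict) -> None:
#       print(f"[{progress:3d}%] {message} ({data.get('status')})")
#
#   run_simpleqa_benchmark(num_examples=10, progress_callback=print_progress)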