Coverage for src/local_deep_research/benchmarks/runners.py: 10%

122 statements  


"""
Benchmark runners for Local Deep Research.

This module provides the main functions for running benchmarks using LDR.
"""

import json
import os
import time
from pathlib import Path
from typing import Any, Callable, Dict, Optional

from loguru import logger

from ..api import quick_summary
from .datasets import DEFAULT_DATASET_URLS, load_dataset
from .datasets.base import DatasetRegistry
from .graders import extract_answer_from_response, grade_results
from .metrics import calculate_metrics, generate_report
from .templates import BROWSECOMP_QUERY_TEMPLATE


def format_query(question: str, dataset_type: str = "simpleqa") -> str:
    """
    Format query based on dataset type.

    Args:
        question: Original question
        dataset_type: Type of dataset

    Returns:
        Formatted query for LDR
    """
    if dataset_type.lower() == "browsecomp":
        # BrowseComp requires specific formatting
        return BROWSECOMP_QUERY_TEMPLATE.format(question=question)

    # Simple format for SimpleQA
    return question
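

# Example usage (illustrative sketch; the exact BrowseComp wording depends on
# the template text defined in .templates, which is not shown here):
#
#   format_query("Who wrote the novel?")                # -> returned unchanged
#   format_query("Who wrote the novel?", "browsecomp")  # -> BROWSECOMP_QUERY_TEMPLATE.format(
#                                                       #        question="Who wrote the novel?")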


def run_benchmark(
    dataset_type: str,
    dataset_path: Optional[str] = None,
    num_examples: Optional[int] = None,
    output_dir: str = "benchmark_results",
    run_evaluation: bool = True,
    evaluation_config: Optional[Dict[str, Any]] = None,
    search_config: Optional[Dict[str, Any]] = None,
    human_evaluation: bool = False,
    progress_callback: Optional[Callable[[str, int, Dict], None]] = None,
    seed: int = 42,
) -> Dict[str, Any]:
    """
    Run a benchmark on the specified dataset.

    Args:
        dataset_type: Type of dataset ("simpleqa" or "browsecomp")
        dataset_path: Optional custom dataset path
        num_examples: Number of examples to use
        output_dir: Directory to save results
        run_evaluation: Whether to evaluate results
        evaluation_config: Custom LLM config for evaluation
        search_config: Custom search parameters
        human_evaluation: Whether to use human evaluation
        progress_callback: Optional callback for progress updates
        seed: Random seed for reproducibility

    Returns:
        Dictionary with benchmark results and metrics
    """
    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Default search configuration
    if not search_config:
        search_config = {
            "iterations": 3,
            "questions_per_iteration": 3,
            "search_tool": "searxng",
        }
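
    # A caller-supplied search_config overrides the defaults above; a minimal
    # sketch, mirroring the keys read via .get() below (any omitted key falls
    # back to its default):
    #
    #   search_config = {
    #       "iterations": 2,
    #       "questions_per_iteration": 5,
    #       "search_tool": "searxng",
    #   }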

    # Load dataset using the class-based approach
    try:
        # Create the dataset instance from registry
        dataset_instance = DatasetRegistry.create_dataset(
            dataset_id=dataset_type.lower(),
            dataset_path=dataset_path,
            num_examples=num_examples,
            seed=seed,
        )
        # Load the examples
        dataset = dataset_instance.load()

        logger.info(
            f"Loaded {len(dataset)} examples using dataset class {type(dataset_instance).__name__}"
        )
    except Exception as e:
        # Fall back to the legacy function if there's any issue
        logger.warning(
            f"Error using dataset class: {e}. Falling back to legacy function."
        )
        dataset = load_dataset(
            dataset_type=dataset_type,
            dataset_path=dataset_path,
            num_examples=num_examples,
            seed=seed,
        )

    # Set up output files
    timestamp = time.strftime("%Y%m%d_%H%M%S")
    results_file = str(
        Path(output_dir) / f"{dataset_type}_{timestamp}_results.jsonl"
    )
    evaluation_file = str(
        Path(output_dir) / f"{dataset_type}_{timestamp}_evaluation.jsonl"
    )
    report_file = str(
        Path(output_dir) / f"{dataset_type}_{timestamp}_report.md"
    )

    # Make sure the output files don't already exist
    for file in [results_file, evaluation_file, report_file]:
        file_path = Path(file)
        if file_path.exists():
            file_path.unlink()

    # Progress tracking
    total_examples = len(dataset)

    if progress_callback:
        progress_callback(
            "Starting benchmark",
            0,
            {
                "status": "started",
                "dataset_type": dataset_type,
                "total_examples": total_examples,
            },
        )
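
    # Progress scale used for callbacks: roughly 0-50 while processing
    # examples, 50-90 during evaluation, 90/95 for metrics and report
    # generation, and 100 on completion.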

    # Process each example
    results = []

    for i, example in enumerate(dataset):
        # Extract question and answer in a way that uses the dataset class when available
        if "dataset_instance" in locals() and isinstance(
            dataset_instance,
            DatasetRegistry.get_dataset_class(dataset_type.lower()),
        ):
            # Use the dataset class methods to extract question and answer
            question = dataset_instance.get_question(example)
            correct_answer = dataset_instance.get_answer(example)
            logger.debug(
                "Using dataset class methods to extract question and answer"
            )
        else:
            # Fall back to the legacy approach
            if dataset_type.lower() == "simpleqa":
                question = example.get("problem", "")
                correct_answer = example.get("answer", "")
            else:  # browsecomp
                question = example.get("problem", "")
                # For BrowseComp, the answer should be in "correct_answer" after decryption
                correct_answer = example.get("correct_answer", "")
                if not correct_answer and "answer" in example:
                    # Fall back to the "answer" field if "correct_answer" is not available
                    correct_answer = example.get("answer", "")

        # Update progress
        if progress_callback:
            progress_callback(
                f"Processing example {i + 1}/{total_examples}",
                int(i / total_examples * 50),
                {
                    "status": "processing",
                    "current": i + 1,
                    "total": total_examples,
                    "question": (
                        question[:50] + "..."
                        if len(question) > 50
                        else question
                    ),
                },
            )

        logger.info(f"Processing {i + 1}/{total_examples}: {question[:50]}...")

        try:
            # Format query based on dataset type
            formatted_query = format_query(question, dataset_type)

            # Time the search
            start_time = time.time()

            # Get response from LDR
            search_result = quick_summary(
                query=formatted_query,
                iterations=search_config.get("iterations", 3),
                questions_per_iteration=search_config.get(
                    "questions_per_iteration", 3
                ),
                search_tool=search_config.get("search_tool", "searxng"),
            )

            end_time = time.time()
            processing_time = end_time - start_time

            # Extract response and search info
            response = search_result.get("summary", "")

            # Extract structured information
            extracted = extract_answer_from_response(response, dataset_type)

            # Format result
            result = {
                "id": example.get("id", f"example_{i}"),
                "problem": question,
                "correct_answer": correct_answer,
                "response": response,
                "extracted_answer": extracted["extracted_answer"],
                "confidence": extracted["confidence"],
                "processing_time": processing_time,
                "sources": search_result.get("sources", []),
                "search_config": search_config,
            }

            # Add to results list
            results.append(result)

            # Write result to file
            with open(results_file, "a") as f:
                f.write(json.dumps(result) + "\n")

            # Update progress
            if progress_callback:
                progress_callback(
                    f"Completed example {i + 1}/{total_examples}",
                    int((i + 0.5) / total_examples * 50),
                    {
                        "status": "completed_example",
                        "current": i + 1,
                        "total": total_examples,
                        "result": result,
                    },
                )

        except Exception as e:
            logger.exception(f"Error processing example {i + 1}: {e!s}")

            # Create error result
            error_result = {
                "id": example.get("id", f"example_{i}"),
                "problem": question,
                "correct_answer": correct_answer,
                "error": str(e),
                "processing_time": (
                    time.time() - start_time if "start_time" in locals() else 0
                ),
            }

            # Add to results list
            results.append(error_result)

            # Write error result to file
            with open(results_file, "a") as f:
                f.write(json.dumps(error_result) + "\n")

            # Update progress
            if progress_callback:
                progress_callback(
                    f"Error processing example {i + 1}/{total_examples}",
                    int((i + 0.5) / total_examples * 50),
                    {
                        "status": "error",
                        "current": i + 1,
                        "total": total_examples,
                        "error": str(e),
                        "result": error_result,
                    },
                )

    logger.info(f"Completed processing {total_examples} examples")

    # Run evaluation if requested
    if run_evaluation:
        if progress_callback:
            progress_callback(
                "Starting evaluation",
                50,
                {"status": "evaluating", "results_file": results_file},
            )

        if human_evaluation:
            from .graders import human_evaluation as evaluate

            logger.info("Running human evaluation...")
            evaluation_results = evaluate(
                results_file=results_file,
                output_file=evaluation_file,
                interactive=True,
            )
        else:
            logger.info("Running automated evaluation...")
            try:
                evaluation_results = grade_results(
                    results_file=results_file,
                    output_file=evaluation_file,
                    dataset_type=dataset_type,
                    evaluation_config=evaluation_config,
                    progress_callback=lambda current, total, meta: (
                        progress_callback(
                            f"Evaluating {current + 1}/{total}",
                            50 + int((current + 0.5) / total * 40),
                            {**meta, "status": "evaluating"},
                        )
                        if progress_callback
                        else None
                    ),
                )
            except Exception as e:
                logger.exception(f"Automated evaluation failed: {e!s}")

                if progress_callback:
                    progress_callback(
                        "Automated evaluation failed. Falling back to human evaluation.",
                        60,
                        {"status": "evaluation_fallback", "error": str(e)},
                    )

                # Ask if user wants to fall back to human evaluation
                fallback_to_human = False
                print("\nAutomated evaluation failed with error:", str(e))
                response = input(
                    "Do you want to fall back to human evaluation? (y/n): "
                )
                fallback_to_human = response.strip().lower() == "y"

                if fallback_to_human:
                    logger.info("Falling back to human evaluation...")
                    from .graders import human_evaluation as evaluate

                    evaluation_results = evaluate(
                        results_file=results_file,
                        output_file=evaluation_file,
                        interactive=True,
                    )
                else:
                    from ..security.file_write_verifier import (
                        write_file_verified,
                    )

                    logger.info("Skipping evaluation due to error.")
                    # Create an empty evaluation file to prevent issues
                    write_file_verified(
                        evaluation_file,
                        "",
                        "benchmark.allow_file_output",
                        context="empty evaluation placeholder",
                    )

                    return {
                        "status": "evaluation_error",
                        "dataset_type": dataset_type,
                        "results_path": results_file,
                        "evaluation_error": str(e),
                        "total_examples": total_examples,
                    }

        # Calculate metrics
        if progress_callback:
            progress_callback(
                "Calculating metrics", 90, {"status": "calculating_metrics"}
            )

        metrics = calculate_metrics(evaluation_file)

        # Generate report
        if progress_callback:
            progress_callback(
                "Generating report", 95, {"status": "generating_report"}
            )

        dataset_name = dataset_type.capitalize()
        report_path = generate_report(
            metrics=metrics,
            results_file=evaluation_file,
            output_file=report_file,
            dataset_name=dataset_name,
            config_info={
                "Dataset": dataset_path
                or DEFAULT_DATASET_URLS.get(dataset_type, "Unknown"),
                "Examples": total_examples,
                "Iterations": search_config.get("iterations", 3),
                "Questions per iteration": search_config.get(
                    "questions_per_iteration", 3
                ),
                "Search tool": search_config.get("search_tool", "searxng"),
                "Evaluation method": "Human"
                if human_evaluation
                else "Automated",
            },
        )

        # Mark as complete
        if progress_callback:
            progress_callback(
                "Benchmark complete",
                100,
                {
                    "status": "complete",
                    "metrics": metrics,
                    "report_path": report_path,
                },
            )

        return {
            "status": "complete",
            "dataset_type": dataset_type,
            "results_path": results_file,
            "evaluation_path": evaluation_file,
            "report_path": report_path,
            "metrics": metrics,
            "total_examples": total_examples,
            "accuracy": metrics.get("accuracy", 0),
        }

    else:
        # No evaluation, just return results
        if progress_callback:
            progress_callback(
                "Benchmark complete (no evaluation)",
                100,
                {"status": "complete_no_eval", "results_path": results_file},
            )

        return {
            "status": "complete_no_eval",
            "dataset_type": dataset_type,
            "results_path": results_file,
            "total_examples": total_examples,
        }
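

# Example invocation (a minimal sketch; assumes the named dataset and search
# tool are available in the running LDR installation, and uses illustrative
# values for the keyword arguments):
#
#   outcome = run_benchmark(
#       dataset_type="simpleqa",
#       num_examples=5,
#       output_dir="benchmark_results",
#       search_config={"iterations": 1, "questions_per_iteration": 2},
#   )
#   # When evaluation completes, outcome["status"] is "complete" and the dict
#   # carries "results_path", "evaluation_path", "report_path", "metrics",
#   # and "accuracy".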


def run_simpleqa_benchmark(num_examples: int = 100, **kwargs) -> Dict[str, Any]:
    """
    Run SimpleQA benchmark with default settings.

    Args:
        num_examples: Number of examples to process
        **kwargs: Additional arguments to pass to run_benchmark

    Returns:
        Dictionary with benchmark results
    """
    return run_benchmark(
        dataset_type="simpleqa", num_examples=num_examples, **kwargs
    )


def run_browsecomp_benchmark(
    num_examples: int = 100, **kwargs
) -> Dict[str, Any]:
    """
    Run BrowseComp benchmark with default settings.

    Args:
        num_examples: Number of examples to process
        **kwargs: Additional arguments to pass to run_benchmark

    Returns:
        Dictionary with benchmark results
    """
    return run_benchmark(
        dataset_type="browsecomp", num_examples=num_examples, **kwargs
    )


def run_xbench_deepsearch_benchmark(
    num_examples: int = 100, **kwargs
) -> Dict[str, Any]:
    """
    Run xbench-DeepSearch benchmark with default settings.

    Args:
        num_examples: Number of examples to process
        **kwargs: Additional arguments to pass to run_benchmark

    Returns:
        Dictionary with benchmark results
    """
    return run_benchmark(
        dataset_type="xbench_deepsearch", num_examples=num_examples, **kwargs
    )
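

# Progress reporting sketch (print_progress is a hypothetical callback; the
# expected signature is Callable[[str, int, Dict], None], as declared on
# run_benchmark, and every payload includes a "status" key):
#
#   def print_progress(message: str, progress: int, data: Dict) -> None:
#       print(f"[{progress:3d}%] {message} ({data.get('status')})")
#
#   run_simpleqa_benchmark(num_examples=10, progress_callback=print_progress)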