Coverage for src/local_deep_research/benchmarks/web_api/benchmark_routes.py: 27% of 336 statements

1"""Flask routes for benchmark web interface.""" 

2 

3import time 

4 

5from flask import Blueprint, jsonify, request 

6from loguru import logger 

7 

8from ...database.session_context import get_user_db_session 

9from ...web.auth.decorators import login_required 

10from ...web.utils.rate_limiter import limiter 

11from local_deep_research.settings import SettingsManager 

12from ...web.utils.templates import render_template_with_defaults 

13from .benchmark_service import benchmark_service 

14 

15# Create blueprint for benchmark routes 

16benchmark_bp = Blueprint("benchmark", __name__, url_prefix="/benchmark") 

17 

18 
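# A minimal sketch of how this blueprint is typically wired up, assuming an
# application factory named create_app that is not part of this module; with
# url_prefix="/benchmark", the index() view below is served at /benchmark/:
#
#     from flask import Flask
#
#     def create_app() -> Flask:
#         app = Flask(__name__)
#         app.register_blueprint(benchmark_bp)  # mounts all routes under /benchmark
#         return app
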

19@benchmark_bp.route("/") 

20@login_required 

21def index(): 

22 """Benchmark dashboard page.""" 

23 from flask import session as flask_session 

24 

25 username = flask_session.get("username") 

26 with get_user_db_session(username) as db_session: 

27 settings_manager = SettingsManager(db_session) 

28 

29 # Load evaluation settings from database 

30 eval_settings = { 

31 "evaluation_provider": settings_manager.get_setting( 

32 "benchmark.evaluation.provider", "openai_endpoint" 

33 ), 

34 "evaluation_model": settings_manager.get_setting( 

35 "benchmark.evaluation.model", "" 

36 ), 

37 "evaluation_endpoint_url": settings_manager.get_setting( 

38 "benchmark.evaluation.endpoint_url", "" 

39 ), 

40 "evaluation_temperature": settings_manager.get_setting( 

41 "benchmark.evaluation.temperature", 0 

42 ), 

43 } 

44 

45 return render_template_with_defaults( 

46 "pages/benchmark.html", eval_settings=eval_settings 

47 ) 

48 

49 

50@benchmark_bp.route("/results") 

51@login_required 

52def results(): 

53 """Benchmark results history page.""" 

54 return render_template_with_defaults("pages/benchmark_results.html") 

55 

56 

57@benchmark_bp.route("/api/start", methods=["POST"]) 

58@login_required 

59def start_benchmark(): 

60 """Start a new benchmark run.""" 

61 try: 

62 data = request.get_json() 

63 

64 if not data: 

65 return jsonify({"error": "No data provided"}), 400 

66 

67 # Extract configuration 

68 run_name = data.get("run_name") 

69 

70 # Get search config from database instead of request 

71 from ...database.session_context import get_user_db_session 

72 from local_deep_research.settings import SettingsManager 

73 from flask import session as flask_session 

74 

75 username = flask_session.get("username") 

76 session_id = flask_session.get("session_id") 

77 

78 # Try to get password from session store for background thread 

79 from ...database.session_passwords import session_password_store 

80 

81 user_password = None 

82 if session_id: 

83 user_password = session_password_store.get_session_password( 

84 username, session_id 

85 ) 

86 

87 search_config = {} 

88 evaluation_config = {} 

89 datasets_config = data.get("datasets_config", {}) 

90 

91 with get_user_db_session(username) as db_session: 

92 # Use the logged-in user's settings 

93 settings_manager = SettingsManager(db_session) 

94 

95 # Build search config from database settings 

96 search_config = { 

97 "iterations": int( 

98 settings_manager.get_setting("search.iterations", 8) 

99 ), 

100 "questions_per_iteration": int( 

101 settings_manager.get_setting( 

102 "search.questions_per_iteration", 5 

103 ) 

104 ), 

105 "search_tool": settings_manager.get_setting( 

106 "search.tool", "searxng" 

107 ), 

108 "search_strategy": settings_manager.get_setting( 

109 "search.search_strategy", "focused_iteration" 

110 ), 

111 "model_name": settings_manager.get_setting("llm.model"), 

112 "provider": settings_manager.get_setting("llm.provider"), 

113 "temperature": float( 

114 settings_manager.get_setting("llm.temperature", 0.7) 

115 ), 

116 } 

117 

118 # Add provider-specific settings 

119 provider = search_config.get("provider") 

120 if provider == "openai_endpoint": 

121 search_config["openai_endpoint_url"] = ( 

122 settings_manager.get_setting("llm.openai_endpoint.url") 

123 ) 

124 search_config["openai_endpoint_api_key"] = ( 

125 settings_manager.get_setting("llm.openai_endpoint.api_key") 

126 ) 

127 elif provider == "openai": 

128 search_config["openai_api_key"] = settings_manager.get_setting( 

129 "llm.openai.api_key" 

130 ) 

131 elif provider == "anthropic": 

132 search_config["anthropic_api_key"] = ( 

133 settings_manager.get_setting("llm.anthropic.api_key") 

134 ) 

135 

136 # Get evaluation config from database settings or request 

137 if "evaluation_config" in data: 

138 evaluation_config = data["evaluation_config"] 

139 else: 

140 # Read evaluation config from database settings 

141 evaluation_provider = settings_manager.get_setting( 

142 "benchmark.evaluation.provider", "openai_endpoint" 

143 ) 

144 evaluation_model = settings_manager.get_setting( 

145 "benchmark.evaluation.model", "anthropic/claude-3.7-sonnet" 

146 ) 

147 evaluation_temperature = float( 

148 settings_manager.get_setting( 

149 "benchmark.evaluation.temperature", 0 

150 ) 

151 ) 

152 

153 evaluation_config = { 

154 "provider": evaluation_provider, 

155 "model_name": evaluation_model, 

156 "temperature": evaluation_temperature, 

157 } 

158 

159 # Add provider-specific settings for evaluation 

160 if evaluation_provider == "openai_endpoint": 

161 evaluation_config["openai_endpoint_url"] = ( 

162 settings_manager.get_setting( 

163 "benchmark.evaluation.endpoint_url", 

164 "https://openrouter.ai/api/v1", 

165 ) 

166 ) 

167 evaluation_config["openai_endpoint_api_key"] = ( 

168 settings_manager.get_setting( 

169 "llm.openai_endpoint.api_key" 

170 ) 

171 ) 

172 elif evaluation_provider == "openai": 

173 evaluation_config["openai_api_key"] = ( 

174 settings_manager.get_setting("llm.openai.api_key") 

175 ) 

176 elif evaluation_provider == "anthropic": 

177 evaluation_config["anthropic_api_key"] = ( 

178 settings_manager.get_setting("llm.anthropic.api_key") 

179 ) 

180 

181 # Validate datasets config 

182 if not datasets_config or not any( 

183 config.get("count", 0) > 0 for config in datasets_config.values() 

184 ): 

185 return jsonify( 

186 { 

187 "error": "At least one dataset with count > 0 must be specified" 

188 } 

189 ), 400 

190 

191 # Create benchmark run 

192 benchmark_run_id = benchmark_service.create_benchmark_run( 

193 run_name=run_name, 

194 search_config=search_config, 

195 evaluation_config=evaluation_config, 

196 datasets_config=datasets_config, 

197 username=username, 

198 user_password=user_password, 

199 ) 

200 

201 # Start benchmark 

202 success = benchmark_service.start_benchmark( 

203 benchmark_run_id, username, user_password 

204 ) 

205 

206 if success: 

207 return jsonify( 

208 { 

209 "success": True, 

210 "benchmark_run_id": benchmark_run_id, 

211 "message": "Benchmark started successfully", 

212 } 

213 ) 

214 else: 

215 return jsonify( 

216 {"success": False, "error": "Failed to start benchmark"} 

217 ), 500 

218 

219 except Exception: 

220 logger.exception("Error starting benchmark") 

221 return jsonify( 

222 {"success": False, "error": "An internal error has occurred."} 

223 ), 500 

224 

225 
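# A minimal client sketch for this endpoint, assuming the dev server listens on
# http://localhost:5000 and that `cookies` holds an authenticated Flask session
# (login_required). Only the payload keys mirror the handler above; the URL,
# run name, and counts are illustrative:
#
#     import requests
#
#     payload = {
#         "run_name": "nightly-simpleqa",
#         "datasets_config": {"simpleqa": {"count": 10}, "browsecomp": {"count": 5}},
#         # "evaluation_config": {...},  # optional; otherwise read from settings
#     }
#     resp = requests.post(
#         "http://localhost:5000/benchmark/api/start", json=payload, cookies=cookies
#     )
#     resp.json()  # {"success": True, "benchmark_run_id": ..., "message": ...}
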

226@benchmark_bp.route("/api/running", methods=["GET"]) 

227@login_required 

228def get_running_benchmark(): 

229 """Check if there's a running benchmark and return its ID.""" 

230 try: 

231 from ...database.models.benchmark import BenchmarkRun, BenchmarkStatus 

232 from ...database.session_context import get_user_db_session 

233 from flask import session as flask_session 

234 

235 username = flask_session.get("username") 

236 with get_user_db_session(username) as session: 

237 # Find any benchmark that's currently running 

238 running_benchmark = ( 

239 session.query(BenchmarkRun) 

240 .filter(BenchmarkRun.status == BenchmarkStatus.IN_PROGRESS) 

241 .order_by(BenchmarkRun.created_at.desc()) 

242 .first() 

243 ) 

244 

245 if running_benchmark: 245 ↛ 246line 245 didn't jump to line 246 because the condition on line 245 was never true

246 return jsonify( 

247 { 

248 "success": True, 

249 "benchmark_run_id": running_benchmark.id, 

250 "run_name": running_benchmark.run_name, 

251 "total_examples": running_benchmark.total_examples, 

252 "completed_examples": running_benchmark.completed_examples, 

253 } 

254 ) 

255 else: 

256 return jsonify( 

257 {"success": False, "message": "No running benchmark found"} 

258 ) 

259 

260 except Exception: 

261 logger.exception("Error checking for running benchmark") 

262 return jsonify( 

263 {"success": False, "error": "An internal error has occurred."} 

264 ), 500 

265 

266 

267@benchmark_bp.route("/api/status/<int:benchmark_run_id>", methods=["GET"]) 

268@limiter.exempt 

269@login_required 

270def get_benchmark_status(benchmark_run_id: int): 

271 """Get status of a benchmark run.""" 

272 try: 

273 from flask import session as flask_session 

274 

275 username = flask_session.get("username") 

276 status = benchmark_service.get_benchmark_status( 

277 benchmark_run_id, username 

278 ) 

279 

280 if status: 

281 logger.info( 

282 f"Returning status for benchmark {benchmark_run_id}: " 

283 f"completed={status.get('completed_examples')}, " 

284 f"overall_acc={status.get('overall_accuracy')}, " 

285 f"avg_time={status.get('avg_time_per_example')}, " 

286 f"estimated_remaining={status.get('estimated_time_remaining')}" 

287 ) 

288 return jsonify({"success": True, "status": status}) 

289 else: 

290 return jsonify( 

291 {"success": False, "error": "Benchmark run not found"} 

292 ), 404 

293 

294 except Exception: 

295 logger.exception("Error getting benchmark status") 

296 return jsonify( 

297 {"success": False, "error": "An internal error has occurred."} 

298 ), 500 

299 

300 
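# A possible polling sketch against this endpoint (assumed base URL, cookie
# jar, and `requests` dependency as in the earlier example). Only the keys the
# handler logs are used; the full status shape depends on
# benchmark_service.get_benchmark_status():
#
#     def poll_status(run_id, cookies, base="http://localhost:5000"):
#         resp = requests.get(
#             f"{base}/benchmark/api/status/{run_id}", cookies=cookies
#         ).json()
#         status = resp["status"]
#         print(status.get("completed_examples"), status.get("overall_accuracy"))
#         return status
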

301@benchmark_bp.route("/api/cancel/<int:benchmark_run_id>", methods=["POST"]) 

302@login_required 

303def cancel_benchmark(benchmark_run_id: int): 

304 """Cancel a running benchmark.""" 

305 try: 

306 from flask import session as flask_session 

307 

308 username = flask_session.get("username") 

309 success = benchmark_service.cancel_benchmark(benchmark_run_id, username) 

310 

311 if success: 

312 return jsonify( 

313 {"success": True, "message": "Benchmark cancelled successfully"} 

314 ) 

315 else: 

316 return jsonify( 

317 {"success": False, "error": "Failed to cancel benchmark"} 

318 ), 500 

319 

320 except Exception: 

321 logger.exception("Error cancelling benchmark") 

322 return jsonify( 

323 {"success": False, "error": "An internal error has occurred."} 

324 ), 500 

325 

326 

327@benchmark_bp.route("/api/history", methods=["GET"]) 

328@login_required 

329def get_benchmark_history(): 

330 """Get list of recent benchmark runs.""" 

331 try: 

332 from ...database.models.benchmark import BenchmarkRun 

333 from ...database.session_context import get_user_db_session 

334 from flask import session as flask_session 

335 

336 username = flask_session.get("username") 

337 with get_user_db_session(username) as session: 

338 # Get all benchmark runs (completed, failed, cancelled, or in-progress) 

339 runs = ( 

340 session.query(BenchmarkRun) 

341 .order_by(BenchmarkRun.created_at.desc()) 

342 .limit(50) 

343 .all() 

344 ) 

345 

346 # Format runs for display 

347 formatted_runs = [] 

348 for run in runs: 348 ↛ 350line 348 didn't jump to line 350 because the loop on line 348 never started

349 # Calculate average processing time from results 

350 avg_processing_time = None 

351 avg_search_results = None 

352 try: 

353 from sqlalchemy import func 

354 

355 from ...database.models.benchmark import BenchmarkResult 

356 

357 avg_result = ( 

358 session.query(func.avg(BenchmarkResult.processing_time)) 

359 .filter( 

360 BenchmarkResult.benchmark_run_id == run.id, 

361 BenchmarkResult.processing_time.isnot(None), 

362 BenchmarkResult.processing_time > 0, 

363 ) 

364 .scalar() 

365 ) 

366 

367 if avg_result: 

368 avg_processing_time = float(avg_result) 

369 except Exception as e: 

370 logger.warning( 

371 f"Error calculating avg processing time for run {run.id}: {e}" 

372 ) 

373 

374 # Calculate average search results and total search requests from metrics 

375 total_search_requests = None 

376 try: 

377 from ...database.models import SearchCall 

378 from ...metrics.search_tracker import get_search_tracker 

379 

380 # Get all results for this run to find research_ids 

381 results = ( 

382 session.query(BenchmarkResult) 

383 .filter(BenchmarkResult.benchmark_run_id == run.id) 

384 .all() 

385 ) 

386 

387 research_ids = [ 

388 r.research_id for r in results if r.research_id 

389 ] 

390 

391 if research_ids: 

392 tracker = get_search_tracker() 

393 with tracker.db.get_session() as metric_session: 

394 # Get all search calls for these research_ids 

395 search_calls = ( 

396 metric_session.query(SearchCall) 

397 .filter( 

398 SearchCall.research_id.in_(research_ids) 

399 ) 

400 .all() 

401 ) 

402 

403 # Group by research_id and calculate metrics per research session 

404 research_results = {} 

405 research_requests = {} 

406 

407 for call in search_calls: 

408 if call.research_id: 

409 if call.research_id not in research_results: 

410 research_results[call.research_id] = 0 

411 research_requests[call.research_id] = 0 

412 research_results[call.research_id] += ( 

413 call.results_count or 0 

414 ) 

415 research_requests[call.research_id] += 1 

416 

417 # Calculate averages across research sessions 

418 if research_results: 

419 total_results = sum(research_results.values()) 

420 avg_search_results = total_results / len( 

421 research_results 

422 ) 

423 

424 total_requests = sum(research_requests.values()) 

425 total_search_requests = total_requests / len( 

426 research_requests 

427 ) 

428 

429 except Exception as e: 

430 logger.warning( 

431 f"Error calculating search metrics for run {run.id}: {e}" 

432 ) 

433 

434 formatted_runs.append( 

435 { 

436 "id": run.id, 

437 "run_name": run.run_name or f"Benchmark #{run.id}", 

438 "created_at": run.created_at.isoformat(), 

439 "total_examples": run.total_examples, 

440 "completed_examples": run.completed_examples, 

441 "overall_accuracy": run.overall_accuracy, 

442 "status": run.status.value, 

443 "search_config": run.search_config, 

444 "evaluation_config": run.evaluation_config, 

445 "datasets_config": run.datasets_config, 

446 "avg_processing_time": avg_processing_time, 

447 "avg_search_results": avg_search_results, 

448 "total_search_requests": total_search_requests, 

449 } 

450 ) 

451 

452 return jsonify({"success": True, "runs": formatted_runs}) 

453 

454 except Exception: 

455 logger.exception("Error getting benchmark history") 

456 return jsonify( 

457 {"success": False, "error": "An internal error has occurred."} 

458 ), 500 

459 

460 
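# Worked example of the per-session averaging above (illustrative numbers):
# two research sessions with 12 and 8 search results from 3 and 5 search calls
# give avg_search_results = (12 + 8) / 2 = 10.0 and, despite its name,
# total_search_requests = (3 + 5) / 2 = 4.0, i.e. requests per session on average.
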

461@benchmark_bp.route("/api/results/<int:benchmark_run_id>", methods=["GET"]) 

462@limiter.exempt 

463@login_required 

464def get_benchmark_results(benchmark_run_id: int): 

465 """Get detailed results for a benchmark run.""" 

466 try: 

467 from ...database.models.benchmark import BenchmarkResult 

468 from ...database.session_context import get_user_db_session 

469 from flask import session as flask_session 

470 

471 logger.info(f"Getting results for benchmark {benchmark_run_id}") 

472 username = flask_session.get("username") 

473 

474 # First sync any pending results from active runs 

475 benchmark_service.sync_pending_results(benchmark_run_id, username) 

476 with get_user_db_session(username) as session: 

477 # Get recent results (limit to last 10) 

478 limit = int(request.args.get("limit", 10)) 

479 

480 results = ( 

481 session.query(BenchmarkResult) 

482 .filter(BenchmarkResult.benchmark_run_id == benchmark_run_id) 

483 # Temporarily show all results including pending evaluations 

484 # .filter( 

485 # BenchmarkResult.is_correct.isnot(None) 

486 # ) # Only completed evaluations 

487 .order_by(BenchmarkResult.id.desc()) # Most recent first 

488 .limit(limit) 

489 .all() 

490 ) 

491 

492 logger.info(f"Found {len(results)} results") 

493 

494 # Build a map of research_id to total search results 

495 search_results_by_research_id = {} 

496 try: 

497 from ...database.models import SearchCall 

498 from ...metrics.search_tracker import get_search_tracker 

499 

500 tracker = get_search_tracker() 

501 

502 # Get all unique research_ids from our results 

503 research_ids = [r.research_id for r in results if r.research_id] 

504 

505 if research_ids: 

506 with tracker.db.get_session() as metric_session: 

507 # Get all search calls for these research_ids 

508 all_search_calls = ( 

509 metric_session.query(SearchCall) 

510 .filter(SearchCall.research_id.in_(research_ids)) 

511 .all() 

512 ) 

513 

514 # Group search results by research_id 

515 for call in all_search_calls: 

516 if call.research_id: 

517 if ( 

518 call.research_id 

519 not in search_results_by_research_id 

520 ): 

521 search_results_by_research_id[ 

522 call.research_id 

523 ] = 0 

524 search_results_by_research_id[ 

525 call.research_id 

526 ] += call.results_count or 0 

527 

528 logger.info( 

529 f"Found search metrics for {len(search_results_by_research_id)} research IDs from {len(all_search_calls)} total search calls" 

530 ) 

531 logger.debug( 

532 f"Research IDs from results: {research_ids[:5] if len(research_ids) > 5 else research_ids}" 

533 ) 

534 logger.debug( 

535 f"Search results by research_id: {dict(list(search_results_by_research_id.items())[:5])}" 

536 ) 

537 except Exception: 

538 logger.exception( 

539 f"Error getting search metrics for benchmark {benchmark_run_id}" 

540 ) 

541 

542 # Format results for UI display 

543 formatted_results = [] 

544 for result in results: 

545 # Get search result count using research_id 

546 search_result_count = 0 

547 

548 try: 

549 if ( 

550 result.research_id 

551 and result.research_id in search_results_by_research_id 

552 ): 

553 search_result_count = search_results_by_research_id[ 

554 result.research_id 

555 ] 

556 logger.debug( 

557 f"Found {search_result_count} search results for research_id {result.research_id}" 

558 ) 

559 

560 except Exception: 

561 logger.exception( 

562 f"Error getting search results for result {result.example_id}" 

563 ) 

564 

565 formatted_results.append( 

566 { 

567 "example_id": result.example_id, 

568 "dataset_type": result.dataset_type.value, 

569 "question": result.question, 

570 "correct_answer": result.correct_answer, 

571 "model_answer": result.extracted_answer, 

572 "full_response": result.response, 

573 "is_correct": result.is_correct, 

574 "confidence": result.confidence, 

575 "grader_response": result.grader_response, 

576 "processing_time": result.processing_time, 

577 "search_result_count": search_result_count, 

578 "sources": result.sources, 

579 "completed_at": result.completed_at.isoformat() 

580 if result.completed_at 

581 else None, 

582 } 

583 ) 

584 

585 return jsonify({"success": True, "results": formatted_results}) 

586 

587 except Exception: 

588 logger.exception("Error getting benchmark results") 

589 return jsonify( 

590 {"success": False, "error": "An internal error has occurred."} 

591 ), 500 

592 

593 
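# A minimal sketch of fetching the most recent rows from this endpoint
# (assumed base URL, cookie jar, and `requests` as above; the run id 42 is
# made up; "limit" caps how many rows come back, default 10):
#
#     resp = requests.get(
#         "http://localhost:5000/benchmark/api/results/42",
#         params={"limit": 25},
#         cookies=cookies,
#     )
#     for row in resp.json()["results"]:
#         print(row["example_id"], row["is_correct"], row["processing_time"])
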

594@benchmark_bp.route("/api/configs", methods=["GET"]) 

595@login_required 

596def get_saved_configs(): 

597 """Get list of saved benchmark configurations.""" 

598 try: 

599 # TODO: Implement saved configs retrieval from database 

600 # For now return default configs 

601 default_configs = [ 

602 { 

603 "id": 1, 

604 "name": "Quick Test", 

605 "description": "Fast benchmark with minimal examples", 

606 "search_config": { 

607 "iterations": 3, 

608 "questions_per_iteration": 3, 

609 "search_tool": "searxng", 

610 "search_strategy": "focused_iteration", 

611 }, 

612 "datasets_config": { 

613 "simpleqa": {"count": 10}, 

614 "browsecomp": {"count": 5}, 

615 }, 

616 }, 

617 { 

618 "id": 2, 

619 "name": "Standard Evaluation", 

620 "description": "Comprehensive benchmark with standard settings", 

621 "search_config": { 

622 "iterations": 8, 

623 "questions_per_iteration": 5, 

624 "search_tool": "searxng", 

625 "search_strategy": "focused_iteration", 

626 }, 

627 "datasets_config": { 

628 "simpleqa": {"count": 50}, 

629 "browsecomp": {"count": 25}, 

630 }, 

631 }, 

632 ] 

633 

634 return jsonify({"success": True, "configs": default_configs}) 

635 

636 except Exception: 

637 logger.exception("Error getting saved configs") 

638 return jsonify( 

639 {"success": False, "error": "An internal error has occurred."} 

640 ), 500 

641 

642 

643@benchmark_bp.route("/api/start-simple", methods=["POST"]) 

644@login_required 

645def start_benchmark_simple(): 

646 """Start a benchmark using current database settings.""" 

647 try: 

648 data = request.get_json() 

649 datasets_config = data.get("datasets_config", {}) 

650 

651 # Validate datasets 

652 if not datasets_config or not any( 652 ↛ 662line 652 didn't jump to line 662 because the condition on line 652 was always true

653 config.get("count", 0) > 0 for config in datasets_config.values() 

654 ): 

655 return jsonify( 

656 { 

657 "error": "At least one dataset with count > 0 must be specified" 

658 } 

659 ), 400 

660 

661 # Get current settings from database 

662 from flask import session as flask_session 

663 

664 username = flask_session.get("username") 

665 session_id = flask_session.get("session_id") 

666 

667 # Try to get password from session store for background thread 

668 from ...database.session_passwords import session_password_store 

669 

670 user_password = None 

671 if session_id: 

672 user_password = session_password_store.get_session_password( 

673 username, session_id 

674 ) 

675 

676 with get_user_db_session(username, user_password) as session: 

677 # For benchmarks, use a default test username 

678 settings_manager = SettingsManager(session, "benchmark_user") 

679 

680 # Build search config from database settings 

681 search_config = { 

682 "iterations": int( 

683 settings_manager.get_setting("search.iterations", 8) 

684 ), 

685 "questions_per_iteration": int( 

686 settings_manager.get_setting( 

687 "search.questions_per_iteration", 5 

688 ) 

689 ), 

690 "search_tool": settings_manager.get_setting( 

691 "search.tool", "searxng" 

692 ), 

693 "search_strategy": settings_manager.get_setting( 

694 "search.search_strategy", "focused_iteration" 

695 ), 

696 "model_name": settings_manager.get_setting("llm.model"), 

697 "provider": settings_manager.get_setting("llm.provider"), 

698 "temperature": float( 

699 settings_manager.get_setting("llm.temperature", 0.7) 

700 ), 

701 } 

702 

703 # Add provider-specific settings 

704 provider = search_config.get("provider") 

705 if provider == "openai_endpoint": 

706 search_config["openai_endpoint_url"] = ( 

707 settings_manager.get_setting("llm.openai_endpoint.url") 

708 ) 

709 search_config["openai_endpoint_api_key"] = ( 

710 settings_manager.get_setting("llm.openai_endpoint.api_key") 

711 ) 

712 elif provider == "openai": 

713 search_config["openai_api_key"] = settings_manager.get_setting( 

714 "llm.openai.api_key" 

715 ) 

716 elif provider == "anthropic": 

717 search_config["anthropic_api_key"] = ( 

718 settings_manager.get_setting("llm.anthropic.api_key") 

719 ) 

720 

721 # Read evaluation config from database settings 

722 evaluation_provider = settings_manager.get_setting( 

723 "benchmark.evaluation.provider", "openai_endpoint" 

724 ) 

725 evaluation_model = settings_manager.get_setting( 

726 "benchmark.evaluation.model", "anthropic/claude-3.7-sonnet" 

727 ) 

728 evaluation_temperature = float( 

729 settings_manager.get_setting( 

730 "benchmark.evaluation.temperature", 0 

731 ) 

732 ) 

733 

734 evaluation_config = { 

735 "provider": evaluation_provider, 

736 "model_name": evaluation_model, 

737 "temperature": evaluation_temperature, 

738 } 

739 

740 # Add provider-specific settings for evaluation 

741 if evaluation_provider == "openai_endpoint": 

742 evaluation_config["openai_endpoint_url"] = ( 

743 settings_manager.get_setting( 

744 "benchmark.evaluation.endpoint_url", 

745 "https://openrouter.ai/api/v1", 

746 ) 

747 ) 

748 evaluation_config["openai_endpoint_api_key"] = ( 

749 settings_manager.get_setting("llm.openai_endpoint.api_key") 

750 ) 

751 elif evaluation_provider == "openai": 

752 evaluation_config["openai_api_key"] = ( 

753 settings_manager.get_setting("llm.openai.api_key") 

754 ) 

755 elif evaluation_provider == "anthropic": 

756 evaluation_config["anthropic_api_key"] = ( 

757 settings_manager.get_setting("llm.anthropic.api_key") 

758 ) 

759 

760 # Create and start benchmark 

761 benchmark_run_id = benchmark_service.create_benchmark_run( 

762 run_name=f"Quick Benchmark - {data.get('run_name', '')}", 

763 search_config=search_config, 

764 evaluation_config=evaluation_config, 

765 datasets_config=datasets_config, 

766 username=username, 

767 user_password=user_password, 

768 ) 

769 

770 success = benchmark_service.start_benchmark( 

771 benchmark_run_id, username, user_password 

772 ) 

773 

774 if success: 

775 return jsonify( 

776 { 

777 "success": True, 

778 "benchmark_run_id": benchmark_run_id, 

779 "message": "Benchmark started with current settings", 

780 } 

781 ) 

782 else: 

783 return jsonify( 

784 {"success": False, "error": "Failed to start benchmark"} 

785 ), 500 

786 

787 except Exception: 

788 logger.exception("Error starting simple benchmark") 

789 return jsonify( 

790 {"success": False, "error": "An internal error has occurred."} 

791 ), 500 

792 

793 
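# A minimal sketch for this endpoint (assumed base URL, cookie jar, and
# `requests` as above); only run_name and datasets_config are read from the
# request body here, all other settings come from the database:
#
#     resp = requests.post(
#         "http://localhost:5000/benchmark/api/start-simple",
#         json={"run_name": "quick", "datasets_config": {"simpleqa": {"count": 5}}},
#         cookies=cookies,
#     )
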

794@benchmark_bp.route("/api/validate-config", methods=["POST"]) 

795@login_required 

796def validate_config(): 

797 """Validate a benchmark configuration.""" 

798 try: 

799 data = request.get_json() 

800 

801 if not data: 801 ↛ 802line 801 didn't jump to line 802 because the condition on line 801 was never true

802 return jsonify({"valid": False, "errors": ["No data provided"]}) 

803 

804 errors = [] 

805 

806 # Validate search config 

807 search_config = data.get("search_config", {}) 

808 if not search_config.get("search_tool"): 808 ↛ 810line 808 didn't jump to line 810 because the condition on line 808 was always true

809 errors.append("Search tool is required") 

810 if not search_config.get("search_strategy"): 810 ↛ 814line 810 didn't jump to line 814 because the condition on line 810 was always true

811 errors.append("Search strategy is required") 

812 

813 # Validate datasets config 

814 datasets_config = data.get("datasets_config", {}) 

815 if not datasets_config: 815 ↛ 818line 815 didn't jump to line 818 because the condition on line 815 was always true

816 errors.append("At least one dataset must be configured") 

817 

818 total_examples = sum( 

819 config.get("count", 0) for config in datasets_config.values() 

820 ) 

821 if total_examples == 0: 821 ↛ 824line 821 didn't jump to line 824 because the condition on line 821 was always true

822 errors.append("Total examples must be greater than 0") 

823 

824 if total_examples > 1000: 824 ↛ 825line 824 didn't jump to line 825 because the condition on line 824 was never true

825 errors.append( 

826 "Total examples should not exceed 1000 for web interface" 

827 ) 

828 

829 return jsonify( 

830 { 

831 "valid": len(errors) == 0, 

832 "errors": errors, 

833 "total_examples": total_examples, 

834 } 

835 ) 

836 

837 except Exception: 

838 logger.exception("Error validating config") 

839 return jsonify( 

840 {"valid": False, "errors": ["An internal error has occurred."]} 

841 ), 500 

842 

843 
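# A minimal sketch of a validation round-trip (assumed base URL, cookie jar,
# and `requests` as above); with this payload the handler reports valid=True
# and 15 total examples:
#
#     resp = requests.post(
#         "http://localhost:5000/benchmark/api/validate-config",
#         json={
#             "search_config": {
#                 "search_tool": "searxng",
#                 "search_strategy": "focused_iteration",
#             },
#             "datasets_config": {"simpleqa": {"count": 10}, "browsecomp": {"count": 5}},
#         },
#         cookies=cookies,
#     )
#     resp.json()  # {"valid": True, "errors": [], "total_examples": 15}
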

844@benchmark_bp.route("/api/search-quality", methods=["GET"]) 

845@limiter.exempt 

846@login_required 

847def get_search_quality(): 

848 """Get current search quality metrics from rate limiting tracker.""" 

849 try: 

850 from ...web_search_engines.rate_limiting import get_tracker 

851 

852 tracker = get_tracker() 

853 quality_stats = tracker.get_search_quality_stats() 

854 

855 return jsonify( 

856 { 

857 "success": True, 

858 "search_quality": quality_stats, 

859 "timestamp": time.time(), 

860 } 

861 ) 

862 

863 except Exception: 

864 logger.exception("Error getting search quality") 

865 return jsonify( 

866 {"success": False, "error": "An internal error has occurred."} 

867 ), 500 

868 

869 

870@benchmark_bp.route("/api/delete/<int:benchmark_run_id>", methods=["DELETE"]) 

871@login_required 

872def delete_benchmark_run(benchmark_run_id: int): 

873 """Delete a benchmark run and all its results.""" 

874 try: 

875 from ...database.models.benchmark import ( 

876 BenchmarkProgress, 

877 BenchmarkResult, 

878 BenchmarkRun, 

879 ) 

880 from ...database.session_context import get_user_db_session 

881 from flask import session as flask_session 

882 

883 username = flask_session.get("username") 

884 with get_user_db_session(username) as session: 

885 # Check if benchmark run exists 

886 benchmark_run = ( 

887 session.query(BenchmarkRun) 

888 .filter(BenchmarkRun.id == benchmark_run_id) 

889 .first() 

890 ) 

891 

892 if not benchmark_run: 

893 return jsonify( 

894 {"success": False, "error": "Benchmark run not found"} 

895 ), 404 

896 

897 # Prevent deletion of running benchmarks 

898 if benchmark_run.status.value == "in_progress": 

899 return jsonify( 

900 { 

901 "success": False, 

902 "error": "Cannot delete a running benchmark. Cancel it first.", 

903 } 

904 ), 400 

905 

906 # Delete related records (cascade should handle this, but being explicit) 

907 session.query(BenchmarkResult).filter( 

908 BenchmarkResult.benchmark_run_id == benchmark_run_id 

909 ).delete() 

910 

911 session.query(BenchmarkProgress).filter( 

912 BenchmarkProgress.benchmark_run_id == benchmark_run_id 

913 ).delete() 

914 

915 # Delete the benchmark run 

916 session.delete(benchmark_run) 

917 session.commit() 

918 

919 logger.info(f"Deleted benchmark run {benchmark_run_id}") 

920 return jsonify( 

921 { 

922 "success": True, 

923 "message": f"Benchmark run {benchmark_run_id} deleted successfully", 

924 } 

925 ) 

926 

927 except Exception: 

928 logger.exception(f"Error deleting benchmark run {benchmark_run_id}") 

929 return jsonify( 

930 {"success": False, "error": "An internal error has occurred."} 

931 ), 500
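

# A minimal sketch of deleting a finished run (assumed base URL, cookie jar,
# and `requests` as above; the run id is made up). Runs whose status is
# "in_progress" are rejected with HTTP 400 until they are cancelled:
#
#     resp = requests.delete(
#         "http://localhost:5000/benchmark/api/delete/42", cookies=cookies
#     )
#     resp.json()  # {"success": True, "message": "Benchmark run 42 deleted successfully"}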