Coverage for src / local_deep_research / benchmarks / web_api / benchmark_routes.py: 96%

352 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1"""Flask routes for benchmark web interface.""" 

2 

3import time 

4 

5from flask import Blueprint, jsonify, request 

6from loguru import logger 

7 

8from ...database.session_context import get_user_db_session 

9from ...security.decorators import require_json_body 

10from ...web.auth.decorators import login_required 

11from ...security.rate_limiter import limiter 

12from local_deep_research.settings import SettingsManager 

13from ...llm.providers.base import normalize_provider 

14from ...web.utils.templates import render_template_with_defaults 

15from .benchmark_service import benchmark_service 

16 

# Create blueprint for benchmark routes; every route below is mounted
# under the /benchmark URL prefix.
benchmark_bp = Blueprint("benchmark", __name__, url_prefix="/benchmark")

# NOTE: Routes use flask_session["username"] (not .get()) intentionally.
# @login_required guarantees the key exists; direct access fails fast
# if the decorator is ever removed.

23 

24 

@benchmark_bp.route("/")
@login_required
def index():
    """Benchmark dashboard page."""
    from flask import session as flask_session

    username = flask_session["username"]

    # Template context keys mapped to (settings key, default value).
    setting_specs = {
        "evaluation_provider": (
            "benchmark.evaluation.provider",
            "openai_endpoint",
        ),
        "evaluation_model": ("benchmark.evaluation.model", ""),
        "evaluation_endpoint_url": ("benchmark.evaluation.endpoint_url", ""),
        "evaluation_temperature": ("benchmark.evaluation.temperature", 0),
    }

    with get_user_db_session(username) as db_session:
        manager = SettingsManager(db_session)

        # Load evaluation settings from the user's database.
        eval_settings = {
            context_key: manager.get_setting(setting_key, default)
            for context_key, (setting_key, default) in setting_specs.items()
        }

    return render_template_with_defaults(
        "pages/benchmark.html", eval_settings=eval_settings
    )

54 

55 

@benchmark_bp.route("/results")
@login_required
def results():
    """Render the benchmark results history page."""
    template_name = "pages/benchmark_results.html"
    return render_template_with_defaults(template_name)

61 

62 

@benchmark_bp.route("/api/start", methods=["POST"])
@login_required
@require_json_body(error_message="No data provided")
def start_benchmark():
    """Start a new benchmark run.

    The search configuration is always built from the logged-in user's
    database settings; the request body only supplies ``run_name``,
    ``datasets_config`` and, optionally, ``evaluation_config`` (when
    absent, evaluation settings are also read from the database).

    Returns:
        JSON with ``benchmark_run_id`` on success; 400 when no dataset
        has a positive count; 500 when the run fails to start.
    """
    try:
        data = request.get_json()

        # Extract configuration
        run_name = data.get("run_name")

        # Get search config from database instead of request.
        # (get_user_db_session and SettingsManager are already imported
        # at module level; the previous function-local re-imports were
        # redundant and have been removed.)
        from flask import session as flask_session

        username = flask_session["username"]
        session_id = flask_session.get("session_id")

        # Try to get password from session store for background thread
        from ...database.session_passwords import session_password_store

        user_password = None
        if session_id:
            user_password = session_password_store.get_session_password(
                username, session_id
            )

        search_config = {}
        evaluation_config = {}
        datasets_config = data.get("datasets_config", {})

        with get_user_db_session(username) as db_session:
            # Use the logged-in user's settings
            settings_manager = SettingsManager(db_session)

            # Build search config from database settings
            search_config = {
                "iterations": int(
                    settings_manager.get_setting("search.iterations", 8)
                ),
                "questions_per_iteration": int(
                    settings_manager.get_setting(
                        "search.questions_per_iteration", 5
                    )
                ),
                "search_tool": settings_manager.get_setting(
                    "search.tool", "searxng"
                ),
                "search_strategy": settings_manager.get_setting(
                    "search.search_strategy", "focused_iteration"
                ),
                "model_name": settings_manager.get_setting("llm.model"),
                "provider": settings_manager.get_setting("llm.provider"),
                "temperature": float(
                    settings_manager.get_setting("llm.temperature", 0.7)
                ),
                "max_tokens": settings_manager.get_setting(
                    "llm.max_tokens", 30000
                ),
                "context_window_unrestricted": settings_manager.get_setting(
                    "llm.context_window_unrestricted", True
                ),
                "context_window_size": settings_manager.get_setting(
                    "llm.context_window_size", 128000
                ),
                "local_context_window_size": settings_manager.get_setting(
                    "llm.local_context_window_size", 8192
                ),
            }

            # Add provider-specific settings (endpoint URL / API keys)
            provider = normalize_provider(search_config.get("provider"))
            if provider == "openai_endpoint":
                search_config["openai_endpoint_url"] = (
                    settings_manager.get_setting("llm.openai_endpoint.url")
                )
                search_config["openai_endpoint_api_key"] = (
                    settings_manager.get_setting("llm.openai_endpoint.api_key")
                )
            elif provider == "openai":
                search_config["openai_api_key"] = settings_manager.get_setting(
                    "llm.openai.api_key"
                )
            elif provider == "anthropic":
                search_config["anthropic_api_key"] = (
                    settings_manager.get_setting("llm.anthropic.api_key")
                )

            # Get evaluation config from database settings or request
            if "evaluation_config" in data:
                evaluation_config = data["evaluation_config"]
            else:
                # Read evaluation config from database settings
                evaluation_provider = normalize_provider(
                    settings_manager.get_setting(
                        "benchmark.evaluation.provider", "openai_endpoint"
                    )
                )
                evaluation_model = settings_manager.get_setting(
                    "benchmark.evaluation.model", "anthropic/claude-3.7-sonnet"
                )
                evaluation_temperature = float(
                    settings_manager.get_setting(
                        "benchmark.evaluation.temperature", 0
                    )
                )

                evaluation_config = {
                    "provider": evaluation_provider,
                    "model_name": evaluation_model,
                    "temperature": evaluation_temperature,
                }

                # Add provider-specific settings for evaluation
                if evaluation_provider == "openai_endpoint":
                    evaluation_config["openai_endpoint_url"] = (
                        settings_manager.get_setting(
                            "benchmark.evaluation.endpoint_url",
                            "https://openrouter.ai/api/v1",
                        )
                    )
                    evaluation_config["openai_endpoint_api_key"] = (
                        settings_manager.get_setting(
                            "llm.openai_endpoint.api_key"
                        )
                    )
                elif evaluation_provider == "openai":
                    evaluation_config["openai_api_key"] = (
                        settings_manager.get_setting("llm.openai.api_key")
                    )
                elif evaluation_provider == "anthropic":
                    evaluation_config["anthropic_api_key"] = (
                        settings_manager.get_setting("llm.anthropic.api_key")
                    )

        # Validate datasets config: at least one dataset must request
        # a positive number of examples.
        if not datasets_config or not any(
            config.get("count", 0) > 0 for config in datasets_config.values()
        ):
            return jsonify(
                {
                    "error": "At least one dataset with count > 0 must be specified"
                }
            ), 400

        # Create benchmark run
        benchmark_run_id = benchmark_service.create_benchmark_run(
            run_name=run_name,
            search_config=search_config,
            evaluation_config=evaluation_config,
            datasets_config=datasets_config,
            username=username,
            user_password=user_password,
        )

        # Start benchmark (runs in a background thread; the password is
        # passed along so the thread can open the per-user database).
        success = benchmark_service.start_benchmark(
            benchmark_run_id, username, user_password
        )

        if success:
            return jsonify(
                {
                    "success": True,
                    "benchmark_run_id": benchmark_run_id,
                    "message": "Benchmark started successfully",
                }
            )
        return jsonify(
            {"success": False, "error": "Failed to start benchmark"}
        ), 500

    except Exception:
        logger.exception("Error starting benchmark")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500

241 

242 

@benchmark_bp.route("/api/running", methods=["GET"])
@login_required
def get_running_benchmark():
    """Check if there's a running benchmark and return its ID."""
    try:
        from ...database.models.benchmark import BenchmarkRun, BenchmarkStatus
        from ...database.session_context import get_user_db_session
        from flask import session as flask_session

        username = flask_session["username"]
        with get_user_db_session(username) as session:
            # Newest in-progress run wins, if any exist.
            active_run = (
                session.query(BenchmarkRun)
                .filter(BenchmarkRun.status == BenchmarkStatus.IN_PROGRESS)
                .order_by(BenchmarkRun.created_at.desc())
                .first()
            )

            if active_run is None:
                return jsonify(
                    {"success": False, "message": "No running benchmark found"}
                )

            payload = {
                "success": True,
                "benchmark_run_id": active_run.id,
                "run_name": active_run.run_name,
                "total_examples": active_run.total_examples,
                "completed_examples": active_run.completed_examples,
            }
            return jsonify(payload)

    except Exception:
        logger.exception("Error checking for running benchmark")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500

281 

282 

@benchmark_bp.route("/api/status/<int:benchmark_run_id>", methods=["GET"])
@limiter.exempt
@login_required
def get_benchmark_status(benchmark_run_id: int):
    """Get status of a benchmark run."""
    try:
        from flask import session as flask_session

        current_user = flask_session["username"]
        run_status = benchmark_service.get_benchmark_status(
            benchmark_run_id, current_user
        )

        # Unknown run id (or not visible to this user) -> 404.
        if not run_status:
            return jsonify(
                {"success": False, "error": "Benchmark run not found"}
            ), 404

        logger.info(
            f"Returning status for benchmark {benchmark_run_id}: "
            f"completed={run_status.get('completed_examples')}, "
            f"overall_acc={run_status.get('overall_accuracy')}, "
            f"avg_time={run_status.get('avg_time_per_example')}, "
            f"estimated_remaining={run_status.get('estimated_time_remaining')}"
        )
        return jsonify({"success": True, "status": run_status})

    except Exception:
        logger.exception("Error getting benchmark status")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500

314 

315 

@benchmark_bp.route("/api/cancel/<int:benchmark_run_id>", methods=["POST"])
@login_required
def cancel_benchmark(benchmark_run_id: int):
    """Cancel a running benchmark."""
    try:
        from flask import session as flask_session

        user = flask_session["username"]
        cancelled = benchmark_service.cancel_benchmark(benchmark_run_id, user)

        if not cancelled:
            return jsonify(
                {"success": False, "error": "Failed to cancel benchmark"}
            ), 500
        return jsonify(
            {"success": True, "message": "Benchmark cancelled successfully"}
        )

    except Exception:
        logger.exception("Error cancelling benchmark")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500

339 

340 

@benchmark_bp.route("/api/history", methods=["GET"])
@login_required
def get_benchmark_history():
    """Get list of recent benchmark runs.

    Returns up to 50 runs (any status), newest first, each annotated
    with average processing time and per-research-session search
    metrics when those can be computed.
    """
    try:
        # Imports hoisted out of the per-run loop below: they are loop
        # invariant, and BenchmarkResult is needed by both metric
        # calculations.
        from sqlalchemy import func

        from ...database.models import SearchCall
        from ...database.models.benchmark import BenchmarkResult, BenchmarkRun
        from ...database.session_context import get_user_db_session
        from flask import session as flask_session

        username = flask_session["username"]
        with get_user_db_session(username) as session:
            # Get all benchmark runs (completed, failed, cancelled, or in-progress)
            runs = (
                session.query(BenchmarkRun)
                .order_by(BenchmarkRun.created_at.desc())
                .limit(50)
                .all()
            )

            # Format runs for display
            formatted_runs = []
            for run in runs:
                # Calculate average processing time from results
                avg_processing_time = None
                avg_search_results = None
                try:
                    avg_result = (
                        session.query(
                            func.avg(BenchmarkResult.processing_time)
                        )
                        .filter(
                            BenchmarkResult.benchmark_run_id == run.id,
                            BenchmarkResult.processing_time.isnot(None),
                            BenchmarkResult.processing_time > 0,
                        )
                        .scalar()
                    )

                    if avg_result:
                        avg_processing_time = float(avg_result)
                except Exception:
                    logger.warning(
                        f"Error calculating avg processing time for run {run.id}"
                    )

                # Calculate average search results and total search requests from metrics
                total_search_requests = None
                try:
                    # Get all results for this run to find research_ids
                    results = (
                        session.query(BenchmarkResult)
                        .filter(BenchmarkResult.benchmark_run_id == run.id)
                        .all()
                    )

                    research_ids = [
                        r.research_id for r in results if r.research_id
                    ]

                    if research_ids:
                        # SearchCall is in the same per-user DB, query directly
                        search_calls = (
                            session.query(SearchCall)
                            .filter(SearchCall.research_id.in_(research_ids))
                            .all()
                        )

                        # Group by research_id and calculate metrics per research session
                        research_results = {}
                        research_requests = {}

                        for call in search_calls:
                            if call.research_id:
                                if call.research_id not in research_results:
                                    research_results[call.research_id] = 0
                                    research_requests[call.research_id] = 0
                                research_results[call.research_id] += (
                                    call.results_count or 0
                                )
                                research_requests[call.research_id] += 1

                        # Calculate averages across research sessions
                        if research_results:
                            total_results = sum(research_results.values())
                            avg_search_results = total_results / len(
                                research_results
                            )

                            # NOTE: despite its name, this value is the
                            # *average* number of search requests per
                            # research session (kept for payload
                            # compatibility with the UI).
                            total_requests = sum(research_requests.values())
                            total_search_requests = total_requests / len(
                                research_requests
                            )

                except Exception:
                    logger.warning(
                        f"Error calculating search metrics for run {run.id}"
                    )

                formatted_runs.append(
                    {
                        "id": run.id,
                        "run_name": run.run_name or f"Benchmark #{run.id}",
                        "created_at": run.created_at.isoformat(),
                        "total_examples": run.total_examples,
                        "completed_examples": run.completed_examples,
                        "overall_accuracy": run.overall_accuracy,
                        "status": run.status.value,
                        "search_config": run.search_config,
                        "evaluation_config": run.evaluation_config,
                        "datasets_config": run.datasets_config,
                        "avg_processing_time": avg_processing_time,
                        "avg_search_results": avg_search_results,
                        "total_search_requests": total_search_requests,
                    }
                )

        return jsonify({"success": True, "runs": formatted_runs})

    except Exception:
        logger.exception("Error getting benchmark history")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500

468 

469 

@benchmark_bp.route("/api/results/<int:benchmark_run_id>", methods=["GET"])
@limiter.exempt
@login_required
def get_benchmark_results(benchmark_run_id: int):
    """Get detailed results for a benchmark run.

    Syncs pending results first, then returns up to ``limit`` (query
    parameter, default 10) of the most recent results, each annotated
    with the number of search results gathered for its research session.
    """
    try:
        from ...database.models.benchmark import BenchmarkResult
        from ...database.session_context import get_user_db_session
        from flask import session as flask_session

        logger.info(f"Getting results for benchmark {benchmark_run_id}")
        username = flask_session["username"]

        # First sync any pending results from active runs
        benchmark_service.sync_pending_results(benchmark_run_id, username)
        with get_user_db_session(username) as session:
            # Get recent results (limit to last 10)
            # NOTE(review): a non-numeric ?limit= raises ValueError and is
            # reported as a 500 by the outer handler — confirm acceptable.
            limit = int(request.args.get("limit", 10))

            results = (
                session.query(BenchmarkResult)
                .filter(BenchmarkResult.benchmark_run_id == benchmark_run_id)
                .order_by(BenchmarkResult.id.desc())  # Most recent first
                .limit(limit)
                .all()
            )

            logger.info(f"Found {len(results)} results")

            # Build a map of research_id to total search results
            search_results_by_research_id = {}
            try:
                from ...database.models import SearchCall

                # Get all unique research_ids from our results
                research_ids = [r.research_id for r in results if r.research_id]

                if research_ids:
                    # SearchCall is in the same per-user DB, query directly
                    all_search_calls = (
                        session.query(SearchCall)
                        .filter(SearchCall.research_id.in_(research_ids))
                        .all()
                    )

                    # Group search results by research_id, summing
                    # results_count (treating NULL counts as 0).
                    for call in all_search_calls:
                        if call.research_id:
                            if (
                                call.research_id
                                not in search_results_by_research_id
                            ):
                                search_results_by_research_id[
                                    call.research_id
                                ] = 0
                            search_results_by_research_id[call.research_id] += (
                                call.results_count or 0
                            )

                    logger.info(
                        f"Found search metrics for {len(search_results_by_research_id)} research IDs from {len(all_search_calls)} total search calls"
                    )
                    logger.debug(
                        f"Research IDs from results: {research_ids[:5] if len(research_ids) > 5 else research_ids}"
                    )
                    logger.debug(
                        f"Search results by research_id: {dict(list(search_results_by_research_id.items())[:5])}"
                    )
            except Exception:
                # Metrics are best-effort; results are still returned
                # without search counts if this lookup fails.
                logger.exception(
                    f"Error getting search metrics for benchmark {benchmark_run_id}"
                )

            # Format results for UI display
            formatted_results = []
            for result in results:
                # Get search result count using research_id
                search_result_count = 0

                try:
                    if (
                        result.research_id
                        and result.research_id in search_results_by_research_id
                    ):
                        search_result_count = search_results_by_research_id[
                            result.research_id
                        ]
                        logger.debug(
                            f"Found {search_result_count} search results for research_id {result.research_id}"
                        )

                except Exception:
                    logger.exception(
                        f"Error getting search results for result {result.example_id}"
                    )

                formatted_results.append(
                    {
                        "example_id": result.example_id,
                        "dataset_type": result.dataset_type.value,
                        "question": result.question,
                        "correct_answer": result.correct_answer,
                        "model_answer": result.extracted_answer,
                        "full_response": result.response,
                        "is_correct": result.is_correct,
                        "confidence": result.confidence,
                        "grader_response": result.grader_response,
                        "processing_time": result.processing_time,
                        "search_result_count": search_result_count,
                        "sources": result.sources,
                        "completed_at": result.completed_at.isoformat()
                        if result.completed_at
                        else None,
                    }
                )

            return jsonify({"success": True, "results": formatted_results})

    except Exception:
        logger.exception("Error getting benchmark results")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500

593 

594 

@benchmark_bp.route(
    "/api/results/<int:benchmark_run_id>/export", methods=["GET"]
)
@login_required
def export_benchmark_results(benchmark_run_id: int):
    """Get lightweight results for YAML export (no full_response/sources/grader_response)."""
    try:
        from sqlalchemy.orm import load_only

        from ...database.models.benchmark import BenchmarkResult
        from ...database.session_context import get_user_db_session
        from flask import session as flask_session

        username = flask_session["username"]
        logger.info(
            "Exporting benchmark results for run {} by user {}",
            benchmark_run_id,
            username,
        )

        # Only the columns needed for export are loaded; the heavyweight
        # text columns (response, sources, grader_response) are skipped.
        export_columns = (
            BenchmarkResult.example_id,
            BenchmarkResult.dataset_type,
            BenchmarkResult.question,
            BenchmarkResult.correct_answer,
            BenchmarkResult.extracted_answer,
            BenchmarkResult.is_correct,
            BenchmarkResult.confidence,
            BenchmarkResult.processing_time,
            BenchmarkResult.completed_at,
        )

        with get_user_db_session(username) as session:
            rows = (
                session.query(BenchmarkResult)
                .options(load_only(*export_columns))
                .filter(BenchmarkResult.benchmark_run_id == benchmark_run_id)
                .order_by(BenchmarkResult.id.asc())
                .all()
            )

            formatted = [
                {
                    "example_id": row.example_id,
                    "dataset_type": row.dataset_type.value,
                    "question": row.question,
                    "correct_answer": row.correct_answer,
                    "model_answer": row.extracted_answer,
                    "is_correct": row.is_correct,
                    "confidence": row.confidence,
                    "processing_time": row.processing_time,
                    "completed_at": row.completed_at.isoformat()
                    if row.completed_at
                    else None,
                }
                for row in rows
            ]

        logger.info(
            "Exported {} results for benchmark run {}",
            len(formatted),
            benchmark_run_id,
        )
        return jsonify({"success": True, "results": formatted})

    except Exception:
        logger.exception("Error exporting benchmark results")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500

665 

666 

@benchmark_bp.route("/api/configs", methods=["GET"])
@login_required
def get_saved_configs():
    """Get list of saved benchmark configurations."""
    try:
        # TODO: Implement saved configs retrieval from database
        # For now return default configs
        quick_test = {
            "id": 1,
            "name": "Quick Test",
            "description": "Fast benchmark with minimal examples",
            "search_config": {
                "iterations": 3,
                "questions_per_iteration": 3,
                "search_tool": "searxng",
                "search_strategy": "focused_iteration",
            },
            "datasets_config": {
                "simpleqa": {"count": 10},
                "browsecomp": {"count": 5},
            },
        }
        standard_eval = {
            "id": 2,
            "name": "Standard Evaluation",
            "description": "Comprehensive benchmark with standard settings",
            "search_config": {
                "iterations": 8,
                "questions_per_iteration": 5,
                "search_tool": "searxng",
                "search_strategy": "focused_iteration",
            },
            "datasets_config": {
                "simpleqa": {"count": 50},
                "browsecomp": {"count": 25},
            },
        }

        return jsonify(
            {"success": True, "configs": [quick_test, standard_eval]}
        )

    except Exception:
        logger.exception("Error getting saved configs")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500

714 

715 

@benchmark_bp.route("/api/start-simple", methods=["POST"])
@login_required
@require_json_body()
def start_benchmark_simple():
    """Start a benchmark using current database settings.

    Unlike /api/start, the request body only carries ``run_name`` and
    ``datasets_config``; both the search and the evaluation
    configuration are always read from the logged-in user's settings.

    Returns:
        JSON with ``benchmark_run_id`` on success; 400 when no dataset
        has a positive count; 500 when the run fails to start.
    """
    try:
        data = request.get_json()
        datasets_config = data.get("datasets_config", {})

        # Validate datasets: at least one must request a positive count.
        if not datasets_config or not any(
            config.get("count", 0) > 0 for config in datasets_config.values()
        ):
            return jsonify(
                {
                    "error": "At least one dataset with count > 0 must be specified"
                }
            ), 400

        # Get current settings from database
        from flask import session as flask_session

        username = flask_session["username"]
        session_id = flask_session.get("session_id")

        # Try to get password from session store for background thread
        from ...database.session_passwords import session_password_store

        user_password = None
        if session_id:
            user_password = session_password_store.get_session_password(
                username, session_id
            )

        with get_user_db_session(username, user_password) as session:
            # Read the logged-in user's settings from their per-user DB.
            settings_manager = SettingsManager(session)

            # Build search config from database settings
            search_config = {
                "iterations": int(
                    settings_manager.get_setting("search.iterations", 8)
                ),
                "questions_per_iteration": int(
                    settings_manager.get_setting(
                        "search.questions_per_iteration", 5
                    )
                ),
                "search_tool": settings_manager.get_setting(
                    "search.tool", "searxng"
                ),
                "search_strategy": settings_manager.get_setting(
                    "search.search_strategy", "focused_iteration"
                ),
                "model_name": settings_manager.get_setting("llm.model"),
                "provider": settings_manager.get_setting("llm.provider"),
                "temperature": float(
                    settings_manager.get_setting("llm.temperature", 0.7)
                ),
                "max_tokens": settings_manager.get_setting(
                    "llm.max_tokens", 30000
                ),
                "context_window_unrestricted": settings_manager.get_setting(
                    "llm.context_window_unrestricted", True
                ),
                "context_window_size": settings_manager.get_setting(
                    "llm.context_window_size", 128000
                ),
                "local_context_window_size": settings_manager.get_setting(
                    "llm.local_context_window_size", 8192
                ),
            }

            # Add provider-specific settings (endpoint URL / API keys)
            provider = normalize_provider(search_config.get("provider"))
            if provider == "openai_endpoint":
                search_config["openai_endpoint_url"] = (
                    settings_manager.get_setting("llm.openai_endpoint.url")
                )
                search_config["openai_endpoint_api_key"] = (
                    settings_manager.get_setting("llm.openai_endpoint.api_key")
                )
            elif provider == "openai":
                search_config["openai_api_key"] = settings_manager.get_setting(
                    "llm.openai.api_key"
                )
            elif provider == "anthropic":
                search_config["anthropic_api_key"] = (
                    settings_manager.get_setting("llm.anthropic.api_key")
                )

            # Read evaluation config from database settings
            evaluation_provider = normalize_provider(
                settings_manager.get_setting(
                    "benchmark.evaluation.provider", "openai_endpoint"
                )
            )
            evaluation_model = settings_manager.get_setting(
                "benchmark.evaluation.model", "anthropic/claude-3.7-sonnet"
            )
            evaluation_temperature = float(
                settings_manager.get_setting(
                    "benchmark.evaluation.temperature", 0
                )
            )

            evaluation_config = {
                "provider": evaluation_provider,
                "model_name": evaluation_model,
                "temperature": evaluation_temperature,
            }

            # Add provider-specific settings for evaluation
            if evaluation_provider == "openai_endpoint":
                evaluation_config["openai_endpoint_url"] = (
                    settings_manager.get_setting(
                        "benchmark.evaluation.endpoint_url",
                        "https://openrouter.ai/api/v1",
                    )
                )
                evaluation_config["openai_endpoint_api_key"] = (
                    settings_manager.get_setting("llm.openai_endpoint.api_key")
                )
            elif evaluation_provider == "openai":
                evaluation_config["openai_api_key"] = (
                    settings_manager.get_setting("llm.openai.api_key")
                )
            elif evaluation_provider == "anthropic":
                evaluation_config["anthropic_api_key"] = (
                    settings_manager.get_setting("llm.anthropic.api_key")
                )

        # Create and start benchmark
        benchmark_run_id = benchmark_service.create_benchmark_run(
            run_name=f"Quick Benchmark - {data.get('run_name', '')}",
            search_config=search_config,
            evaluation_config=evaluation_config,
            datasets_config=datasets_config,
            username=username,
            user_password=user_password,
        )

        success = benchmark_service.start_benchmark(
            benchmark_run_id, username, user_password
        )

        if success:
            return jsonify(
                {
                    "success": True,
                    "benchmark_run_id": benchmark_run_id,
                    "message": "Benchmark started with current settings",
                }
            )
        return jsonify(
            {"success": False, "error": "Failed to start benchmark"}
        ), 500

    except Exception:
        logger.exception("Error starting simple benchmark")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500

879 

880 

@benchmark_bp.route("/api/validate-config", methods=["POST"])
@login_required
def validate_config():
    """Validate a benchmark configuration.

    Note: not using @require_json_body because this endpoint returns
    {"valid": False, "errors": [...]} which doesn't match the decorator's
    three standard error formats.
    """
    try:
        data = request.get_json()

        if not isinstance(data, dict):
            return jsonify({"valid": False, "errors": ["No data provided"]})

        errors = []

        # Search configuration must name both a tool and a strategy.
        search_config = data.get("search_config", {})
        for field, message in (
            ("search_tool", "Search tool is required"),
            ("search_strategy", "Search strategy is required"),
        ):
            if not search_config.get(field):
                errors.append(message)

        # Dataset configuration: at least one dataset, 1..1000 examples.
        datasets_config = data.get("datasets_config", {})
        if not datasets_config:
            errors.append("At least one dataset must be configured")

        total_examples = sum(
            config.get("count", 0) for config in datasets_config.values()
        )
        if total_examples == 0:
            errors.append("Total examples must be greater than 0")

        if total_examples > 1000:
            errors.append(
                "Total examples should not exceed 1000 for web interface"
            )

        return jsonify(
            {
                "valid": not errors,
                "errors": errors,
                "total_examples": total_examples,
            }
        )

    except Exception:
        logger.exception("Error validating config")
        return jsonify(
            {"valid": False, "errors": ["An internal error has occurred."]}
        ), 500

934 

935 

@benchmark_bp.route("/api/search-quality", methods=["GET"])
@limiter.exempt
@login_required
def get_search_quality():
    """Get current search quality metrics from rate limiting tracker."""
    try:
        from ...web_search_engines.rate_limiting import get_tracker

        stats = get_tracker().get_search_quality_stats()
        response_body = {
            "success": True,
            "search_quality": stats,
            "timestamp": time.time(),
        }
        return jsonify(response_body)

    except Exception:
        logger.exception("Error getting search quality")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500

960 

961 

@benchmark_bp.route("/api/delete/<int:benchmark_run_id>", methods=["DELETE"])
@login_required
def delete_benchmark_run(benchmark_run_id: int):
    """Delete a benchmark run and all its results."""
    try:
        from ...database.models.benchmark import (
            BenchmarkProgress,
            BenchmarkResult,
            BenchmarkRun,
        )
        from ...database.session_context import get_user_db_session
        from flask import session as flask_session

        username = flask_session["username"]
        with get_user_db_session(username) as session:
            # Look up the run first so we can distinguish 404 from 400.
            run = (
                session.query(BenchmarkRun)
                .filter(BenchmarkRun.id == benchmark_run_id)
                .first()
            )

            if run is None:
                return jsonify(
                    {"success": False, "error": "Benchmark run not found"}
                ), 404

            # Refuse to delete while the benchmark is still executing.
            if run.status.value == "in_progress":
                return jsonify(
                    {
                        "success": False,
                        "error": "Cannot delete a running benchmark. Cancel it first.",
                    }
                ), 400

            # Remove child rows explicitly (cascade would also cover
            # this, but being explicit keeps intent clear), then the run.
            session.query(BenchmarkResult).filter(
                BenchmarkResult.benchmark_run_id == benchmark_run_id
            ).delete()
            session.query(BenchmarkProgress).filter(
                BenchmarkProgress.benchmark_run_id == benchmark_run_id
            ).delete()
            session.delete(run)
            session.commit()

        logger.info(f"Deleted benchmark run {benchmark_run_id}")
        return jsonify(
            {
                "success": True,
                "message": f"Benchmark run {benchmark_run_id} deleted successfully",
            }
        )

    except Exception:
        logger.exception(f"Error deleting benchmark run {benchmark_run_id}")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500