Coverage for src / local_deep_research / benchmarks / web_api / benchmark_routes.py: 37%

356 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-02-25 01:07 +0000

1"""Flask routes for benchmark web interface.""" 

2 

3import time 

4 

5from flask import Blueprint, jsonify, request 

6from loguru import logger 

7 

8from ...database.session_context import get_user_db_session 

9from ...web.auth.decorators import login_required 

10from ...web.utils.rate_limiter import limiter 

11from local_deep_research.settings import SettingsManager 

12from ...web.utils.templates import render_template_with_defaults 

13from .benchmark_service import benchmark_service 

14 

# Create blueprint for benchmark routes.
# Every route defined below is registered under the /benchmark URL prefix.
benchmark_bp = Blueprint("benchmark", __name__, url_prefix="/benchmark")

17 

18 

@benchmark_bp.route("/")
@login_required
def index():
    """Render the benchmark dashboard, pre-filled with the user's stored
    evaluation-LLM settings.

    Reads the four ``benchmark.evaluation.*`` settings from the logged-in
    user's database and passes them to the template as ``eval_settings``.
    """
    from flask import session as flask_session

    current_user = flask_session.get("username")

    # (template key, settings path, fallback) triples for the grader config.
    setting_specs = (
        ("evaluation_provider", "benchmark.evaluation.provider", "openai_endpoint"),
        ("evaluation_model", "benchmark.evaluation.model", ""),
        ("evaluation_endpoint_url", "benchmark.evaluation.endpoint_url", ""),
        ("evaluation_temperature", "benchmark.evaluation.temperature", 0),
    )

    with get_user_db_session(current_user) as db_session:
        manager = SettingsManager(db_session)
        eval_settings = {
            key: manager.get_setting(path, fallback)
            for key, path, fallback in setting_specs
        }

    return render_template_with_defaults(
        "pages/benchmark.html", eval_settings=eval_settings
    )

48 

49 

@benchmark_bp.route("/results")
@login_required
def results():
    """Render the benchmark results history page (static template; data is
    fetched client-side via the /api/history endpoint)."""
    return render_template_with_defaults("pages/benchmark_results.html")

55 

56 

def _build_search_config(settings_manager) -> dict:
    """Assemble the research/search configuration from the user's stored settings.

    Also attaches the provider-specific API credentials for the configured
    LLM provider so the background benchmark thread can authenticate
    without access to the Flask session.
    """
    search_config = {
        "iterations": int(
            settings_manager.get_setting("search.iterations", 8)
        ),
        "questions_per_iteration": int(
            settings_manager.get_setting("search.questions_per_iteration", 5)
        ),
        "search_tool": settings_manager.get_setting("search.tool", "searxng"),
        "search_strategy": settings_manager.get_setting(
            "search.search_strategy", "focused_iteration"
        ),
        "model_name": settings_manager.get_setting("llm.model"),
        "provider": settings_manager.get_setting("llm.provider"),
        "temperature": float(
            settings_manager.get_setting("llm.temperature", 0.7)
        ),
        "max_tokens": settings_manager.get_setting("llm.max_tokens", 30000),
        "context_window_unrestricted": settings_manager.get_setting(
            "llm.context_window_unrestricted", True
        ),
        "context_window_size": settings_manager.get_setting(
            "llm.context_window_size", 128000
        ),
        "local_context_window_size": settings_manager.get_setting(
            "llm.local_context_window_size", 4096
        ),
    }

    # Provider-specific credentials/endpoints.
    provider = search_config.get("provider")
    if provider == "openai_endpoint":
        search_config["openai_endpoint_url"] = settings_manager.get_setting(
            "llm.openai_endpoint.url"
        )
        search_config["openai_endpoint_api_key"] = settings_manager.get_setting(
            "llm.openai_endpoint.api_key"
        )
    elif provider == "openai":
        search_config["openai_api_key"] = settings_manager.get_setting(
            "llm.openai.api_key"
        )
    elif provider == "anthropic":
        search_config["anthropic_api_key"] = settings_manager.get_setting(
            "llm.anthropic.api_key"
        )

    return search_config


def _build_evaluation_config(settings_manager) -> dict:
    """Assemble the grader (evaluation LLM) configuration from stored settings,
    including provider-specific credentials."""
    evaluation_provider = settings_manager.get_setting(
        "benchmark.evaluation.provider", "openai_endpoint"
    )
    evaluation_config = {
        "provider": evaluation_provider,
        "model_name": settings_manager.get_setting(
            "benchmark.evaluation.model", "anthropic/claude-3.7-sonnet"
        ),
        "temperature": float(
            settings_manager.get_setting("benchmark.evaluation.temperature", 0)
        ),
    }

    if evaluation_provider == "openai_endpoint":
        evaluation_config["openai_endpoint_url"] = settings_manager.get_setting(
            "benchmark.evaluation.endpoint_url",
            "https://openrouter.ai/api/v1",
        )
        evaluation_config["openai_endpoint_api_key"] = (
            settings_manager.get_setting("llm.openai_endpoint.api_key")
        )
    elif evaluation_provider == "openai":
        evaluation_config["openai_api_key"] = settings_manager.get_setting(
            "llm.openai.api_key"
        )
    elif evaluation_provider == "anthropic":
        evaluation_config["anthropic_api_key"] = settings_manager.get_setting(
            "llm.anthropic.api_key"
        )

    return evaluation_config


@benchmark_bp.route("/api/start", methods=["POST"])
@login_required
def start_benchmark():
    """Start a new benchmark run.

    Expects a JSON body with an optional ``run_name``, an optional
    ``evaluation_config`` override, and a ``datasets_config`` mapping of
    dataset name -> {"count": n}.  Search/LLM settings are read from the
    logged-in user's settings database, not from the request.

    Returns 400 on missing/invalid input, 500 on internal failure.
    """
    try:
        data = request.get_json()

        if not data:
            return jsonify({"error": "No data provided"}), 400

        run_name = data.get("run_name")
        datasets_config = data.get("datasets_config", {})

        # Validate datasets first so bad requests fail fast, before any
        # settings-database work is done.
        if not datasets_config or not any(
            config.get("count", 0) > 0 for config in datasets_config.values()
        ):
            return jsonify(
                {
                    "error": "At least one dataset with count > 0 must be specified"
                }
            ), 400

        from flask import session as flask_session

        # Password from the session store lets the background thread open
        # the user's encrypted database after this request ends.
        from ...database.session_passwords import session_password_store

        username = flask_session.get("username")
        session_id = flask_session.get("session_id")

        user_password = None
        if session_id:
            user_password = session_password_store.get_session_password(
                username, session_id
            )

        # get_user_db_session / SettingsManager are module-level imports;
        # the previous duplicate function-level imports were removed.
        with get_user_db_session(username) as db_session:
            settings_manager = SettingsManager(db_session)

            search_config = _build_search_config(settings_manager)

            # An explicit evaluation_config in the request wins over the
            # stored settings.
            if "evaluation_config" in data:
                evaluation_config = data["evaluation_config"]
            else:
                evaluation_config = _build_evaluation_config(settings_manager)

        # Create benchmark run
        benchmark_run_id = benchmark_service.create_benchmark_run(
            run_name=run_name,
            search_config=search_config,
            evaluation_config=evaluation_config,
            datasets_config=datasets_config,
            username=username,
            user_password=user_password,
        )

        # Start benchmark
        success = benchmark_service.start_benchmark(
            benchmark_run_id, username, user_password
        )

        if success:
            return jsonify(
                {
                    "success": True,
                    "benchmark_run_id": benchmark_run_id,
                    "message": "Benchmark started successfully",
                }
            )
        else:
            return jsonify(
                {"success": False, "error": "Failed to start benchmark"}
            ), 500

    except Exception:
        logger.exception("Error starting benchmark")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500

236 

237 

@benchmark_bp.route("/api/running", methods=["GET"])
@login_required
def get_running_benchmark():
    """Report the most recent in-progress benchmark run, if any.

    Returns ``success: True`` with the run's id/name/progress counters when
    one is running, or ``success: False`` with a message otherwise.
    """
    try:
        from ...database.models.benchmark import BenchmarkRun, BenchmarkStatus
        from ...database.session_context import get_user_db_session
        from flask import session as flask_session

        current_user = flask_session.get("username")
        with get_user_db_session(current_user) as db_session:
            # If several runs are somehow in progress, report the newest.
            active_run = (
                db_session.query(BenchmarkRun)
                .filter(BenchmarkRun.status == BenchmarkStatus.IN_PROGRESS)
                .order_by(BenchmarkRun.created_at.desc())
                .first()
            )

            if active_run is None:
                return jsonify(
                    {"success": False, "message": "No running benchmark found"}
                )

            return jsonify(
                {
                    "success": True,
                    "benchmark_run_id": active_run.id,
                    "run_name": active_run.run_name,
                    "total_examples": active_run.total_examples,
                    "completed_examples": active_run.completed_examples,
                }
            )

    except Exception:
        logger.exception("Error checking for running benchmark")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500

277 

278 

@benchmark_bp.route("/api/status/<int:benchmark_run_id>", methods=["GET"])
@limiter.exempt
@login_required
def get_benchmark_status(benchmark_run_id: int):
    """Return live progress/status for one benchmark run.

    Exempt from rate limiting because the UI polls this endpoint.
    Responds 404 when the run id is unknown to the service.
    """
    try:
        from flask import session as flask_session

        current_user = flask_session.get("username")
        status = benchmark_service.get_benchmark_status(
            benchmark_run_id, current_user
        )

        # Guard clause: unknown run -> 404.
        if not status:
            return jsonify(
                {"success": False, "error": "Benchmark run not found"}
            ), 404

        logger.info(
            f"Returning status for benchmark {benchmark_run_id}: "
            f"completed={status.get('completed_examples')}, "
            f"overall_acc={status.get('overall_accuracy')}, "
            f"avg_time={status.get('avg_time_per_example')}, "
            f"estimated_remaining={status.get('estimated_time_remaining')}"
        )
        return jsonify({"success": True, "status": status})

    except Exception:
        logger.exception("Error getting benchmark status")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500

311 

312 

@benchmark_bp.route("/api/cancel/<int:benchmark_run_id>", methods=["POST"])
@login_required
def cancel_benchmark(benchmark_run_id: int):
    """Request cancellation of a running benchmark via the benchmark service."""
    try:
        from flask import session as flask_session

        current_user = flask_session.get("username")
        cancelled = benchmark_service.cancel_benchmark(
            benchmark_run_id, current_user
        )

        # Guard clause: service refused or run not cancellable.
        if not cancelled:
            return jsonify(
                {"success": False, "error": "Failed to cancel benchmark"}
            ), 500

        return jsonify(
            {"success": True, "message": "Benchmark cancelled successfully"}
        )

    except Exception:
        logger.exception("Error cancelling benchmark")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500

337 

338 

@benchmark_bp.route("/api/history", methods=["GET"])
@login_required
def get_benchmark_history():
    """Get list of recent benchmark runs.

    Returns up to 50 runs (any status), newest first, each annotated with
    per-run aggregates: average processing time from BenchmarkResult rows,
    plus search metrics pulled from the separate metrics database via the
    search tracker.  Aggregate failures are logged and degrade to None
    rather than failing the whole request.
    """
    try:
        from ...database.models.benchmark import BenchmarkRun
        from ...database.session_context import get_user_db_session
        from flask import session as flask_session

        username = flask_session.get("username")
        with get_user_db_session(username) as session:
            # Get all benchmark runs (completed, failed, cancelled, or in-progress)
            runs = (
                session.query(BenchmarkRun)
                .order_by(BenchmarkRun.created_at.desc())
                .limit(50)
                .all()
            )

            # Format runs for display
            formatted_runs = []
            for run in runs:
                # Calculate average processing time from results.
                # NULL/zero processing times are excluded from the average.
                avg_processing_time = None
                avg_search_results = None
                try:
                    from sqlalchemy import func

                    from ...database.models.benchmark import BenchmarkResult

                    avg_result = (
                        session.query(func.avg(BenchmarkResult.processing_time))
                        .filter(
                            BenchmarkResult.benchmark_run_id == run.id,
                            BenchmarkResult.processing_time.isnot(None),
                            BenchmarkResult.processing_time > 0,
                        )
                        .scalar()
                    )

                    if avg_result:
                        avg_processing_time = float(avg_result)
                except Exception as e:
                    logger.warning(
                        f"Error calculating avg processing time for run {run.id}: {e}"
                    )

                # Calculate average search results and total search requests from metrics
                total_search_requests = None
                try:
                    from ...database.models import SearchCall
                    from ...metrics.search_tracker import get_search_tracker

                    # Get all results for this run to find research_ids;
                    # search metrics live in a separate metrics database
                    # keyed by research_id.
                    results = (
                        session.query(BenchmarkResult)
                        .filter(BenchmarkResult.benchmark_run_id == run.id)
                        .all()
                    )

                    research_ids = [
                        r.research_id for r in results if r.research_id
                    ]

                    if research_ids:
                        tracker = get_search_tracker()
                        with tracker.db.get_session() as metric_session:
                            # Get all search calls for these research_ids
                            search_calls = (
                                metric_session.query(SearchCall)
                                .filter(
                                    SearchCall.research_id.in_(research_ids)
                                )
                                .all()
                            )

                            # Group by research_id and calculate metrics per research session
                            research_results = {}
                            research_requests = {}

                            for call in search_calls:
                                if call.research_id:
                                    if call.research_id not in research_results:
                                        research_results[call.research_id] = 0
                                        research_requests[call.research_id] = 0
                                    research_results[call.research_id] += (
                                        call.results_count or 0
                                    )
                                    research_requests[call.research_id] += 1

                            # Calculate averages across research sessions.
                            # NOTE(review): despite its name,
                            # total_search_requests is the *average* number
                            # of search requests per research session, not a
                            # total — the JSON key is kept for UI
                            # compatibility; confirm before renaming.
                            if research_results:
                                total_results = sum(research_results.values())
                                avg_search_results = total_results / len(
                                    research_results
                                )

                                total_requests = sum(research_requests.values())
                                total_search_requests = total_requests / len(
                                    research_requests
                                )

                except Exception as e:
                    logger.warning(
                        f"Error calculating search metrics for run {run.id}: {e}"
                    )

                formatted_runs.append(
                    {
                        "id": run.id,
                        "run_name": run.run_name or f"Benchmark #{run.id}",
                        "created_at": run.created_at.isoformat(),
                        "total_examples": run.total_examples,
                        "completed_examples": run.completed_examples,
                        "overall_accuracy": run.overall_accuracy,
                        "status": run.status.value,
                        "search_config": run.search_config,
                        "evaluation_config": run.evaluation_config,
                        "datasets_config": run.datasets_config,
                        "avg_processing_time": avg_processing_time,
                        "avg_search_results": avg_search_results,
                        "total_search_requests": total_search_requests,
                    }
                )

            return jsonify({"success": True, "runs": formatted_runs})

    except Exception:
        logger.exception("Error getting benchmark history")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500

471 

472 

@benchmark_bp.route("/api/results/<int:benchmark_run_id>", methods=["GET"])
@limiter.exempt
@login_required
def get_benchmark_results(benchmark_run_id: int):
    """Get detailed results for a benchmark run.

    Syncs pending results first, then returns the most recent results
    (default limit 10, overridable via the ``limit`` query arg), each
    enriched with the total search-result count for its research session
    from the metrics database.  Rate-limit exempt because the UI polls it.
    """
    try:
        from ...database.models.benchmark import BenchmarkResult
        from ...database.session_context import get_user_db_session
        from flask import session as flask_session

        logger.info(f"Getting results for benchmark {benchmark_run_id}")
        username = flask_session.get("username")

        # First sync any pending results from active runs
        benchmark_service.sync_pending_results(benchmark_run_id, username)
        with get_user_db_session(username) as session:
            # Get recent results (limit to last 10)
            limit = int(request.args.get("limit", 10))

            results = (
                session.query(BenchmarkResult)
                .filter(BenchmarkResult.benchmark_run_id == benchmark_run_id)
                # Temporarily show all results including pending evaluations
                # .filter(
                #     BenchmarkResult.is_correct.isnot(None)
                # )  # Only completed evaluations
                .order_by(BenchmarkResult.id.desc())  # Most recent first
                .limit(limit)
                .all()
            )

            logger.info(f"Found {len(results)} results")

            # Build a map of research_id to total search results.
            # Best-effort: failures here only lose the search counts.
            search_results_by_research_id = {}
            try:
                from ...database.models import SearchCall
                from ...metrics.search_tracker import get_search_tracker

                tracker = get_search_tracker()

                # Get all unique research_ids from our results
                research_ids = [r.research_id for r in results if r.research_id]

                if research_ids:
                    with tracker.db.get_session() as metric_session:
                        # Get all search calls for these research_ids
                        all_search_calls = (
                            metric_session.query(SearchCall)
                            .filter(SearchCall.research_id.in_(research_ids))
                            .all()
                        )

                        # Group search results by research_id, summing the
                        # per-call result counts (NULL counts treated as 0).
                        for call in all_search_calls:
                            if call.research_id:
                                if (
                                    call.research_id
                                    not in search_results_by_research_id
                                ):
                                    search_results_by_research_id[
                                        call.research_id
                                    ] = 0
                                search_results_by_research_id[
                                    call.research_id
                                ] += call.results_count or 0

                        logger.info(
                            f"Found search metrics for {len(search_results_by_research_id)} research IDs from {len(all_search_calls)} total search calls"
                        )
                        logger.debug(
                            f"Research IDs from results: {research_ids[:5] if len(research_ids) > 5 else research_ids}"
                        )
                        logger.debug(
                            f"Search results by research_id: {dict(list(search_results_by_research_id.items())[:5])}"
                        )
            except Exception:
                logger.exception(
                    f"Error getting search metrics for benchmark {benchmark_run_id}"
                )

            # Format results for UI display
            formatted_results = []
            for result in results:
                # Get search result count using research_id; defaults to 0
                # when no metrics were found for this session.
                search_result_count = 0

                try:
                    if (
                        result.research_id
                        and result.research_id in search_results_by_research_id
                    ):
                        search_result_count = search_results_by_research_id[
                            result.research_id
                        ]
                        logger.debug(
                            f"Found {search_result_count} search results for research_id {result.research_id}"
                        )

                except Exception:
                    logger.exception(
                        f"Error getting search results for result {result.example_id}"
                    )

                formatted_results.append(
                    {
                        "example_id": result.example_id,
                        "dataset_type": result.dataset_type.value,
                        "question": result.question,
                        "correct_answer": result.correct_answer,
                        "model_answer": result.extracted_answer,
                        "full_response": result.response,
                        "is_correct": result.is_correct,
                        "confidence": result.confidence,
                        "grader_response": result.grader_response,
                        "processing_time": result.processing_time,
                        "search_result_count": search_result_count,
                        "sources": result.sources,
                        "completed_at": result.completed_at.isoformat()
                        if result.completed_at
                        else None,
                    }
                )

            return jsonify({"success": True, "results": formatted_results})

    except Exception:
        logger.exception("Error getting benchmark results")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500

604 

605 

@benchmark_bp.route(
    "/api/results/<int:benchmark_run_id>/export", methods=["GET"]
)
@login_required
def export_benchmark_results(benchmark_run_id: int):
    """Get lightweight results for YAML export (no full_response/sources/grader_response).

    Loads only the slim columns needed for export, ordered oldest-first.
    """
    try:
        from sqlalchemy.orm import load_only

        from ...database.models.benchmark import BenchmarkResult
        from ...database.session_context import get_user_db_session
        from flask import session as flask_session

        username = flask_session.get("username")
        logger.info(
            "Exporting benchmark results for run {} by user {}",
            benchmark_run_id,
            username,
        )

        # Restrict loaded columns so the heavyweight text fields
        # (full response, sources, grader output) are never read.
        slim_columns = load_only(
            BenchmarkResult.example_id,
            BenchmarkResult.dataset_type,
            BenchmarkResult.question,
            BenchmarkResult.correct_answer,
            BenchmarkResult.extracted_answer,
            BenchmarkResult.is_correct,
            BenchmarkResult.confidence,
            BenchmarkResult.processing_time,
            BenchmarkResult.completed_at,
        )

        with get_user_db_session(username) as session:
            rows = (
                session.query(BenchmarkResult)
                .options(slim_columns)
                .filter(BenchmarkResult.benchmark_run_id == benchmark_run_id)
                .order_by(BenchmarkResult.id.asc())
                .all()
            )

            formatted = [
                {
                    "example_id": row.example_id,
                    "dataset_type": row.dataset_type.value,
                    "question": row.question,
                    "correct_answer": row.correct_answer,
                    "model_answer": row.extracted_answer,
                    "is_correct": row.is_correct,
                    "confidence": row.confidence,
                    "processing_time": row.processing_time,
                    "completed_at": row.completed_at.isoformat()
                    if row.completed_at
                    else None,
                }
                for row in rows
            ]

            logger.info(
                "Exported {} results for benchmark run {}",
                len(formatted),
                benchmark_run_id,
            )
            return jsonify({"success": True, "results": formatted})

    except Exception:
        logger.exception("Error exporting benchmark results")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500

676 

677 

@benchmark_bp.route("/api/configs", methods=["GET"])
@login_required
def get_saved_configs():
    """Get list of saved benchmark configurations.

    Currently serves two hard-coded presets; database-backed saved configs
    are not implemented yet.
    """
    try:
        # TODO: Implement saved configs retrieval from database
        quick_test = {
            "id": 1,
            "name": "Quick Test",
            "description": "Fast benchmark with minimal examples",
            "search_config": {
                "iterations": 3,
                "questions_per_iteration": 3,
                "search_tool": "searxng",
                "search_strategy": "focused_iteration",
            },
            "datasets_config": {
                "simpleqa": {"count": 10},
                "browsecomp": {"count": 5},
            },
        }

        standard_evaluation = {
            "id": 2,
            "name": "Standard Evaluation",
            "description": "Comprehensive benchmark with standard settings",
            "search_config": {
                "iterations": 8,
                "questions_per_iteration": 5,
                "search_tool": "searxng",
                "search_strategy": "focused_iteration",
            },
            "datasets_config": {
                "simpleqa": {"count": 50},
                "browsecomp": {"count": 25},
            },
        }

        return jsonify(
            {"success": True, "configs": [quick_test, standard_evaluation]}
        )

    except Exception:
        logger.exception("Error getting saved configs")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500

725 

726 

@benchmark_bp.route("/api/start-simple", methods=["POST"])
@login_required
def start_benchmark_simple():
    """Start a benchmark using current database settings.

    Like /api/start but takes only ``datasets_config`` (and an optional
    ``run_name``) from the request; all search/LLM and evaluation settings
    are read from the logged-in user's settings database.
    """
    try:
        data = request.get_json()
        datasets_config = data.get("datasets_config", {})

        # Validate datasets before doing any settings-database work.
        if not datasets_config or not any(
            config.get("count", 0) > 0 for config in datasets_config.values()
        ):
            return jsonify(
                {
                    "error": "At least one dataset with count > 0 must be specified"
                }
            ), 400

        # Get current settings from database
        from flask import session as flask_session

        username = flask_session.get("username")
        session_id = flask_session.get("session_id")

        # Try to get password from session store for background thread
        from ...database.session_passwords import session_password_store

        user_password = None
        if session_id:
            user_password = session_password_store.get_session_password(
                username, session_id
            )

        with get_user_db_session(username, user_password) as session:
            # Read the logged-in user's settings (password enables decryption).
            settings_manager = SettingsManager(session)

            # Build search config from database settings
            search_config = {
                "iterations": int(
                    settings_manager.get_setting("search.iterations", 8)
                ),
                "questions_per_iteration": int(
                    settings_manager.get_setting(
                        "search.questions_per_iteration", 5
                    )
                ),
                "search_tool": settings_manager.get_setting(
                    "search.tool", "searxng"
                ),
                "search_strategy": settings_manager.get_setting(
                    "search.search_strategy", "focused_iteration"
                ),
                "model_name": settings_manager.get_setting("llm.model"),
                "provider": settings_manager.get_setting("llm.provider"),
                "temperature": float(
                    settings_manager.get_setting("llm.temperature", 0.7)
                ),
                "max_tokens": settings_manager.get_setting(
                    "llm.max_tokens", 30000
                ),
                "context_window_unrestricted": settings_manager.get_setting(
                    "llm.context_window_unrestricted", True
                ),
                "context_window_size": settings_manager.get_setting(
                    "llm.context_window_size", 128000
                ),
                "local_context_window_size": settings_manager.get_setting(
                    "llm.local_context_window_size", 4096
                ),
            }

            # Add provider-specific settings (API credentials/endpoints)
            provider = search_config.get("provider")
            if provider == "openai_endpoint":
                search_config["openai_endpoint_url"] = (
                    settings_manager.get_setting("llm.openai_endpoint.url")
                )
                search_config["openai_endpoint_api_key"] = (
                    settings_manager.get_setting("llm.openai_endpoint.api_key")
                )
            elif provider == "openai":
                search_config["openai_api_key"] = settings_manager.get_setting(
                    "llm.openai.api_key"
                )
            elif provider == "anthropic":
                search_config["anthropic_api_key"] = (
                    settings_manager.get_setting("llm.anthropic.api_key")
                )

            # Read evaluation (grader LLM) config from database settings
            evaluation_provider = settings_manager.get_setting(
                "benchmark.evaluation.provider", "openai_endpoint"
            )
            evaluation_model = settings_manager.get_setting(
                "benchmark.evaluation.model", "anthropic/claude-3.7-sonnet"
            )
            evaluation_temperature = float(
                settings_manager.get_setting(
                    "benchmark.evaluation.temperature", 0
                )
            )

            evaluation_config = {
                "provider": evaluation_provider,
                "model_name": evaluation_model,
                "temperature": evaluation_temperature,
            }

            # Add provider-specific settings for evaluation
            if evaluation_provider == "openai_endpoint":
                evaluation_config["openai_endpoint_url"] = (
                    settings_manager.get_setting(
                        "benchmark.evaluation.endpoint_url",
                        "https://openrouter.ai/api/v1",
                    )
                )
                evaluation_config["openai_endpoint_api_key"] = (
                    settings_manager.get_setting("llm.openai_endpoint.api_key")
                )
            elif evaluation_provider == "openai":
                evaluation_config["openai_api_key"] = (
                    settings_manager.get_setting("llm.openai.api_key")
                )
            elif evaluation_provider == "anthropic":
                evaluation_config["anthropic_api_key"] = (
                    settings_manager.get_setting("llm.anthropic.api_key")
                )

        # Create and start benchmark
        benchmark_run_id = benchmark_service.create_benchmark_run(
            run_name=f"Quick Benchmark - {data.get('run_name', '')}",
            search_config=search_config,
            evaluation_config=evaluation_config,
            datasets_config=datasets_config,
            username=username,
            user_password=user_password,
        )

        success = benchmark_service.start_benchmark(
            benchmark_run_id, username, user_password
        )

        if success:
            return jsonify(
                {
                    "success": True,
                    "benchmark_run_id": benchmark_run_id,
                    "message": "Benchmark started with current settings",
                }
            )
        else:
            return jsonify(
                {"success": False, "error": "Failed to start benchmark"}
            ), 500

    except Exception:
        logger.exception("Error starting simple benchmark")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500

888 

889 

@benchmark_bp.route("/api/validate-config", methods=["POST"])
@login_required
def validate_config():
    """Validate a benchmark configuration without starting a run.

    Checks that required search fields are present and that the requested
    total example count is in (0, 1000].  Always returns 200 with a
    ``valid`` flag and the list of ``errors`` (500 only on internal error).
    """
    try:
        data = request.get_json()

        if not data:
            return jsonify({"valid": False, "errors": ["No data provided"]})

        errors = []

        # Required search-config fields, checked table-driven.
        search_config = data.get("search_config", {})
        required_fields = (
            ("search_tool", "Search tool is required"),
            ("search_strategy", "Search strategy is required"),
        )
        for field, message in required_fields:
            if not search_config.get(field):
                errors.append(message)

        # Dataset checks.
        datasets_config = data.get("datasets_config", {})
        if not datasets_config:
            errors.append("At least one dataset must be configured")

        total_examples = sum(
            config.get("count", 0) for config in datasets_config.values()
        )
        if total_examples == 0:
            errors.append("Total examples must be greater than 0")

        if total_examples > 1000:
            errors.append(
                "Total examples should not exceed 1000 for web interface"
            )

        return jsonify(
            {
                "valid": not errors,
                "errors": errors,
                "total_examples": total_examples,
            }
        )

    except Exception:
        logger.exception("Error validating config")
        return jsonify(
            {"valid": False, "errors": ["An internal error has occurred."]}
        ), 500

938 

939 

@benchmark_bp.route("/api/search-quality", methods=["GET"])
@limiter.exempt
@login_required
def get_search_quality():
    """Get current search quality metrics from rate limiting tracker.

    Rate-limit exempt so the UI can poll it; the timestamp lets clients
    judge staleness.
    """
    try:
        from ...web_search_engines.rate_limiting import get_tracker

        quality_stats = get_tracker().get_search_quality_stats()

        payload = {
            "success": True,
            "search_quality": quality_stats,
            "timestamp": time.time(),
        }
        return jsonify(payload)

    except Exception:
        logger.exception("Error getting search quality")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500

964 

965 

@benchmark_bp.route("/api/delete/<int:benchmark_run_id>", methods=["DELETE"])
@login_required
def delete_benchmark_run(benchmark_run_id: int):
    """Delete a benchmark run and all its results.

    Refuses to delete an in-progress run (400); returns 404 for unknown
    ids.  Child rows are removed explicitly before the run itself.
    """
    try:
        from ...database.models.benchmark import (
            BenchmarkProgress,
            BenchmarkResult,
            BenchmarkRun,
        )
        from ...database.session_context import get_user_db_session
        from flask import session as flask_session

        current_user = flask_session.get("username")
        with get_user_db_session(current_user) as db_session:
            target_run = (
                db_session.query(BenchmarkRun)
                .filter(BenchmarkRun.id == benchmark_run_id)
                .first()
            )

            # Guard: the run must exist.
            if not target_run:
                return jsonify(
                    {"success": False, "error": "Benchmark run not found"}
                ), 404

            # Guard: never delete a run that is still executing.
            if target_run.status.value == "in_progress":
                return jsonify(
                    {
                        "success": False,
                        "error": "Cannot delete a running benchmark. Cancel it first.",
                    }
                ), 400

            # Remove child records explicitly (cascade should also cover
            # this, but being explicit keeps intent clear).
            for child_model in (BenchmarkResult, BenchmarkProgress):
                db_session.query(child_model).filter(
                    child_model.benchmark_run_id == benchmark_run_id
                ).delete()

            db_session.delete(target_run)
            db_session.commit()

            logger.info(f"Deleted benchmark run {benchmark_run_id}")
            return jsonify(
                {
                    "success": True,
                    "message": f"Benchmark run {benchmark_run_id} deleted successfully",
                }
            )

    except Exception:
        logger.exception(f"Error deleting benchmark run {benchmark_run_id}")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500