Coverage for src / local_deep_research / benchmarks / web_api / benchmark_routes.py: 96%
352 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
1"""Flask routes for benchmark web interface."""
3import time
5from flask import Blueprint, jsonify, request
6from loguru import logger
8from ...database.session_context import get_user_db_session
9from ...security.decorators import require_json_body
10from ...web.auth.decorators import login_required
11from ...security.rate_limiter import limiter
12from local_deep_research.settings import SettingsManager
13from ...llm.providers.base import normalize_provider
14from ...web.utils.templates import render_template_with_defaults
15from .benchmark_service import benchmark_service
# Blueprint grouping all benchmark web routes under the /benchmark URL prefix.
benchmark_bp = Blueprint("benchmark", __name__, url_prefix="/benchmark")

# NOTE: Routes use flask_session["username"] (not .get()) intentionally.
# @login_required guarantees the key exists; direct access fails fast
# if the decorator is ever removed.
@benchmark_bp.route("/")
@login_required
def index():
    """Render the benchmark dashboard page with the user's eval settings."""
    from flask import session as flask_session

    username = flask_session["username"]

    # benchmark.evaluation.* keys paired with their defaults; the template
    # receives them prefixed as evaluation_<key>.
    defaults = {
        "provider": "openai_endpoint",
        "model": "",
        "endpoint_url": "",
        "temperature": 0,
    }

    with get_user_db_session(username) as db_session:
        manager = SettingsManager(db_session)
        eval_settings = {
            f"evaluation_{key}": manager.get_setting(
                f"benchmark.evaluation.{key}", fallback
            )
            for key, fallback in defaults.items()
        }

    return render_template_with_defaults(
        "pages/benchmark.html", eval_settings=eval_settings
    )
@benchmark_bp.route("/results")
@login_required
def results():
    """Render the benchmark results history page."""
    template = "pages/benchmark_results.html"
    return render_template_with_defaults(template)
@benchmark_bp.route("/api/start", methods=["POST"])
@login_required
@require_json_body(error_message="No data provided")
def start_benchmark():
    """Start a new benchmark run.

    Search settings are always built from the logged-in user's database
    settings; the request body supplies ``run_name``, ``datasets_config``
    and optionally an ``evaluation_config`` override.

    Returns:
        200 with ``benchmark_run_id`` on success, 400 when no dataset has
        a positive count, 500 on any internal failure.
    """
    try:
        data = request.get_json()

        # Extract configuration
        run_name = data.get("run_name")

        # get_user_db_session and SettingsManager are already imported at
        # module level; the previous function-local re-imports were redundant.
        from flask import session as flask_session

        username = flask_session["username"]
        session_id = flask_session.get("session_id")

        # The benchmark executes in a background thread that cannot read the
        # Flask session, so fetch the user's password from the session store
        # to let the thread open the encrypted per-user database.
        from ...database.session_passwords import session_password_store

        user_password = None
        if session_id:
            user_password = session_password_store.get_session_password(
                username, session_id
            )

        search_config = {}
        evaluation_config = {}
        datasets_config = data.get("datasets_config", {})

        with get_user_db_session(username) as db_session:
            # Use the logged-in user's settings
            settings_manager = SettingsManager(db_session)

            # Build search config from database settings
            search_config = {
                "iterations": int(
                    settings_manager.get_setting("search.iterations", 8)
                ),
                "questions_per_iteration": int(
                    settings_manager.get_setting(
                        "search.questions_per_iteration", 5
                    )
                ),
                "search_tool": settings_manager.get_setting(
                    "search.tool", "searxng"
                ),
                "search_strategy": settings_manager.get_setting(
                    "search.search_strategy", "focused_iteration"
                ),
                "model_name": settings_manager.get_setting("llm.model"),
                "provider": settings_manager.get_setting("llm.provider"),
                "temperature": float(
                    settings_manager.get_setting("llm.temperature", 0.7)
                ),
                "max_tokens": settings_manager.get_setting(
                    "llm.max_tokens", 30000
                ),
                "context_window_unrestricted": settings_manager.get_setting(
                    "llm.context_window_unrestricted", True
                ),
                "context_window_size": settings_manager.get_setting(
                    "llm.context_window_size", 128000
                ),
                "local_context_window_size": settings_manager.get_setting(
                    "llm.local_context_window_size", 8192
                ),
            }

            # Add provider-specific credentials for the search LLM
            provider = normalize_provider(search_config.get("provider"))
            if provider == "openai_endpoint":
                search_config["openai_endpoint_url"] = (
                    settings_manager.get_setting("llm.openai_endpoint.url")
                )
                search_config["openai_endpoint_api_key"] = (
                    settings_manager.get_setting("llm.openai_endpoint.api_key")
                )
            elif provider == "openai":
                search_config["openai_api_key"] = settings_manager.get_setting(
                    "llm.openai.api_key"
                )
            elif provider == "anthropic":
                search_config["anthropic_api_key"] = (
                    settings_manager.get_setting("llm.anthropic.api_key")
                )

            # Evaluation config: an explicit request override wins; otherwise
            # read the grader configuration from database settings.
            if "evaluation_config" in data:
                evaluation_config = data["evaluation_config"]
            else:
                evaluation_provider = normalize_provider(
                    settings_manager.get_setting(
                        "benchmark.evaluation.provider", "openai_endpoint"
                    )
                )
                evaluation_model = settings_manager.get_setting(
                    "benchmark.evaluation.model", "anthropic/claude-3.7-sonnet"
                )
                evaluation_temperature = float(
                    settings_manager.get_setting(
                        "benchmark.evaluation.temperature", 0
                    )
                )

                evaluation_config = {
                    "provider": evaluation_provider,
                    "model_name": evaluation_model,
                    "temperature": evaluation_temperature,
                }

                # Add provider-specific credentials for the evaluation LLM
                if evaluation_provider == "openai_endpoint":
                    evaluation_config["openai_endpoint_url"] = (
                        settings_manager.get_setting(
                            "benchmark.evaluation.endpoint_url",
                            "https://openrouter.ai/api/v1",
                        )
                    )
                    evaluation_config["openai_endpoint_api_key"] = (
                        settings_manager.get_setting(
                            "llm.openai_endpoint.api_key"
                        )
                    )
                elif evaluation_provider == "openai":
                    evaluation_config["openai_api_key"] = (
                        settings_manager.get_setting("llm.openai.api_key")
                    )
                elif evaluation_provider == "anthropic":
                    evaluation_config["anthropic_api_key"] = (
                        settings_manager.get_setting("llm.anthropic.api_key")
                    )

        # Validate datasets config: at least one dataset must request examples
        if not datasets_config or not any(
            config.get("count", 0) > 0 for config in datasets_config.values()
        ):
            return jsonify(
                {
                    "error": "At least one dataset with count > 0 must be specified"
                }
            ), 400

        # Create benchmark run
        benchmark_run_id = benchmark_service.create_benchmark_run(
            run_name=run_name,
            search_config=search_config,
            evaluation_config=evaluation_config,
            datasets_config=datasets_config,
            username=username,
            user_password=user_password,
        )

        # Start benchmark
        success = benchmark_service.start_benchmark(
            benchmark_run_id, username, user_password
        )

        if success:
            return jsonify(
                {
                    "success": True,
                    "benchmark_run_id": benchmark_run_id,
                    "message": "Benchmark started successfully",
                }
            )
        return jsonify(
            {"success": False, "error": "Failed to start benchmark"}
        ), 500

    except Exception:
        logger.exception("Error starting benchmark")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500
@benchmark_bp.route("/api/running", methods=["GET"])
@login_required
def get_running_benchmark():
    """Report the most recent in-progress benchmark run, if any."""
    try:
        from ...database.models.benchmark import BenchmarkRun, BenchmarkStatus
        from ...database.session_context import get_user_db_session
        from flask import session as flask_session

        username = flask_session["username"]
        with get_user_db_session(username) as session:
            # Newest in-progress run wins if several exist.
            run = (
                session.query(BenchmarkRun)
                .filter(BenchmarkRun.status == BenchmarkStatus.IN_PROGRESS)
                .order_by(BenchmarkRun.created_at.desc())
                .first()
            )

            if run is None:
                return jsonify(
                    {"success": False, "message": "No running benchmark found"}
                )

            payload = {
                "success": True,
                "benchmark_run_id": run.id,
                "run_name": run.run_name,
                "total_examples": run.total_examples,
                "completed_examples": run.completed_examples,
            }
            return jsonify(payload)

    except Exception:
        logger.exception("Error checking for running benchmark")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500
@benchmark_bp.route("/api/status/<int:benchmark_run_id>", methods=["GET"])
@limiter.exempt
@login_required
def get_benchmark_status(benchmark_run_id: int):
    """Return the live status payload for one benchmark run."""
    try:
        from flask import session as flask_session

        username = flask_session["username"]
        status = benchmark_service.get_benchmark_status(
            benchmark_run_id, username
        )

        if not status:
            return jsonify(
                {"success": False, "error": "Benchmark run not found"}
            ), 404

        logger.info(
            f"Returning status for benchmark {benchmark_run_id}: "
            f"completed={status.get('completed_examples')}, "
            f"overall_acc={status.get('overall_accuracy')}, "
            f"avg_time={status.get('avg_time_per_example')}, "
            f"estimated_remaining={status.get('estimated_time_remaining')}"
        )
        return jsonify({"success": True, "status": status})

    except Exception:
        logger.exception("Error getting benchmark status")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500
@benchmark_bp.route("/api/cancel/<int:benchmark_run_id>", methods=["POST"])
@login_required
def cancel_benchmark(benchmark_run_id: int):
    """Request cancellation of a running benchmark."""
    try:
        from flask import session as flask_session

        username = flask_session["username"]
        cancelled = benchmark_service.cancel_benchmark(
            benchmark_run_id, username
        )

        if not cancelled:
            return jsonify(
                {"success": False, "error": "Failed to cancel benchmark"}
            ), 500
        return jsonify(
            {"success": True, "message": "Benchmark cancelled successfully"}
        )

    except Exception:
        logger.exception("Error cancelling benchmark")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500
@benchmark_bp.route("/api/history", methods=["GET"])
@login_required
def get_benchmark_history():
    """Get list of recent benchmark runs (up to 50, newest first).

    For each run, derives average processing time from its results and
    average search-result / search-request counts from the SearchCall
    metrics; failures in those derivations are logged and produce None
    for the affected fields rather than failing the whole response.
    """
    try:
        # Imports hoisted out of the per-run loop: the original re-ran
        # these import statements on every iteration.
        from sqlalchemy import func

        from ...database.models import SearchCall
        from ...database.models.benchmark import BenchmarkResult, BenchmarkRun
        from ...database.session_context import get_user_db_session
        from flask import session as flask_session

        username = flask_session["username"]
        with get_user_db_session(username) as session:
            # Get all benchmark runs (completed, failed, cancelled, or in-progress)
            runs = (
                session.query(BenchmarkRun)
                .order_by(BenchmarkRun.created_at.desc())
                .limit(50)
                .all()
            )

            # Format runs for display
            formatted_runs = []
            for run in runs:
                # Average processing time across this run's results.
                avg_processing_time = None
                avg_search_results = None
                try:
                    avg_result = (
                        session.query(
                            func.avg(BenchmarkResult.processing_time)
                        )
                        .filter(
                            BenchmarkResult.benchmark_run_id == run.id,
                            BenchmarkResult.processing_time.isnot(None),
                            BenchmarkResult.processing_time > 0,
                        )
                        .scalar()
                    )

                    if avg_result:
                        avg_processing_time = float(avg_result)
                except Exception:
                    logger.warning(
                        f"Error calculating avg processing time for run {run.id}"
                    )

                # Average search results / requests per research session.
                total_search_requests = None
                try:
                    # Get all results for this run to find research_ids
                    results = (
                        session.query(BenchmarkResult)
                        .filter(BenchmarkResult.benchmark_run_id == run.id)
                        .all()
                    )

                    research_ids = [
                        r.research_id for r in results if r.research_id
                    ]

                    if research_ids:
                        # SearchCall is in the same per-user DB, query directly
                        search_calls = (
                            session.query(SearchCall)
                            .filter(SearchCall.research_id.in_(research_ids))
                            .all()
                        )

                        # Group by research_id and accumulate per-session totals
                        research_results = {}
                        research_requests = {}

                        for call in search_calls:
                            if call.research_id:
                                if call.research_id not in research_results:
                                    research_results[call.research_id] = 0
                                    research_requests[call.research_id] = 0
                                research_results[call.research_id] += (
                                    call.results_count or 0
                                )
                                research_requests[call.research_id] += 1

                        # Averages across research sessions
                        if research_results:
                            total_results = sum(research_results.values())
                            avg_search_results = total_results / len(
                                research_results
                            )

                            total_requests = sum(research_requests.values())
                            total_search_requests = total_requests / len(
                                research_requests
                            )

                except Exception:
                    logger.warning(
                        f"Error calculating search metrics for run {run.id}"
                    )

                formatted_runs.append(
                    {
                        "id": run.id,
                        "run_name": run.run_name or f"Benchmark #{run.id}",
                        "created_at": run.created_at.isoformat(),
                        "total_examples": run.total_examples,
                        "completed_examples": run.completed_examples,
                        "overall_accuracy": run.overall_accuracy,
                        "status": run.status.value,
                        "search_config": run.search_config,
                        "evaluation_config": run.evaluation_config,
                        "datasets_config": run.datasets_config,
                        "avg_processing_time": avg_processing_time,
                        "avg_search_results": avg_search_results,
                        "total_search_requests": total_search_requests,
                    }
                )

        return jsonify({"success": True, "runs": formatted_runs})

    except Exception:
        logger.exception("Error getting benchmark history")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500
@benchmark_bp.route("/api/results/<int:benchmark_run_id>", methods=["GET"])
@limiter.exempt
@login_required
def get_benchmark_results(benchmark_run_id: int):
    """Get detailed results for a benchmark run."""
    try:
        from ...database.models.benchmark import BenchmarkResult
        from ...database.session_context import get_user_db_session
        from flask import session as flask_session

        logger.info(f"Getting results for benchmark {benchmark_run_id}")
        username = flask_session["username"]

        # Push any buffered results from active runs into the DB first.
        benchmark_service.sync_pending_results(benchmark_run_id, username)

        with get_user_db_session(username) as session:
            # Most recent results only; caller may override via ?limit=
            limit = int(request.args.get("limit", 10))

            results = (
                session.query(BenchmarkResult)
                .filter(BenchmarkResult.benchmark_run_id == benchmark_run_id)
                .order_by(BenchmarkResult.id.desc())  # newest first
                .limit(limit)
                .all()
            )

            logger.info(f"Found {len(results)} results")

            # research_id -> total search results across its SearchCalls
            search_results_by_research_id = {}
            try:
                from ...database.models import SearchCall

                research_ids = [
                    r.research_id for r in results if r.research_id
                ]

                if research_ids:
                    # SearchCall lives in the same per-user DB
                    all_search_calls = (
                        session.query(SearchCall)
                        .filter(SearchCall.research_id.in_(research_ids))
                        .all()
                    )

                    for call in all_search_calls:
                        rid = call.research_id
                        if not rid:
                            continue
                        search_results_by_research_id[rid] = (
                            search_results_by_research_id.get(rid, 0)
                            + (call.results_count or 0)
                        )

                    logger.info(
                        f"Found search metrics for {len(search_results_by_research_id)} research IDs from {len(all_search_calls)} total search calls"
                    )
                    logger.debug(
                        f"Research IDs from results: {research_ids[:5] if len(research_ids) > 5 else research_ids}"
                    )
                    logger.debug(
                        f"Search results by research_id: {dict(list(search_results_by_research_id.items())[:5])}"
                    )
            except Exception:
                logger.exception(
                    f"Error getting search metrics for benchmark {benchmark_run_id}"
                )

            # Shape each row for the UI.
            formatted_results = []
            for result in results:
                search_result_count = 0
                try:
                    rid = result.research_id
                    if rid and rid in search_results_by_research_id:
                        search_result_count = search_results_by_research_id[
                            rid
                        ]
                        logger.debug(
                            f"Found {search_result_count} search results for research_id {result.research_id}"
                        )
                except Exception:
                    logger.exception(
                        f"Error getting search results for result {result.example_id}"
                    )

                formatted_results.append(
                    {
                        "example_id": result.example_id,
                        "dataset_type": result.dataset_type.value,
                        "question": result.question,
                        "correct_answer": result.correct_answer,
                        "model_answer": result.extracted_answer,
                        "full_response": result.response,
                        "is_correct": result.is_correct,
                        "confidence": result.confidence,
                        "grader_response": result.grader_response,
                        "processing_time": result.processing_time,
                        "search_result_count": search_result_count,
                        "sources": result.sources,
                        "completed_at": result.completed_at.isoformat()
                        if result.completed_at
                        else None,
                    }
                )

        return jsonify({"success": True, "results": formatted_results})

    except Exception:
        logger.exception("Error getting benchmark results")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500
@benchmark_bp.route(
    "/api/results/<int:benchmark_run_id>/export", methods=["GET"]
)
@login_required
def export_benchmark_results(benchmark_run_id: int):
    """Get lightweight results for YAML export (no full_response/sources/grader_response)."""
    try:
        from sqlalchemy.orm import load_only

        from ...database.models.benchmark import BenchmarkResult
        from ...database.session_context import get_user_db_session
        from flask import session as flask_session

        username = flask_session["username"]
        logger.info(
            "Exporting benchmark results for run {} by user {}",
            benchmark_run_id,
            username,
        )

        # Only load the columns the export needs; keeps the query light.
        export_columns = (
            BenchmarkResult.example_id,
            BenchmarkResult.dataset_type,
            BenchmarkResult.question,
            BenchmarkResult.correct_answer,
            BenchmarkResult.extracted_answer,
            BenchmarkResult.is_correct,
            BenchmarkResult.confidence,
            BenchmarkResult.processing_time,
            BenchmarkResult.completed_at,
        )

        with get_user_db_session(username) as session:
            rows = (
                session.query(BenchmarkResult)
                .options(load_only(*export_columns))
                .filter(BenchmarkResult.benchmark_run_id == benchmark_run_id)
                .order_by(BenchmarkResult.id.asc())
                .all()
            )

            formatted = [
                {
                    "example_id": row.example_id,
                    "dataset_type": row.dataset_type.value,
                    "question": row.question,
                    "correct_answer": row.correct_answer,
                    "model_answer": row.extracted_answer,
                    "is_correct": row.is_correct,
                    "confidence": row.confidence,
                    "processing_time": row.processing_time,
                    "completed_at": row.completed_at.isoformat()
                    if row.completed_at
                    else None,
                }
                for row in rows
            ]

        logger.info(
            "Exported {} results for benchmark run {}",
            len(formatted),
            benchmark_run_id,
        )
        return jsonify({"success": True, "results": formatted})

    except Exception:
        logger.exception("Error exporting benchmark results")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500
@benchmark_bp.route("/api/configs", methods=["GET"])
@login_required
def get_saved_configs():
    """Get list of saved benchmark configurations."""
    try:
        # TODO: Implement saved configs retrieval from database
        # For now return hard-coded defaults.
        quick_test = {
            "id": 1,
            "name": "Quick Test",
            "description": "Fast benchmark with minimal examples",
            "search_config": {
                "iterations": 3,
                "questions_per_iteration": 3,
                "search_tool": "searxng",
                "search_strategy": "focused_iteration",
            },
            "datasets_config": {
                "simpleqa": {"count": 10},
                "browsecomp": {"count": 5},
            },
        }
        standard_eval = {
            "id": 2,
            "name": "Standard Evaluation",
            "description": "Comprehensive benchmark with standard settings",
            "search_config": {
                "iterations": 8,
                "questions_per_iteration": 5,
                "search_tool": "searxng",
                "search_strategy": "focused_iteration",
            },
            "datasets_config": {
                "simpleqa": {"count": 50},
                "browsecomp": {"count": 25},
            },
        }

        return jsonify(
            {"success": True, "configs": [quick_test, standard_eval]}
        )

    except Exception:
        logger.exception("Error getting saved configs")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500
@benchmark_bp.route("/api/start-simple", methods=["POST"])
@login_required
@require_json_body()
def start_benchmark_simple():
    """Start a benchmark using the logged-in user's current database settings."""
    try:
        data = request.get_json()
        datasets_config = data.get("datasets_config", {})

        # Reject requests where no dataset asks for any examples.
        has_examples = any(
            cfg.get("count", 0) > 0 for cfg in datasets_config.values()
        )
        if not datasets_config or not has_examples:
            return jsonify(
                {
                    "error": "At least one dataset with count > 0 must be specified"
                }
            ), 400

        from flask import session as flask_session

        username = flask_session["username"]
        session_id = flask_session.get("session_id")

        # Fetch the user's password from the session store so the
        # background benchmark thread can open the per-user database.
        from ...database.session_passwords import session_password_store

        user_password = None
        if session_id:
            user_password = session_password_store.get_session_password(
                username, session_id
            )

        with get_user_db_session(username, user_password) as db_session:
            settings = SettingsManager(db_session)

            # Search configuration mirrors the user's current settings.
            search_config = {
                "iterations": int(
                    settings.get_setting("search.iterations", 8)
                ),
                "questions_per_iteration": int(
                    settings.get_setting("search.questions_per_iteration", 5)
                ),
                "search_tool": settings.get_setting("search.tool", "searxng"),
                "search_strategy": settings.get_setting(
                    "search.search_strategy", "focused_iteration"
                ),
                "model_name": settings.get_setting("llm.model"),
                "provider": settings.get_setting("llm.provider"),
                "temperature": float(
                    settings.get_setting("llm.temperature", 0.7)
                ),
                "max_tokens": settings.get_setting("llm.max_tokens", 30000),
                "context_window_unrestricted": settings.get_setting(
                    "llm.context_window_unrestricted", True
                ),
                "context_window_size": settings.get_setting(
                    "llm.context_window_size", 128000
                ),
                "local_context_window_size": settings.get_setting(
                    "llm.local_context_window_size", 8192
                ),
            }

            # Attach credentials for whichever provider is configured.
            provider = normalize_provider(search_config.get("provider"))
            if provider == "openai_endpoint":
                search_config["openai_endpoint_url"] = settings.get_setting(
                    "llm.openai_endpoint.url"
                )
                search_config["openai_endpoint_api_key"] = (
                    settings.get_setting("llm.openai_endpoint.api_key")
                )
            elif provider == "openai":
                search_config["openai_api_key"] = settings.get_setting(
                    "llm.openai.api_key"
                )
            elif provider == "anthropic":
                search_config["anthropic_api_key"] = settings.get_setting(
                    "llm.anthropic.api_key"
                )

            # Evaluation (grader) configuration from database settings.
            evaluation_provider = normalize_provider(
                settings.get_setting(
                    "benchmark.evaluation.provider", "openai_endpoint"
                )
            )
            evaluation_config = {
                "provider": evaluation_provider,
                "model_name": settings.get_setting(
                    "benchmark.evaluation.model",
                    "anthropic/claude-3.7-sonnet",
                ),
                "temperature": float(
                    settings.get_setting(
                        "benchmark.evaluation.temperature", 0
                    )
                ),
            }

            # Attach credentials for the evaluation provider.
            if evaluation_provider == "openai_endpoint":
                evaluation_config["openai_endpoint_url"] = (
                    settings.get_setting(
                        "benchmark.evaluation.endpoint_url",
                        "https://openrouter.ai/api/v1",
                    )
                )
                evaluation_config["openai_endpoint_api_key"] = (
                    settings.get_setting("llm.openai_endpoint.api_key")
                )
            elif evaluation_provider == "openai":
                evaluation_config["openai_api_key"] = settings.get_setting(
                    "llm.openai.api_key"
                )
            elif evaluation_provider == "anthropic":
                evaluation_config["anthropic_api_key"] = (
                    settings.get_setting("llm.anthropic.api_key")
                )

            # Create and start the run.
            benchmark_run_id = benchmark_service.create_benchmark_run(
                run_name=f"Quick Benchmark - {data.get('run_name', '')}",
                search_config=search_config,
                evaluation_config=evaluation_config,
                datasets_config=datasets_config,
                username=username,
                user_password=user_password,
            )

            started = benchmark_service.start_benchmark(
                benchmark_run_id, username, user_password
            )

        if started:
            return jsonify(
                {
                    "success": True,
                    "benchmark_run_id": benchmark_run_id,
                    "message": "Benchmark started with current settings",
                }
            )
        return jsonify(
            {"success": False, "error": "Failed to start benchmark"}
        ), 500

    except Exception:
        logger.exception("Error starting simple benchmark")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500
@benchmark_bp.route("/api/validate-config", methods=["POST"])
@login_required
def validate_config():
    """Validate a benchmark configuration.

    Note: not using @require_json_body because this endpoint returns
    {"valid": False, "errors": [...]} which doesn't match the decorator's
    three standard error formats.
    """
    try:
        # silent=True: a missing or malformed JSON body yields None instead
        # of raising, so it is reported as a validation error ("No data
        # provided") rather than an opaque 500.
        data = request.get_json(silent=True)

        if not isinstance(data, dict):
            return jsonify({"valid": False, "errors": ["No data provided"]})

        errors = []

        # Validate search config
        search_config = data.get("search_config", {})
        if not search_config.get("search_tool"):
            errors.append("Search tool is required")
        if not search_config.get("search_strategy"):
            errors.append("Search strategy is required")

        # Validate datasets config
        datasets_config = data.get("datasets_config", {})
        if not datasets_config:
            errors.append("At least one dataset must be configured")

        total_examples = sum(
            config.get("count", 0) for config in datasets_config.values()
        )
        if total_examples == 0:
            errors.append("Total examples must be greater than 0")

        if total_examples > 1000:
            errors.append(
                "Total examples should not exceed 1000 for web interface"
            )

        return jsonify(
            {
                "valid": len(errors) == 0,
                "errors": errors,
                "total_examples": total_examples,
            }
        )

    except Exception:
        logger.exception("Error validating config")
        return jsonify(
            {"valid": False, "errors": ["An internal error has occurred."]}
        ), 500
@benchmark_bp.route("/api/search-quality", methods=["GET"])
@limiter.exempt
@login_required
def get_search_quality():
    """Get current search quality metrics from rate limiting tracker."""
    try:
        from ...web_search_engines.rate_limiting import get_tracker

        stats = get_tracker().get_search_quality_stats()
        payload = {
            "success": True,
            "search_quality": stats,
            "timestamp": time.time(),
        }
        return jsonify(payload)

    except Exception:
        logger.exception("Error getting search quality")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500
@benchmark_bp.route("/api/delete/<int:benchmark_run_id>", methods=["DELETE"])
@login_required
def delete_benchmark_run(benchmark_run_id: int):
    """Delete a benchmark run and all its results.

    Refuses to delete a run that is still in progress (it must be
    cancelled first). Returns 404 when the run does not exist.
    """
    try:
        from ...database.models.benchmark import (
            BenchmarkProgress,
            BenchmarkResult,
            BenchmarkRun,
            BenchmarkStatus,
        )
        from ...database.session_context import get_user_db_session
        from flask import session as flask_session

        username = flask_session["username"]
        with get_user_db_session(username) as session:
            # Check if benchmark run exists
            benchmark_run = (
                session.query(BenchmarkRun)
                .filter(BenchmarkRun.id == benchmark_run_id)
                .first()
            )

            if not benchmark_run:
                return jsonify(
                    {"success": False, "error": "Benchmark run not found"}
                ), 404

            # Prevent deletion of running benchmarks; compare against the
            # enum (consistent with the /api/running query) rather than the
            # raw string value.
            if benchmark_run.status == BenchmarkStatus.IN_PROGRESS:
                return jsonify(
                    {
                        "success": False,
                        "error": "Cannot delete a running benchmark. Cancel it first.",
                    }
                ), 400

            # Delete related records (cascade should handle this, but being explicit)
            session.query(BenchmarkResult).filter(
                BenchmarkResult.benchmark_run_id == benchmark_run_id
            ).delete()

            session.query(BenchmarkProgress).filter(
                BenchmarkProgress.benchmark_run_id == benchmark_run_id
            ).delete()

            # Delete the benchmark run
            session.delete(benchmark_run)
            session.commit()

            logger.info(f"Deleted benchmark run {benchmark_run_id}")
            return jsonify(
                {
                    "success": True,
                    "message": f"Benchmark run {benchmark_run_id} deleted successfully",
                }
            )

    except Exception:
        logger.exception(f"Error deleting benchmark run {benchmark_run_id}")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500