Coverage for src / local_deep_research / benchmarks / web_api / benchmark_routes.py: 96%
352 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
1"""Flask routes for benchmark web interface."""
3import time
5from flask import Blueprint, jsonify, request
6from loguru import logger
8from ...database.session_context import get_user_db_session
9from ...security.decorators import require_json_body
10from ...web.auth.decorators import login_required
11from ...security.rate_limiter import limiter
12from local_deep_research.settings import SettingsManager
13from ...llm.providers.base import normalize_provider
14from ...web.utils.templates import render_template_with_defaults
15from .benchmark_service import benchmark_service
# Blueprint grouping all benchmark web routes under the /benchmark URL prefix.
benchmark_bp = Blueprint("benchmark", __name__, url_prefix="/benchmark")

# NOTE: Routes use flask_session["username"] (not .get()) intentionally.
# @login_required guarantees the key exists; direct access fails fast
# if the decorator is ever removed.
@benchmark_bp.route("/")
@login_required
def index():
    """Render the benchmark dashboard page with the user's eval settings."""
    from flask import session as flask_session

    username = flask_session["username"]

    # benchmark.evaluation.* keys paired with their defaults; the template
    # receives them prefixed as evaluation_<key>.
    defaults = {
        "provider": "openai_endpoint",
        "model": "",
        "endpoint_url": "",
        "temperature": 0,
    }

    with get_user_db_session(username) as db_session:
        manager = SettingsManager(db_session)
        eval_settings = {
            f"evaluation_{key}": manager.get_setting(
                f"benchmark.evaluation.{key}", fallback
            )
            for key, fallback in defaults.items()
        }

    return render_template_with_defaults(
        "pages/benchmark.html", eval_settings=eval_settings
    )
@benchmark_bp.route("/results")
@login_required
def results():
    """Render the benchmark results history page."""
    template = "pages/benchmark_results.html"
    return render_template_with_defaults(template)
@benchmark_bp.route("/api/start", methods=["POST"])
@login_required
@require_json_body(error_message="No data provided")
def start_benchmark():
    """Start a new benchmark run.

    Search settings are always built from the logged-in user's database
    settings; the request body supplies ``run_name``, ``datasets_config``
    and optionally an ``evaluation_config`` override.

    Returns:
        200 with ``benchmark_run_id`` on success, 400 when no dataset has
        a positive count, 500 on any internal failure.
    """
    try:
        data = request.get_json()

        # Extract configuration
        run_name = data.get("run_name")

        # get_user_db_session and SettingsManager are already imported at
        # module level; the previous function-local re-imports were redundant.
        from flask import session as flask_session

        username = flask_session["username"]
        session_id = flask_session.get("session_id")

        # The benchmark executes in a background thread that cannot read the
        # Flask session, so fetch the user's password from the session store
        # to let the thread open the encrypted per-user database.
        from ...database.session_passwords import session_password_store

        user_password = None
        if session_id:
            user_password = session_password_store.get_session_password(
                username, session_id
            )

        search_config = {}
        evaluation_config = {}
        datasets_config = data.get("datasets_config", {})

        with get_user_db_session(username) as db_session:
            # Use the logged-in user's settings
            settings_manager = SettingsManager(db_session)

            # Build search config from database settings
            search_config = {
                "iterations": int(
                    settings_manager.get_setting("search.iterations", 8)
                ),
                "questions_per_iteration": int(
                    settings_manager.get_setting(
                        "search.questions_per_iteration", 5
                    )
                ),
                "search_tool": settings_manager.get_setting(
                    "search.tool", "searxng"
                ),
                "search_strategy": settings_manager.get_setting(
                    "search.search_strategy", "focused_iteration"
                ),
                "model_name": settings_manager.get_setting("llm.model"),
                "provider": settings_manager.get_setting("llm.provider"),
                "temperature": float(
                    settings_manager.get_setting("llm.temperature", 0.7)
                ),
                "max_tokens": settings_manager.get_setting(
                    "llm.max_tokens", 30000
                ),
                "context_window_unrestricted": settings_manager.get_setting(
                    "llm.context_window_unrestricted", True
                ),
                "context_window_size": settings_manager.get_setting(
                    "llm.context_window_size", 128000
                ),
                "local_context_window_size": settings_manager.get_setting(
                    "llm.local_context_window_size", 8192
                ),
            }

            # Add provider-specific credentials for the search LLM
            provider = normalize_provider(search_config.get("provider"))
            if provider == "openai_endpoint":
                search_config["openai_endpoint_url"] = (
                    settings_manager.get_setting("llm.openai_endpoint.url")
                )
                search_config["openai_endpoint_api_key"] = (
                    settings_manager.get_setting("llm.openai_endpoint.api_key")
                )
            elif provider == "openai":
                search_config["openai_api_key"] = settings_manager.get_setting(
                    "llm.openai.api_key"
                )
            elif provider == "anthropic":
                search_config["anthropic_api_key"] = (
                    settings_manager.get_setting("llm.anthropic.api_key")
                )

            # Evaluation config: an explicit request override wins; otherwise
            # read the grader configuration from database settings.
            if "evaluation_config" in data:
                evaluation_config = data["evaluation_config"]
            else:
                evaluation_provider = normalize_provider(
                    settings_manager.get_setting(
                        "benchmark.evaluation.provider", "openai_endpoint"
                    )
                )
                evaluation_model = settings_manager.get_setting(
                    "benchmark.evaluation.model", "anthropic/claude-3.7-sonnet"
                )
                evaluation_temperature = float(
                    settings_manager.get_setting(
                        "benchmark.evaluation.temperature", 0
                    )
                )

                evaluation_config = {
                    "provider": evaluation_provider,
                    "model_name": evaluation_model,
                    "temperature": evaluation_temperature,
                }

                # Add provider-specific credentials for the evaluation LLM
                if evaluation_provider == "openai_endpoint":
                    evaluation_config["openai_endpoint_url"] = (
                        settings_manager.get_setting(
                            "benchmark.evaluation.endpoint_url",
                            "https://openrouter.ai/api/v1",
                        )
                    )
                    evaluation_config["openai_endpoint_api_key"] = (
                        settings_manager.get_setting(
                            "llm.openai_endpoint.api_key"
                        )
                    )
                elif evaluation_provider == "openai":
                    evaluation_config["openai_api_key"] = (
                        settings_manager.get_setting("llm.openai.api_key")
                    )
                elif evaluation_provider == "anthropic":
                    evaluation_config["anthropic_api_key"] = (
                        settings_manager.get_setting("llm.anthropic.api_key")
                    )

        # Validate datasets config: at least one dataset must request examples
        if not datasets_config or not any(
            config.get("count", 0) > 0 for config in datasets_config.values()
        ):
            return jsonify(
                {
                    "error": "At least one dataset with count > 0 must be specified"
                }
            ), 400

        # Create benchmark run
        benchmark_run_id = benchmark_service.create_benchmark_run(
            run_name=run_name,
            search_config=search_config,
            evaluation_config=evaluation_config,
            datasets_config=datasets_config,
            username=username,
            user_password=user_password,
        )

        # Start benchmark
        success = benchmark_service.start_benchmark(
            benchmark_run_id, username, user_password
        )

        if success:
            return jsonify(
                {
                    "success": True,
                    "benchmark_run_id": benchmark_run_id,
                    "message": "Benchmark started successfully",
                }
            )
        return jsonify(
            {"success": False, "error": "Failed to start benchmark"}
        ), 500

    except Exception:
        logger.exception("Error starting benchmark")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500
@benchmark_bp.route("/api/running", methods=["GET"])
@login_required
def get_running_benchmark():
    """Report the most recent in-progress benchmark run, if any."""
    try:
        from ...database.models.benchmark import BenchmarkRun, BenchmarkStatus
        from ...database.session_context import get_user_db_session
        from flask import session as flask_session

        username = flask_session["username"]
        with get_user_db_session(username) as session:
            # Newest in-progress run wins if several exist.
            run = (
                session.query(BenchmarkRun)
                .filter(BenchmarkRun.status == BenchmarkStatus.IN_PROGRESS)
                .order_by(BenchmarkRun.created_at.desc())
                .first()
            )

            if run is None:
                return jsonify(
                    {"success": False, "message": "No running benchmark found"}
                )

            payload = {
                "success": True,
                "benchmark_run_id": run.id,
                "run_name": run.run_name,
                "total_examples": run.total_examples,
                "completed_examples": run.completed_examples,
            }
            return jsonify(payload)

    except Exception:
        logger.exception("Error checking for running benchmark")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500
@benchmark_bp.route("/api/status/<int:benchmark_run_id>", methods=["GET"])
@limiter.exempt
@login_required
def get_benchmark_status(benchmark_run_id: int):
    """Return the live status payload for one benchmark run."""
    try:
        from flask import session as flask_session

        username = flask_session["username"]
        status = benchmark_service.get_benchmark_status(
            benchmark_run_id, username
        )

        if not status:
            return jsonify(
                {"success": False, "error": "Benchmark run not found"}
            ), 404

        logger.info(
            f"Returning status for benchmark {benchmark_run_id}: "
            f"completed={status.get('completed_examples')}, "
            f"overall_acc={status.get('overall_accuracy')}, "
            f"avg_time={status.get('avg_time_per_example')}, "
            f"estimated_remaining={status.get('estimated_time_remaining')}"
        )
        return jsonify({"success": True, "status": status})

    except Exception:
        logger.exception("Error getting benchmark status")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500
@benchmark_bp.route("/api/cancel/<int:benchmark_run_id>", methods=["POST"])
@login_required
def cancel_benchmark(benchmark_run_id: int):
    """Request cancellation of a running benchmark."""
    try:
        from flask import session as flask_session

        username = flask_session["username"]
        cancelled = benchmark_service.cancel_benchmark(
            benchmark_run_id, username
        )

        if not cancelled:
            return jsonify(
                {"success": False, "error": "Failed to cancel benchmark"}
            ), 500
        return jsonify(
            {"success": True, "message": "Benchmark cancelled successfully"}
        )

    except Exception:
        logger.exception("Error cancelling benchmark")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500
@benchmark_bp.route("/api/history", methods=["GET"])
@login_required
def get_benchmark_history():
    """Get list of recent benchmark runs (up to 50, newest first).

    For each run, derives average processing time from its results and
    average search-result / search-request counts from the SearchCall
    metrics; failures in those derivations are logged and produce None
    for the affected fields rather than failing the whole response.
    """
    try:
        # Imports hoisted out of the per-run loop: the original re-ran
        # these import statements on every iteration.
        from sqlalchemy import func

        from ...database.models import SearchCall
        from ...database.models.benchmark import BenchmarkResult, BenchmarkRun
        from ...database.session_context import get_user_db_session
        from flask import session as flask_session

        username = flask_session["username"]
        with get_user_db_session(username) as session:
            # Get all benchmark runs (completed, failed, cancelled, or in-progress)
            runs = (
                session.query(BenchmarkRun)
                .order_by(BenchmarkRun.created_at.desc())
                .limit(50)
                .all()
            )

            # Format runs for display
            formatted_runs = []
            for run in runs:
                # Average processing time across this run's results.
                avg_processing_time = None
                avg_search_results = None
                try:
                    avg_result = (
                        session.query(
                            func.avg(BenchmarkResult.processing_time)
                        )
                        .filter(
                            BenchmarkResult.benchmark_run_id == run.id,
                            BenchmarkResult.processing_time.isnot(None),
                            BenchmarkResult.processing_time > 0,
                        )
                        .scalar()
                    )

                    if avg_result:
                        avg_processing_time = float(avg_result)
                except Exception:
                    logger.warning(
                        f"Error calculating avg processing time for run {run.id}"
                    )

                # Average search results / requests per research session.
                total_search_requests = None
                try:
                    # Get all results for this run to find research_ids
                    results = (
                        session.query(BenchmarkResult)
                        .filter(BenchmarkResult.benchmark_run_id == run.id)
                        .all()
                    )

                    research_ids = [
                        r.research_id for r in results if r.research_id
                    ]

                    if research_ids:
                        # SearchCall is in the same per-user DB, query directly
                        search_calls = (
                            session.query(SearchCall)
                            .filter(SearchCall.research_id.in_(research_ids))
                            .all()
                        )

                        # Group by research_id and accumulate per-session totals
                        research_results = {}
                        research_requests = {}

                        for call in search_calls:
                            if call.research_id:
                                if call.research_id not in research_results:
                                    research_results[call.research_id] = 0
                                    research_requests[call.research_id] = 0
                                research_results[call.research_id] += (
                                    call.results_count or 0
                                )
                                research_requests[call.research_id] += 1

                        # Averages across research sessions
                        if research_results:
                            total_results = sum(research_results.values())
                            avg_search_results = total_results / len(
                                research_results
                            )

                            total_requests = sum(research_requests.values())
                            total_search_requests = total_requests / len(
                                research_requests
                            )

                except Exception:
                    logger.warning(
                        f"Error calculating search metrics for run {run.id}"
                    )

                formatted_runs.append(
                    {
                        "id": run.id,
                        "run_name": run.run_name or f"Benchmark #{run.id}",
                        "created_at": run.created_at.isoformat(),
                        "total_examples": run.total_examples,
                        "completed_examples": run.completed_examples,
                        "overall_accuracy": run.overall_accuracy,
                        "status": run.status.value,
                        "search_config": run.search_config,
                        "evaluation_config": run.evaluation_config,
                        "datasets_config": run.datasets_config,
                        "avg_processing_time": avg_processing_time,
                        "avg_search_results": avg_search_results,
                        "total_search_requests": total_search_requests,
                    }
                )

        return jsonify({"success": True, "runs": formatted_runs})

    except Exception:
        logger.exception("Error getting benchmark history")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500
@benchmark_bp.route("/api/results/<int:benchmark_run_id>", methods=["GET"])
@limiter.exempt
@login_required
def get_benchmark_results(benchmark_run_id: int):
    """Get detailed results for a benchmark run."""
    try:
        from ...database.models.benchmark import BenchmarkResult
        from ...database.session_context import get_user_db_session
        from flask import session as flask_session

        logger.info(f"Getting results for benchmark {benchmark_run_id}")
        username = flask_session["username"]

        # Push any buffered results from active runs into the DB first.
        benchmark_service.sync_pending_results(benchmark_run_id, username)

        with get_user_db_session(username) as session:
            # Most recent results only; caller may override via ?limit=
            limit = int(request.args.get("limit", 10))

            results = (
                session.query(BenchmarkResult)
                .filter(BenchmarkResult.benchmark_run_id == benchmark_run_id)
                .order_by(BenchmarkResult.id.desc())  # newest first
                .limit(limit)
                .all()
            )

            logger.info(f"Found {len(results)} results")

            # research_id -> total search results across its SearchCalls
            search_results_by_research_id = {}
            try:
                from ...database.models import SearchCall

                research_ids = [
                    r.research_id for r in results if r.research_id
                ]

                if research_ids:
                    # SearchCall lives in the same per-user DB
                    all_search_calls = (
                        session.query(SearchCall)
                        .filter(SearchCall.research_id.in_(research_ids))
                        .all()
                    )

                    for call in all_search_calls:
                        rid = call.research_id
                        if not rid:
                            continue
                        search_results_by_research_id[rid] = (
                            search_results_by_research_id.get(rid, 0)
                            + (call.results_count or 0)
                        )

                    logger.info(
                        f"Found search metrics for {len(search_results_by_research_id)} research IDs from {len(all_search_calls)} total search calls"
                    )
                    logger.debug(
                        f"Research IDs from results: {research_ids[:5] if len(research_ids) > 5 else research_ids}"
                    )
                    logger.debug(
                        f"Search results by research_id: {dict(list(search_results_by_research_id.items())[:5])}"
                    )
            except Exception:
                logger.exception(
                    f"Error getting search metrics for benchmark {benchmark_run_id}"
                )

            # Shape each row for the UI.
            formatted_results = []
            for result in results:
                search_result_count = 0
                try:
                    rid = result.research_id
                    if rid and rid in search_results_by_research_id:
                        search_result_count = search_results_by_research_id[
                            rid
                        ]
                        logger.debug(
                            f"Found {search_result_count} search results for research_id {result.research_id}"
                        )
                except Exception:
                    logger.exception(
                        f"Error getting search results for result {result.example_id}"
                    )

                formatted_results.append(
                    {
                        "example_id": result.example_id,
                        "dataset_type": result.dataset_type.value,
                        "question": result.question,
                        "correct_answer": result.correct_answer,
                        "model_answer": result.extracted_answer,
                        "full_response": result.response,
                        "is_correct": result.is_correct,
                        "confidence": result.confidence,
                        "grader_response": result.grader_response,
                        "processing_time": result.processing_time,
                        "search_result_count": search_result_count,
                        "sources": result.sources,
                        "completed_at": result.completed_at.isoformat()
                        if result.completed_at
                        else None,
                    }
                )

        return jsonify({"success": True, "results": formatted_results})

    except Exception:
        logger.exception("Error getting benchmark results")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500
@benchmark_bp.route(
    "/api/results/<int:benchmark_run_id>/export", methods=["GET"]
)
@login_required
def export_benchmark_results(benchmark_run_id: int):
    """Get lightweight results for YAML export (no full_response/sources/grader_response)."""
    try:
        from sqlalchemy.orm import load_only

        from ...database.models.benchmark import BenchmarkResult
        from ...database.session_context import get_user_db_session
        from flask import session as flask_session

        username = flask_session["username"]
        logger.info(
            "Exporting benchmark results for run {} by user {}",
            benchmark_run_id,
            username,
        )

        # Only load the columns the export needs; keeps the query light.
        export_columns = (
            BenchmarkResult.example_id,
            BenchmarkResult.dataset_type,
            BenchmarkResult.question,
            BenchmarkResult.correct_answer,
            BenchmarkResult.extracted_answer,
            BenchmarkResult.is_correct,
            BenchmarkResult.confidence,
            BenchmarkResult.processing_time,
            BenchmarkResult.completed_at,
        )

        with get_user_db_session(username) as session:
            rows = (
                session.query(BenchmarkResult)
                .options(load_only(*export_columns))
                .filter(BenchmarkResult.benchmark_run_id == benchmark_run_id)
                .order_by(BenchmarkResult.id.asc())
                .all()
            )

            formatted = [
                {
                    "example_id": row.example_id,
                    "dataset_type": row.dataset_type.value,
                    "question": row.question,
                    "correct_answer": row.correct_answer,
                    "model_answer": row.extracted_answer,
                    "is_correct": row.is_correct,
                    "confidence": row.confidence,
                    "processing_time": row.processing_time,
                    "completed_at": row.completed_at.isoformat()
                    if row.completed_at
                    else None,
                }
                for row in rows
            ]

        logger.info(
            "Exported {} results for benchmark run {}",
            len(formatted),
            benchmark_run_id,
        )
        return jsonify({"success": True, "results": formatted})

    except Exception:
        logger.exception("Error exporting benchmark results")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500
@benchmark_bp.route("/api/configs", methods=["GET"])
@login_required
def get_saved_configs():
    """Get list of saved benchmark configurations."""
    try:
        # TODO: Implement saved configs retrieval from database
        # For now return hard-coded defaults.
        quick_test = {
            "id": 1,
            "name": "Quick Test",
            "description": "Fast benchmark with minimal examples",
            "search_config": {
                "iterations": 3,
                "questions_per_iteration": 3,
                "search_tool": "searxng",
                "search_strategy": "focused_iteration",
            },
            "datasets_config": {
                "simpleqa": {"count": 10},
                "browsecomp": {"count": 5},
            },
        }
        standard_eval = {
            "id": 2,
            "name": "Standard Evaluation",
            "description": "Comprehensive benchmark with standard settings",
            "search_config": {
                "iterations": 8,
                "questions_per_iteration": 5,
                "search_tool": "searxng",
                "search_strategy": "focused_iteration",
            },
            "datasets_config": {
                "simpleqa": {"count": 50},
                "browsecomp": {"count": 25},
            },
        }

        return jsonify(
            {"success": True, "configs": [quick_test, standard_eval]}
        )

    except Exception:
        logger.exception("Error getting saved configs")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500
@benchmark_bp.route("/api/start-simple", methods=["POST"])
@login_required
@require_json_body()
def start_benchmark_simple():
    """Start a benchmark using the logged-in user's current database settings."""
    try:
        data = request.get_json()
        datasets_config = data.get("datasets_config", {})

        # Reject requests where no dataset asks for any examples.
        has_examples = any(
            cfg.get("count", 0) > 0 for cfg in datasets_config.values()
        )
        if not datasets_config or not has_examples:
            return jsonify(
                {
                    "error": "At least one dataset with count > 0 must be specified"
                }
            ), 400

        from flask import session as flask_session

        username = flask_session["username"]
        session_id = flask_session.get("session_id")

        # Fetch the user's password from the session store so the
        # background benchmark thread can open the per-user database.
        from ...database.session_passwords import session_password_store

        user_password = None
        if session_id:
            user_password = session_password_store.get_session_password(
                username, session_id
            )

        with get_user_db_session(username, user_password) as db_session:
            settings = SettingsManager(db_session)

            # Search configuration mirrors the user's current settings.
            search_config = {
                "iterations": int(
                    settings.get_setting("search.iterations", 8)
                ),
                "questions_per_iteration": int(
                    settings.get_setting("search.questions_per_iteration", 5)
                ),
                "search_tool": settings.get_setting("search.tool", "searxng"),
                "search_strategy": settings.get_setting(
                    "search.search_strategy", "focused_iteration"
                ),
                "model_name": settings.get_setting("llm.model"),
                "provider": settings.get_setting("llm.provider"),
                "temperature": float(
                    settings.get_setting("llm.temperature", 0.7)
                ),
                "max_tokens": settings.get_setting("llm.max_tokens", 30000),
                "context_window_unrestricted": settings.get_setting(
                    "llm.context_window_unrestricted", True
                ),
                "context_window_size": settings.get_setting(
                    "llm.context_window_size", 128000
                ),
                "local_context_window_size": settings.get_setting(
                    "llm.local_context_window_size", 8192
                ),
            }

            # Attach credentials for whichever provider is configured.
            provider = normalize_provider(search_config.get("provider"))
            if provider == "openai_endpoint":
                search_config["openai_endpoint_url"] = settings.get_setting(
                    "llm.openai_endpoint.url"
                )
                search_config["openai_endpoint_api_key"] = (
                    settings.get_setting("llm.openai_endpoint.api_key")
                )
            elif provider == "openai":
                search_config["openai_api_key"] = settings.get_setting(
                    "llm.openai.api_key"
                )
            elif provider == "anthropic":
                search_config["anthropic_api_key"] = settings.get_setting(
                    "llm.anthropic.api_key"
                )

            # Evaluation (grader) configuration from database settings.
            evaluation_provider = normalize_provider(
                settings.get_setting(
                    "benchmark.evaluation.provider", "openai_endpoint"
                )
            )
            evaluation_config = {
                "provider": evaluation_provider,
                "model_name": settings.get_setting(
                    "benchmark.evaluation.model",
                    "anthropic/claude-3.7-sonnet",
                ),
                "temperature": float(
                    settings.get_setting(
                        "benchmark.evaluation.temperature", 0
                    )
                ),
            }

            # Attach credentials for the evaluation provider.
            if evaluation_provider == "openai_endpoint":
                evaluation_config["openai_endpoint_url"] = (
                    settings.get_setting(
                        "benchmark.evaluation.endpoint_url",
                        "https://openrouter.ai/api/v1",
                    )
                )
                evaluation_config["openai_endpoint_api_key"] = (
                    settings.get_setting("llm.openai_endpoint.api_key")
                )
            elif evaluation_provider == "openai":
                evaluation_config["openai_api_key"] = settings.get_setting(
                    "llm.openai.api_key"
                )
            elif evaluation_provider == "anthropic":
                evaluation_config["anthropic_api_key"] = (
                    settings.get_setting("llm.anthropic.api_key")
                )

            # Create and start the run.
            benchmark_run_id = benchmark_service.create_benchmark_run(
                run_name=f"Quick Benchmark - {data.get('run_name', '')}",
                search_config=search_config,
                evaluation_config=evaluation_config,
                datasets_config=datasets_config,
                username=username,
                user_password=user_password,
            )

            started = benchmark_service.start_benchmark(
                benchmark_run_id, username, user_password
            )

        if started:
            return jsonify(
                {
                    "success": True,
                    "benchmark_run_id": benchmark_run_id,
                    "message": "Benchmark started with current settings",
                }
            )
        return jsonify(
            {"success": False, "error": "Failed to start benchmark"}
        ), 500

    except Exception:
        logger.exception("Error starting simple benchmark")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500
@benchmark_bp.route("/api/validate-config", methods=["POST"])
@login_required
def validate_config():
    """Validate a benchmark configuration.

    Note: not using @require_json_body because this endpoint returns
    {"valid": False, "errors": [...]} which doesn't match the decorator's
    three standard error formats.
    """
    try:
        # silent=True: a missing or malformed JSON body yields None instead
        # of raising, so it is reported as a validation error ("No data
        # provided") rather than an opaque 500.
        data = request.get_json(silent=True)

        if not isinstance(data, dict):
            return jsonify({"valid": False, "errors": ["No data provided"]})

        errors = []

        # Validate search config
        search_config = data.get("search_config", {})
        if not search_config.get("search_tool"):
            errors.append("Search tool is required")
        if not search_config.get("search_strategy"):
            errors.append("Search strategy is required")

        # Validate datasets config
        datasets_config = data.get("datasets_config", {})
        if not datasets_config:
            errors.append("At least one dataset must be configured")

        total_examples = sum(
            config.get("count", 0) for config in datasets_config.values()
        )
        if total_examples == 0:
            errors.append("Total examples must be greater than 0")

        if total_examples > 1000:
            errors.append(
                "Total examples should not exceed 1000 for web interface"
            )

        return jsonify(
            {
                "valid": len(errors) == 0,
                "errors": errors,
                "total_examples": total_examples,
            }
        )

    except Exception:
        logger.exception("Error validating config")
        return jsonify(
            {"valid": False, "errors": ["An internal error has occurred."]}
        ), 500
@benchmark_bp.route("/api/search-quality", methods=["GET"])
@limiter.exempt
@login_required
def get_search_quality():
    """Get current search quality metrics from rate limiting tracker."""
    try:
        from ...web_search_engines.rate_limiting import get_tracker

        stats = get_tracker().get_search_quality_stats()
        payload = {
            "success": True,
            "search_quality": stats,
            "timestamp": time.time(),
        }
        return jsonify(payload)

    except Exception:
        logger.exception("Error getting search quality")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500
@benchmark_bp.route("/api/delete/<int:benchmark_run_id>", methods=["DELETE"])
@login_required
def delete_benchmark_run(benchmark_run_id: int):
    """Delete a benchmark run and all its results.

    Refuses to delete a run that is still in progress (it must be
    cancelled first). Returns 404 when the run does not exist.
    """
    try:
        from ...database.models.benchmark import (
            BenchmarkProgress,
            BenchmarkResult,
            BenchmarkRun,
            BenchmarkStatus,
        )
        from ...database.session_context import get_user_db_session
        from flask import session as flask_session

        username = flask_session["username"]
        with get_user_db_session(username) as session:
            # Check if benchmark run exists
            benchmark_run = (
                session.query(BenchmarkRun)
                .filter(BenchmarkRun.id == benchmark_run_id)
                .first()
            )

            if not benchmark_run:
                return jsonify(
                    {"success": False, "error": "Benchmark run not found"}
                ), 404

            # Prevent deletion of running benchmarks; compare against the
            # enum (consistent with the /api/running query) rather than the
            # raw string value.
            if benchmark_run.status == BenchmarkStatus.IN_PROGRESS:
                return jsonify(
                    {
                        "success": False,
                        "error": "Cannot delete a running benchmark. Cancel it first.",
                    }
                ), 400

            # Delete related records (cascade should handle this, but being explicit)
            session.query(BenchmarkResult).filter(
                BenchmarkResult.benchmark_run_id == benchmark_run_id
            ).delete()

            session.query(BenchmarkProgress).filter(
                BenchmarkProgress.benchmark_run_id == benchmark_run_id
            ).delete()

            # Delete the benchmark run
            session.delete(benchmark_run)
            session.commit()

            logger.info(f"Deleted benchmark run {benchmark_run_id}")
            return jsonify(
                {
                    "success": True,
                    "message": f"Benchmark run {benchmark_run_id} deleted successfully",
                }
            )

    except Exception:
        logger.exception(f"Error deleting benchmark run {benchmark_run_id}")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500