Coverage for src / local_deep_research / benchmarks / web_api / benchmark_routes.py: 37%
356 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-25 01:07 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-25 01:07 +0000
1"""Flask routes for benchmark web interface."""
3import time
5from flask import Blueprint, jsonify, request
6from loguru import logger
8from ...database.session_context import get_user_db_session
9from ...web.auth.decorators import login_required
10from ...web.utils.rate_limiter import limiter
11from local_deep_research.settings import SettingsManager
12from ...web.utils.templates import render_template_with_defaults
13from .benchmark_service import benchmark_service
15# Create blueprint for benchmark routes
16benchmark_bp = Blueprint("benchmark", __name__, url_prefix="/benchmark")
@benchmark_bp.route("/")
@login_required
def index():
    """Render the benchmark dashboard with the user's evaluation settings.

    The evaluation provider, model, endpoint URL and temperature are read
    from the logged-in user's settings database and handed to the template
    as ``eval_settings``.
    """
    from flask import session as flask_session

    # (template key, settings key, fallback when the setting is absent)
    evaluation_fields = (
        (
            "evaluation_provider",
            "benchmark.evaluation.provider",
            "openai_endpoint",
        ),
        ("evaluation_model", "benchmark.evaluation.model", ""),
        ("evaluation_endpoint_url", "benchmark.evaluation.endpoint_url", ""),
        ("evaluation_temperature", "benchmark.evaluation.temperature", 0),
    )

    current_user = flask_session.get("username")
    with get_user_db_session(current_user) as db:
        manager = SettingsManager(db)
        eval_settings = {
            template_key: manager.get_setting(setting_key, fallback)
            for template_key, setting_key, fallback in evaluation_fields
        }

    return render_template_with_defaults(
        "pages/benchmark.html", eval_settings=eval_settings
    )
@benchmark_bp.route("/results")
@login_required
def results():
    """Render the page that lists past benchmark runs."""
    template_name = "pages/benchmark_results.html"
    return render_template_with_defaults(template_name)
@benchmark_bp.route("/api/start", methods=["POST"])
@login_required
def start_benchmark():
    """Start a new benchmark run.

    Expects a JSON object body with:

    - ``datasets_config`` (required): mapping of dataset name to an object
      whose ``count`` is the number of examples to run. At least one count
      must be greater than zero.
    - ``run_name`` (optional): label stored with the run.
    - ``evaluation_config`` (optional): used verbatim in place of the
      evaluation settings stored in the user's database.

    The search configuration is always built from the logged-in user's
    stored settings, never from the request.

    Returns:
        JSON with ``success`` and ``benchmark_run_id`` when the run starts;
        400 when the body is missing/malformed or no dataset requests any
        examples; 500 when the service fails to start the run or an
        unexpected error occurs.
    """
    try:
        # silent=True: a malformed or non-JSON body yields None here rather
        # than raising an HTTP error that the broad handler below would
        # turn into a 500.
        data = request.get_json(silent=True)

        if not data or not isinstance(data, dict):
            return jsonify({"error": "No data provided"}), 400

        # Validate the requested datasets before doing any database work,
        # tolerating wrongly typed entries instead of crashing on them.
        datasets_config = data.get("datasets_config") or {}
        has_examples = isinstance(datasets_config, dict) and any(
            isinstance(config, dict)
            and isinstance(config.get("count", 0), (int, float))
            and config.get("count", 0) > 0
            for config in datasets_config.values()
        )
        if not has_examples:
            return jsonify(
                {
                    "error": "At least one dataset with count > 0 must be specified"
                }
            ), 400

        # Extract configuration
        run_name = data.get("run_name")

        # Get search config from database instead of request
        from ...database.session_context import get_user_db_session
        from local_deep_research.settings import SettingsManager
        from flask import session as flask_session

        username = flask_session.get("username")
        session_id = flask_session.get("session_id")

        # Try to get password from session store for background thread
        from ...database.session_passwords import session_password_store

        user_password = None
        if session_id:
            user_password = session_password_store.get_session_password(
                username, session_id
            )

        with get_user_db_session(username) as db_session:
            # Use the logged-in user's settings
            settings_manager = SettingsManager(db_session)

            # Build search config from database settings
            search_config = {
                "iterations": int(
                    settings_manager.get_setting("search.iterations", 8)
                ),
                "questions_per_iteration": int(
                    settings_manager.get_setting(
                        "search.questions_per_iteration", 5
                    )
                ),
                "search_tool": settings_manager.get_setting(
                    "search.tool", "searxng"
                ),
                "search_strategy": settings_manager.get_setting(
                    "search.search_strategy", "focused_iteration"
                ),
                "model_name": settings_manager.get_setting("llm.model"),
                "provider": settings_manager.get_setting("llm.provider"),
                "temperature": float(
                    settings_manager.get_setting("llm.temperature", 0.7)
                ),
                "max_tokens": settings_manager.get_setting(
                    "llm.max_tokens", 30000
                ),
                "context_window_unrestricted": settings_manager.get_setting(
                    "llm.context_window_unrestricted", True
                ),
                "context_window_size": settings_manager.get_setting(
                    "llm.context_window_size", 128000
                ),
                "local_context_window_size": settings_manager.get_setting(
                    "llm.local_context_window_size", 4096
                ),
            }

            # Add provider-specific settings
            provider = search_config.get("provider")
            if provider == "openai_endpoint":
                search_config["openai_endpoint_url"] = (
                    settings_manager.get_setting("llm.openai_endpoint.url")
                )
                search_config["openai_endpoint_api_key"] = (
                    settings_manager.get_setting("llm.openai_endpoint.api_key")
                )
            elif provider == "openai":
                search_config["openai_api_key"] = settings_manager.get_setting(
                    "llm.openai.api_key"
                )
            elif provider == "anthropic":
                search_config["anthropic_api_key"] = (
                    settings_manager.get_setting("llm.anthropic.api_key")
                )

            # Get evaluation config from database settings or request
            if "evaluation_config" in data:
                evaluation_config = data["evaluation_config"]
            else:
                # Read evaluation config from database settings
                evaluation_provider = settings_manager.get_setting(
                    "benchmark.evaluation.provider", "openai_endpoint"
                )
                evaluation_model = settings_manager.get_setting(
                    "benchmark.evaluation.model", "anthropic/claude-3.7-sonnet"
                )
                evaluation_temperature = float(
                    settings_manager.get_setting(
                        "benchmark.evaluation.temperature", 0
                    )
                )

                evaluation_config = {
                    "provider": evaluation_provider,
                    "model_name": evaluation_model,
                    "temperature": evaluation_temperature,
                }

                # Add provider-specific settings for evaluation
                if evaluation_provider == "openai_endpoint":
                    evaluation_config["openai_endpoint_url"] = (
                        settings_manager.get_setting(
                            "benchmark.evaluation.endpoint_url",
                            "https://openrouter.ai/api/v1",
                        )
                    )
                    evaluation_config["openai_endpoint_api_key"] = (
                        settings_manager.get_setting(
                            "llm.openai_endpoint.api_key"
                        )
                    )
                elif evaluation_provider == "openai":
                    evaluation_config["openai_api_key"] = (
                        settings_manager.get_setting("llm.openai.api_key")
                    )
                elif evaluation_provider == "anthropic":
                    evaluation_config["anthropic_api_key"] = (
                        settings_manager.get_setting("llm.anthropic.api_key")
                    )

        # Create benchmark run
        benchmark_run_id = benchmark_service.create_benchmark_run(
            run_name=run_name,
            search_config=search_config,
            evaluation_config=evaluation_config,
            datasets_config=datasets_config,
            username=username,
            user_password=user_password,
        )

        # Start benchmark
        success = benchmark_service.start_benchmark(
            benchmark_run_id, username, user_password
        )

        if not success:
            return jsonify(
                {"success": False, "error": "Failed to start benchmark"}
            ), 500

        return jsonify(
            {
                "success": True,
                "benchmark_run_id": benchmark_run_id,
                "message": "Benchmark started successfully",
            }
        )

    except Exception:
        logger.exception("Error starting benchmark")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500
@benchmark_bp.route("/api/running", methods=["GET"])
@login_required
def get_running_benchmark():
    """Report the most recently created in-progress benchmark, if any.

    Responds with the run's id, name and example counters when one is found,
    otherwise with ``success: False`` and an explanatory message. Unexpected
    errors are logged and reported as a 500.
    """
    try:
        from ...database.models.benchmark import BenchmarkRun, BenchmarkStatus
        from ...database.session_context import get_user_db_session
        from flask import session as flask_session

        current_user = flask_session.get("username")
        with get_user_db_session(current_user) as session:
            # Newest first, restricted to runs still marked as in progress
            in_progress_runs = session.query(BenchmarkRun).filter(
                BenchmarkRun.status == BenchmarkStatus.IN_PROGRESS
            )
            active_run = in_progress_runs.order_by(
                BenchmarkRun.created_at.desc()
            ).first()

            if not active_run:
                return jsonify(
                    {"success": False, "message": "No running benchmark found"}
                )

            payload = {
                "success": True,
                "benchmark_run_id": active_run.id,
                "run_name": active_run.run_name,
                "total_examples": active_run.total_examples,
                "completed_examples": active_run.completed_examples,
            }
            return jsonify(payload)

    except Exception:
        logger.exception("Error checking for running benchmark")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500
@benchmark_bp.route("/api/status/<int:benchmark_run_id>", methods=["GET"])
@limiter.exempt
@login_required
def get_benchmark_status(benchmark_run_id: int):
    """Return the benchmark service's status for one run.

    Responds 404 when the service has no status for the run and 500 on an
    unexpected error. The route is exempt from the rate limiter.
    """
    try:
        from flask import session as flask_session

        current_user = flask_session.get("username")
        status = benchmark_service.get_benchmark_status(
            benchmark_run_id, current_user
        )

        # Unknown run: bail out before logging any details
        if not status:
            return jsonify(
                {"success": False, "error": "Benchmark run not found"}
            ), 404

        logger.info(
            f"Returning status for benchmark {benchmark_run_id}: "
            f"completed={status.get('completed_examples')}, "
            f"overall_acc={status.get('overall_accuracy')}, "
            f"avg_time={status.get('avg_time_per_example')}, "
            f"estimated_remaining={status.get('estimated_time_remaining')}"
        )
        return jsonify({"success": True, "status": status})

    except Exception:
        logger.exception("Error getting benchmark status")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500
@benchmark_bp.route("/api/cancel/<int:benchmark_run_id>", methods=["POST"])
@login_required
def cancel_benchmark(benchmark_run_id: int):
    """Ask the benchmark service to cancel a run.

    Responds 500 when the service reports that it could not cancel the run
    or when an unexpected error occurs.
    """
    try:
        from flask import session as flask_session

        current_user = flask_session.get("username")
        cancelled = benchmark_service.cancel_benchmark(
            benchmark_run_id, current_user
        )

        if not cancelled:
            return jsonify(
                {"success": False, "error": "Failed to cancel benchmark"}
            ), 500

        return jsonify(
            {"success": True, "message": "Benchmark cancelled successfully"}
        )

    except Exception:
        logger.exception("Error cancelling benchmark")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500
def _history_avg_processing_time(session, run_id):
    """Return the mean of a run's positive processing times, or None."""
    from sqlalchemy import func

    from ...database.models.benchmark import BenchmarkResult

    avg_result = (
        session.query(func.avg(BenchmarkResult.processing_time))
        .filter(
            BenchmarkResult.benchmark_run_id == run_id,
            BenchmarkResult.processing_time.isnot(None),
            BenchmarkResult.processing_time > 0,
        )
        .scalar()
    )
    return float(avg_result) if avg_result else None


def _history_search_metrics(session, run_id):
    """Return per-research search averages for a run.

    Returns:
        ``(avg_results, avg_requests)``: the average number of search results
        and the average number of search calls per research id that has at
        least one recorded search call, or ``(None, None)`` when the run has
        no research ids or no search calls.
    """
    from ...database.models import SearchCall
    from ...database.models.benchmark import BenchmarkResult
    from ...metrics.search_tracker import get_search_tracker

    # Only the research_id column is needed, so avoid loading whole result
    # rows for every run in the history list.
    id_rows = (
        session.query(BenchmarkResult.research_id)
        .filter(BenchmarkResult.benchmark_run_id == run_id)
        .all()
    )
    research_ids = [research_id for (research_id,) in id_rows if research_id]
    if not research_ids:
        return None, None

    results_per_research = {}
    requests_per_research = {}

    tracker = get_search_tracker()
    with tracker.db.get_session() as metric_session:
        search_calls = (
            metric_session.query(SearchCall)
            .filter(SearchCall.research_id.in_(research_ids))
            .all()
        )

        for call in search_calls:
            if not call.research_id:
                continue
            results_per_research[call.research_id] = results_per_research.get(
                call.research_id, 0
            ) + (call.results_count or 0)
            requests_per_research[call.research_id] = (
                requests_per_research.get(call.research_id, 0) + 1
            )

    if not results_per_research:
        return None, None

    research_count = len(results_per_research)
    return (
        sum(results_per_research.values()) / research_count,
        sum(requests_per_research.values()) / research_count,
    )


@benchmark_bp.route("/api/history", methods=["GET"])
@login_required
def get_benchmark_history():
    """Get list of recent benchmark runs.

    Returns up to 50 runs, newest first, each with its stored configuration
    and counters plus three derived metrics: ``avg_processing_time``,
    ``avg_search_results`` and ``total_search_requests``. Despite its name,
    ``total_search_requests`` holds the average number of search calls per
    research id.

    The derived metrics are best effort: if computing one fails, a warning
    is logged and the metric is reported as ``None`` for that run.
    """
    try:
        from ...database.models.benchmark import BenchmarkRun
        from ...database.session_context import get_user_db_session
        from flask import session as flask_session

        username = flask_session.get("username")
        with get_user_db_session(username) as session:
            # Get all benchmark runs (completed, failed, cancelled, or in-progress)
            runs = (
                session.query(BenchmarkRun)
                .order_by(BenchmarkRun.created_at.desc())
                .limit(50)
                .all()
            )

            # Format runs for display
            formatted_runs = []
            for run in runs:
                avg_processing_time = None
                try:
                    avg_processing_time = _history_avg_processing_time(
                        session, run.id
                    )
                except Exception as e:
                    logger.warning(
                        f"Error calculating avg processing time for run {run.id}: {e}"
                    )

                avg_search_results = None
                total_search_requests = None
                try:
                    avg_search_results, total_search_requests = (
                        _history_search_metrics(session, run.id)
                    )
                except Exception as e:
                    logger.warning(
                        f"Error calculating search metrics for run {run.id}: {e}"
                    )

                formatted_runs.append(
                    {
                        "id": run.id,
                        "run_name": run.run_name or f"Benchmark #{run.id}",
                        "created_at": run.created_at.isoformat(),
                        "total_examples": run.total_examples,
                        "completed_examples": run.completed_examples,
                        "overall_accuracy": run.overall_accuracy,
                        "status": run.status.value,
                        "search_config": run.search_config,
                        "evaluation_config": run.evaluation_config,
                        "datasets_config": run.datasets_config,
                        "avg_processing_time": avg_processing_time,
                        "avg_search_results": avg_search_results,
                        "total_search_requests": total_search_requests,
                    }
                )

        return jsonify({"success": True, "runs": formatted_runs})

    except Exception:
        logger.exception("Error getting benchmark history")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500
@benchmark_bp.route("/api/results/<int:benchmark_run_id>", methods=["GET"])
@limiter.exempt
@login_required
def get_benchmark_results(benchmark_run_id: int):
    """Get detailed results for a benchmark run.

    Query parameters:
        limit: maximum number of results to return, newest first. Defaults
            to 10; a non-integer value falls back to the default and a
            negative value is treated as 0.

    Pending results are synced through the benchmark service before the
    query. Each result also carries ``search_result_count``, the summed
    ``results_count`` of the search calls recorded for its research id; that
    lookup is best effort and falls back to 0 if the metrics query fails.
    """
    try:
        from ...database.models.benchmark import BenchmarkResult
        from ...database.session_context import get_user_db_session
        from flask import session as flask_session

        logger.info(f"Getting results for benchmark {benchmark_run_id}")
        username = flask_session.get("username")

        # type=int makes a malformed ?limit= fall back to the default
        # instead of raising and surfacing as a 500.
        limit = max(request.args.get("limit", default=10, type=int), 0)

        # First sync any pending results from active runs
        benchmark_service.sync_pending_results(benchmark_run_id, username)
        with get_user_db_session(username) as session:
            results = (
                session.query(BenchmarkResult)
                .filter(BenchmarkResult.benchmark_run_id == benchmark_run_id)
                # Results with pending evaluations are included on purpose
                .order_by(BenchmarkResult.id.desc())  # Most recent first
                .limit(limit)
                .all()
            )

            logger.info(f"Found {len(results)} results")

            # Build a map of research_id to total search results
            search_results_by_research_id = {}
            try:
                from ...database.models import SearchCall
                from ...metrics.search_tracker import get_search_tracker

                # Get all unique research_ids from our results
                research_ids = [r.research_id for r in results if r.research_id]

                if research_ids:
                    tracker = get_search_tracker()
                    with tracker.db.get_session() as metric_session:
                        # Get all search calls for these research_ids
                        all_search_calls = (
                            metric_session.query(SearchCall)
                            .filter(SearchCall.research_id.in_(research_ids))
                            .all()
                        )

                        # Group search results by research_id
                        for call in all_search_calls:
                            if call.research_id:
                                search_results_by_research_id[
                                    call.research_id
                                ] = search_results_by_research_id.get(
                                    call.research_id, 0
                                ) + (call.results_count or 0)

                    logger.info(
                        f"Found search metrics for {len(search_results_by_research_id)} research IDs from {len(all_search_calls)} total search calls"
                    )
                    logger.debug(
                        f"Research IDs from results: {research_ids[:5] if len(research_ids) > 5 else research_ids}"
                    )
                    logger.debug(
                        f"Search results by research_id: {dict(list(search_results_by_research_id.items())[:5])}"
                    )
            except Exception:
                logger.exception(
                    f"Error getting search metrics for benchmark {benchmark_run_id}"
                )

            # Format results for UI display
            formatted_results = []
            for result in results:
                # Results without recorded search calls report a count of 0
                search_result_count = 0
                if (
                    result.research_id
                    and result.research_id in search_results_by_research_id
                ):
                    search_result_count = search_results_by_research_id[
                        result.research_id
                    ]
                    logger.debug(
                        f"Found {search_result_count} search results for research_id {result.research_id}"
                    )

                formatted_results.append(
                    {
                        "example_id": result.example_id,
                        "dataset_type": result.dataset_type.value,
                        "question": result.question,
                        "correct_answer": result.correct_answer,
                        "model_answer": result.extracted_answer,
                        "full_response": result.response,
                        "is_correct": result.is_correct,
                        "confidence": result.confidence,
                        "grader_response": result.grader_response,
                        "processing_time": result.processing_time,
                        "search_result_count": search_result_count,
                        "sources": result.sources,
                        "completed_at": result.completed_at.isoformat()
                        if result.completed_at
                        else None,
                    }
                )

        return jsonify({"success": True, "results": formatted_results})

    except Exception:
        logger.exception("Error getting benchmark results")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500
@benchmark_bp.route(
    "/api/results/<int:benchmark_run_id>/export", methods=["GET"]
)
@login_required
def export_benchmark_results(benchmark_run_id: int):
    """Return a trimmed-down result list for YAML export.

    Only the columns needed for the export are loaded; the full response,
    sources and grader response are left out. Results are ordered by
    ascending id.
    """
    try:
        from sqlalchemy.orm import load_only

        from ...database.models.benchmark import BenchmarkResult
        from ...database.session_context import get_user_db_session
        from flask import session as flask_session

        def _as_export_row(row):
            """Map one result row onto the export payload shape."""
            completed = row.completed_at
            return {
                "example_id": row.example_id,
                "dataset_type": row.dataset_type.value,
                "question": row.question,
                "correct_answer": row.correct_answer,
                "model_answer": row.extracted_answer,
                "is_correct": row.is_correct,
                "confidence": row.confidence,
                "processing_time": row.processing_time,
                "completed_at": completed.isoformat() if completed else None,
            }

        username = flask_session.get("username")
        logger.info(
            "Exporting benchmark results for run {} by user {}",
            benchmark_run_id,
            username,
        )

        export_columns = (
            BenchmarkResult.example_id,
            BenchmarkResult.dataset_type,
            BenchmarkResult.question,
            BenchmarkResult.correct_answer,
            BenchmarkResult.extracted_answer,
            BenchmarkResult.is_correct,
            BenchmarkResult.confidence,
            BenchmarkResult.processing_time,
            BenchmarkResult.completed_at,
        )

        with get_user_db_session(username) as session:
            query = session.query(BenchmarkResult).options(
                load_only(*export_columns)
            )
            query = query.filter(
                BenchmarkResult.benchmark_run_id == benchmark_run_id
            )
            rows = query.order_by(BenchmarkResult.id.asc()).all()

            formatted = [_as_export_row(row) for row in rows]

        logger.info(
            "Exported {} results for benchmark run {}",
            len(formatted),
            benchmark_run_id,
        )
        return jsonify({"success": True, "results": formatted})

    except Exception:
        logger.exception("Error exporting benchmark results")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500
@benchmark_bp.route("/api/configs", methods=["GET"])
@login_required
def get_saved_configs():
    """Return the available benchmark presets.

    Saved configurations are not read from the database yet (see the TODO
    below); two built-in presets are returned instead.
    """

    def _preset(
        preset_id, name, description, iterations, questions, simpleqa, browsecomp
    ):
        """Assemble one preset entry in the shape the API returns."""
        return {
            "id": preset_id,
            "name": name,
            "description": description,
            "search_config": {
                "iterations": iterations,
                "questions_per_iteration": questions,
                "search_tool": "searxng",
                "search_strategy": "focused_iteration",
            },
            "datasets_config": {
                "simpleqa": {"count": simpleqa},
                "browsecomp": {"count": browsecomp},
            },
        }

    try:
        # TODO: Implement saved configs retrieval from database
        presets = [
            _preset(
                1,
                "Quick Test",
                "Fast benchmark with minimal examples",
                iterations=3,
                questions=3,
                simpleqa=10,
                browsecomp=5,
            ),
            _preset(
                2,
                "Standard Evaluation",
                "Comprehensive benchmark with standard settings",
                iterations=8,
                questions=5,
                simpleqa=50,
                browsecomp=25,
            ),
        ]

        return jsonify({"success": True, "configs": presets})

    except Exception:
        logger.exception("Error getting saved configs")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500
@benchmark_bp.route("/api/start-simple", methods=["POST"])
@login_required
def start_benchmark_simple():
    """Start a benchmark using current database settings.

    Expects a JSON object body with:

    - ``datasets_config`` (required): mapping of dataset name to an object
      whose ``count`` is the number of examples to run. At least one count
      must be greater than zero.
    - ``run_name`` (optional): appended to the ``"Quick Benchmark - "``
      prefix to form the stored run name.

    Both the search and the evaluation configuration are built from the
    logged-in user's stored settings.

    Returns:
        JSON with ``success`` and ``benchmark_run_id`` when the run starts;
        400 when the body is missing/malformed or no dataset requests any
        examples; 500 when the service fails to start the run or an
        unexpected error occurs.
    """
    try:
        # silent=True: a malformed or non-JSON body yields None here rather
        # than raising an HTTP error that the broad handler below would
        # turn into a 500.
        data = request.get_json(silent=True)

        if not data or not isinstance(data, dict):
            return jsonify({"error": "No data provided"}), 400

        # Validate datasets, tolerating wrongly typed entries instead of
        # crashing on them.
        datasets_config = data.get("datasets_config") or {}
        has_examples = isinstance(datasets_config, dict) and any(
            isinstance(config, dict)
            and isinstance(config.get("count", 0), (int, float))
            and config.get("count", 0) > 0
            for config in datasets_config.values()
        )
        if not has_examples:
            return jsonify(
                {
                    "error": "At least one dataset with count > 0 must be specified"
                }
            ), 400

        # Get current settings from database
        from flask import session as flask_session

        username = flask_session.get("username")
        session_id = flask_session.get("session_id")

        # Try to get password from session store for background thread
        from ...database.session_passwords import session_password_store

        user_password = None
        if session_id:
            user_password = session_password_store.get_session_password(
                username, session_id
            )

        with get_user_db_session(username, user_password) as session:
            settings_manager = SettingsManager(session)

            # Build search config from database settings
            search_config = {
                "iterations": int(
                    settings_manager.get_setting("search.iterations", 8)
                ),
                "questions_per_iteration": int(
                    settings_manager.get_setting(
                        "search.questions_per_iteration", 5
                    )
                ),
                "search_tool": settings_manager.get_setting(
                    "search.tool", "searxng"
                ),
                "search_strategy": settings_manager.get_setting(
                    "search.search_strategy", "focused_iteration"
                ),
                "model_name": settings_manager.get_setting("llm.model"),
                "provider": settings_manager.get_setting("llm.provider"),
                "temperature": float(
                    settings_manager.get_setting("llm.temperature", 0.7)
                ),
                "max_tokens": settings_manager.get_setting(
                    "llm.max_tokens", 30000
                ),
                "context_window_unrestricted": settings_manager.get_setting(
                    "llm.context_window_unrestricted", True
                ),
                "context_window_size": settings_manager.get_setting(
                    "llm.context_window_size", 128000
                ),
                "local_context_window_size": settings_manager.get_setting(
                    "llm.local_context_window_size", 4096
                ),
            }

            # Add provider-specific settings
            provider = search_config.get("provider")
            if provider == "openai_endpoint":
                search_config["openai_endpoint_url"] = (
                    settings_manager.get_setting("llm.openai_endpoint.url")
                )
                search_config["openai_endpoint_api_key"] = (
                    settings_manager.get_setting("llm.openai_endpoint.api_key")
                )
            elif provider == "openai":
                search_config["openai_api_key"] = settings_manager.get_setting(
                    "llm.openai.api_key"
                )
            elif provider == "anthropic":
                search_config["anthropic_api_key"] = (
                    settings_manager.get_setting("llm.anthropic.api_key")
                )

            # Read evaluation config from database settings
            evaluation_provider = settings_manager.get_setting(
                "benchmark.evaluation.provider", "openai_endpoint"
            )
            evaluation_model = settings_manager.get_setting(
                "benchmark.evaluation.model", "anthropic/claude-3.7-sonnet"
            )
            evaluation_temperature = float(
                settings_manager.get_setting(
                    "benchmark.evaluation.temperature", 0
                )
            )

            evaluation_config = {
                "provider": evaluation_provider,
                "model_name": evaluation_model,
                "temperature": evaluation_temperature,
            }

            # Add provider-specific settings for evaluation
            if evaluation_provider == "openai_endpoint":
                evaluation_config["openai_endpoint_url"] = (
                    settings_manager.get_setting(
                        "benchmark.evaluation.endpoint_url",
                        "https://openrouter.ai/api/v1",
                    )
                )
                evaluation_config["openai_endpoint_api_key"] = (
                    settings_manager.get_setting("llm.openai_endpoint.api_key")
                )
            elif evaluation_provider == "openai":
                evaluation_config["openai_api_key"] = (
                    settings_manager.get_setting("llm.openai.api_key")
                )
            elif evaluation_provider == "anthropic":
                evaluation_config["anthropic_api_key"] = (
                    settings_manager.get_setting("llm.anthropic.api_key")
                )

        # Create and start benchmark
        benchmark_run_id = benchmark_service.create_benchmark_run(
            run_name=f"Quick Benchmark - {data.get('run_name', '')}",
            search_config=search_config,
            evaluation_config=evaluation_config,
            datasets_config=datasets_config,
            username=username,
            user_password=user_password,
        )

        success = benchmark_service.start_benchmark(
            benchmark_run_id, username, user_password
        )

        if not success:
            return jsonify(
                {"success": False, "error": "Failed to start benchmark"}
            ), 500

        return jsonify(
            {
                "success": True,
                "benchmark_run_id": benchmark_run_id,
                "message": "Benchmark started with current settings",
            }
        )

    except Exception:
        logger.exception("Error starting simple benchmark")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500
@benchmark_bp.route("/api/validate-config", methods=["POST"])
@login_required
def validate_config():
    """Validate a benchmark configuration.

    Expects a JSON object with ``search_config`` (needs ``search_tool`` and
    ``search_strategy``) and ``datasets_config`` (dataset name mapped to an
    object with a numeric ``count``). The summed count must be greater than
    0 and at most 1000.

    Malformed input (a body that is not a JSON object, sections that are not
    objects, counts that are not non-negative numbers) is reported through
    ``errors`` rather than as a server error.

    Returns:
        JSON with ``valid``, ``errors`` and, when a body was supplied,
        ``total_examples``. A 500 is returned only for unexpected failures.
    """
    try:
        # silent=True: an unparseable body is reported as "No data provided"
        # instead of raising into the broad handler below.
        data = request.get_json(silent=True)

        if not data or not isinstance(data, dict):
            return jsonify({"valid": False, "errors": ["No data provided"]})

        errors = []

        # Validate search config; a wrongly typed section counts as empty
        search_config = data.get("search_config") or {}
        if not isinstance(search_config, dict):
            search_config = {}
        if not search_config.get("search_tool"):
            errors.append("Search tool is required")
        if not search_config.get("search_strategy"):
            errors.append("Search strategy is required")

        # Validate datasets config; a wrongly typed section counts as empty
        datasets_config = data.get("datasets_config") or {}
        if not isinstance(datasets_config, dict):
            datasets_config = {}
        if not datasets_config:
            errors.append("At least one dataset must be configured")

        total_examples = 0
        for dataset_name, config in datasets_config.items():
            count = config.get("count", 0) if isinstance(config, dict) else None
            # bool is a subclass of int, so exclude it explicitly
            if (
                isinstance(count, bool)
                or not isinstance(count, (int, float))
                or count < 0
            ):
                errors.append(
                    f"Invalid example count for dataset '{dataset_name}'"
                )
                continue
            total_examples += count

        if total_examples == 0:
            errors.append("Total examples must be greater than 0")

        if total_examples > 1000:
            errors.append(
                "Total examples should not exceed 1000 for web interface"
            )

        return jsonify(
            {
                "valid": len(errors) == 0,
                "errors": errors,
                "total_examples": total_examples,
            }
        )

    except Exception:
        logger.exception("Error validating config")
        return jsonify(
            {"valid": False, "errors": ["An internal error has occurred."]}
        ), 500
@benchmark_bp.route("/api/search-quality", methods=["GET"])
@limiter.exempt
@login_required
def get_search_quality():
    """Return the rate-limiting tracker's search quality statistics.

    The response also carries the current time as a Unix timestamp. The
    route is exempt from the rate limiter.
    """
    try:
        from ...web_search_engines.rate_limiting import get_tracker

        payload = {
            "success": True,
            "search_quality": get_tracker().get_search_quality_stats(),
            "timestamp": time.time(),
        }
        return jsonify(payload)

    except Exception:
        logger.exception("Error getting search quality")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500
@benchmark_bp.route("/api/delete/<int:benchmark_run_id>", methods=["DELETE"])
@login_required
def delete_benchmark_run(benchmark_run_id: int):
    """Remove a benchmark run together with its results and progress rows.

    Responds 404 when the run does not exist and 400 when it is still in
    progress (it has to be cancelled first). Unexpected errors are logged
    and reported as a 500.
    """
    try:
        from ...database.models.benchmark import (
            BenchmarkProgress,
            BenchmarkResult,
            BenchmarkRun,
        )
        from ...database.session_context import get_user_db_session
        from flask import session as flask_session

        current_user = flask_session.get("username")
        with get_user_db_session(current_user) as session:
            target_run = (
                session.query(BenchmarkRun)
                .filter(BenchmarkRun.id == benchmark_run_id)
                .first()
            )

            # Nothing to delete
            if not target_run:
                return jsonify(
                    {"success": False, "error": "Benchmark run not found"}
                ), 404

            # A run that is still executing must be cancelled first
            if target_run.status.value == "in_progress":
                return jsonify(
                    {
                        "success": False,
                        "error": "Cannot delete a running benchmark. Cancel it first.",
                    }
                ), 400

            # Remove dependent rows explicitly rather than relying on cascade
            result_rows = session.query(BenchmarkResult).filter(
                BenchmarkResult.benchmark_run_id == benchmark_run_id
            )
            result_rows.delete()

            progress_rows = session.query(BenchmarkProgress).filter(
                BenchmarkProgress.benchmark_run_id == benchmark_run_id
            )
            progress_rows.delete()

            # Finally remove the run itself and persist everything
            session.delete(target_run)
            session.commit()

            logger.info(f"Deleted benchmark run {benchmark_run_id}")
            return jsonify(
                {
                    "success": True,
                    "message": f"Benchmark run {benchmark_run_id} deleted successfully",
                }
            )

    except Exception:
        logger.exception(f"Error deleting benchmark run {benchmark_run_id}")
        return jsonify(
            {"success": False, "error": "An internal error has occurred."}
        ), 500