Coverage for src / local_deep_research / benchmarks / comparison / evaluator.py: 89%
304 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
1"""
2Configuration comparison for Local Deep Research.
4This module provides functions for comparing different parameter configurations
5and evaluating their performance across various metrics.
6"""
8import os
9from datetime import datetime, UTC
10from pathlib import Path
11from typing import Any, Dict, List, Optional
13import matplotlib.pyplot as plt
14import numpy as np
15from loguru import logger
16from matplotlib.patches import Circle, RegularPolygon
18from local_deep_research.benchmarks.efficiency.resource_monitor import (
19 ResourceMonitor,
20)
21from local_deep_research.benchmarks.efficiency.speed_profiler import (
22 SpeedProfiler,
23)
24from local_deep_research.benchmarks.optimization.metrics import (
25 calculate_combined_score,
26 calculate_quality_metrics,
27 calculate_resource_metrics,
28 calculate_speed_metrics,
29)
30from local_deep_research.config.llm_config import get_llm
31from local_deep_research.config.search_config import get_search
32from local_deep_research.search_system import AdvancedSearchSystem
def compare_configurations(
    query: str,
    configurations: List[Dict[str, Any]],
    output_dir: str = "comparison_results",
    model_name: Optional[str] = None,
    provider: Optional[str] = None,
    search_tool: Optional[str] = None,
    repetitions: int = 1,
    metric_weights: Optional[Dict[str, float]] = None,
) -> Dict[str, Any]:
    """
    Compare multiple parameter configurations.

    Each configuration is evaluated ``repetitions`` times; successful runs
    are averaged and scored, results are written to a timestamped JSON file
    and a set of visualizations under ``output_dir``.

    Args:
        query: Research query to use for evaluation
        configurations: List of parameter configurations to compare
        output_dir: Directory to save comparison results
        model_name: Name of the LLM model to use
        provider: LLM provider
        search_tool: Search engine to use
        repetitions: Number of repetitions for each configuration
        metric_weights: Dictionary of weights for each metric type

    Returns:
        Dictionary with comparison results (also saved to ``output_dir``),
        including a ``report_path`` entry pointing at the JSON file.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Default metric weights if not provided
    if metric_weights is None:
        metric_weights = {
            "quality": 0.6,
            "speed": 0.4,
            "resource": 0.0,  # Disabled by default
        }

    # Verify valid configurations
    if not configurations:
        logger.error("No configurations provided for comparison")
        return {"error": "No configurations provided"}

    # Results storage
    results = []

    # Process each configuration
    for i, config in enumerate(configurations):
        logger.info(
            f"Evaluating configuration {i + 1}/{len(configurations)}: {config}"
        )

        # Name for this configuration
        config_name = config.get("name", f"Configuration {i + 1}")

        # Results for all repetitions of this configuration
        config_results = []

        # Run multiple repetitions
        for rep in range(repetitions):
            logger.info(
                f"Starting repetition {rep + 1}/{repetitions} for {config_name}"
            )

            try:
                # Run the configuration
                result = _evaluate_single_configuration(
                    query=query,
                    config=config,
                    model_name=model_name,
                    provider=provider,
                    search_tool=search_tool,
                )

                config_results.append(result)
                logger.info(f"Completed repetition {rep + 1} for {config_name}")

            except Exception as e:
                logger.exception(
                    f"Error in {config_name}, repetition {rep + 1}"
                )
                # Add error info but continue with other configurations
                config_results.append({"error": str(e), "success": False})

        # Aggregate metrics across repetitions. Filter out failed runs first.
        successful_runs = [
            r for r in config_results if r.get("success", False)
        ]

        if successful_runs:
            # Calculate average metrics
            avg_metrics = _calculate_average_metrics(successful_runs)

            # Calculate overall score
            overall_score = calculate_combined_score(
                metrics={
                    "quality": avg_metrics.get("quality_metrics", {}),
                    "speed": avg_metrics.get("speed_metrics", {}),
                    "resource": avg_metrics.get("resource_metrics", {}),
                },
                weights=metric_weights,
            )

            result_summary = {
                "name": config_name,
                "configuration": config,
                "success": True,
                "runs_completed": len(successful_runs),
                "runs_failed": len(config_results) - len(successful_runs),
                "avg_metrics": avg_metrics,
                "overall_score": overall_score,
                "individual_results": config_results,
            }
        else:
            # All runs failed — or no runs at all when ``repetitions`` < 1.
            # Fix vs. original: ``result_summary`` was only assigned inside an
            # ``if config_results:`` guard, so an empty run list made the
            # append below raise NameError.
            result_summary = {
                "name": config_name,
                "configuration": config,
                "success": False,
                "runs_completed": 0,
                "runs_failed": len(config_results),
                "error": "All runs failed",
                "individual_results": config_results,
            }

        results.append(result_summary)

    # Sort results by overall score (if available)
    sorted_results = sorted(
        [r for r in results if r.get("success", False)],
        key=lambda x: x.get("overall_score", 0),
        reverse=True,
    )

    # Add failed configurations at the end
    sorted_results.extend([r for r in results if not r.get("success", False)])

    # Create comparison report
    comparison_report = {
        "query": query,
        "configurations_tested": len(configurations),
        "successful_configurations": len(
            [r for r in results if r.get("success", False)]
        ),
        "failed_configurations": len(
            [r for r in results if not r.get("success", False)]
        ),
        "repetitions": repetitions,
        "metric_weights": metric_weights,
        "timestamp": datetime.now(UTC).isoformat(),
        "results": sorted_results,
    }

    # Save results to file (write path is gated by the security verifier)
    from ...security.file_write_verifier import write_json_verified

    timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
    result_file = str(Path(output_dir) / f"comparison_results_{timestamp}.json")

    write_json_verified(
        result_file,
        comparison_report,
        "benchmark.allow_file_output",
        context="comparison results",
    )

    # Generate visualizations
    _viz_dir_path = Path(output_dir) / "visualizations"
    _viz_dir_path.mkdir(parents=True, exist_ok=True)
    visualizations_dir = str(_viz_dir_path)

    _create_comparison_visualizations(
        comparison_report, output_dir=visualizations_dir, timestamp=timestamp
    )

    logger.info(f"Comparison completed. Results saved to {result_file}")

    # Add report path to the result
    comparison_report["report_path"] = result_file

    return comparison_report
def _evaluate_single_configuration(
    query: str,
    config: Dict[str, Any],
    model_name: Optional[str] = None,
    provider: Optional[str] = None,
    search_tool: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Evaluate a single configuration.

    Runs one full research pass through ``AdvancedSearchSystem`` while
    profiling wall-clock timing and resource usage. This function never
    raises: errors are captured and reported via ``success: False`` in the
    returned dictionary so the caller can continue with other configurations.

    Args:
        query: Research query to evaluate
        config: Configuration parameters; per-key values here override the
            function-level fallbacks below
        model_name: Name of the LLM model to use (fallback for config["model_name"])
        provider: LLM provider (fallback for config["provider"])
        search_tool: Search engine to use (fallback for config["search_tool"])

    Returns:
        Dictionary with evaluation results; always contains "success",
        "timing_details" and "resource_details" keys
    """
    # Extract configuration parameters (config wins over function arguments)
    config_model_name = config.get("model_name", model_name)
    config_provider = config.get("provider", provider)
    config_search_tool = config.get("search_tool", search_tool)
    config_iterations = config.get("iterations", 2)
    config_questions_per_iteration = config.get("questions_per_iteration", 2)
    config_search_strategy = config.get("search_strategy", "iterdrag")

    # Initialize profiling tools
    speed_profiler = SpeedProfiler()
    resource_monitor = ResourceMonitor(sampling_interval=0.5)

    # Start profiling before any work so setup time is also captured
    speed_profiler.start()
    resource_monitor.start()

    # Predeclare resources so the finally block can close whatever was
    # actually created, even if a later construction step fails.
    llm = None
    search = None
    system = None
    try:
        # Get LLM (timed as its own phase)
        with speed_profiler.timer("llm_initialization"):
            llm = get_llm(
                temperature=config.get("temperature", 0.7),
                model_name=config_model_name,
                provider=config_provider,
            )

        # Set up search engine if specified
        with speed_profiler.timer("search_initialization"):
            search = None
            if config_search_tool:
                search = get_search(
                    config_search_tool,
                    llm_instance=llm,
                )

        # Create search system
        system = AdvancedSearchSystem(  # type: ignore[call-arg]
            llm=llm,
            search=search,  # type: ignore[arg-type]
            max_iterations=config_iterations,
            questions_per_iteration=config_questions_per_iteration,
            strategy_name=config_search_strategy,
        )

        # Run the analysis (the main timed phase)
        with speed_profiler.timer("analysis"):
            results = system.analyze_topic(query)

        # Stop profiling before computing metrics so metric computation
        # is not counted in the measurements
        speed_profiler.stop()
        resource_monitor.stop()

        # Calculate metrics. NOTE(review): these are derived from the
        # configuration dict only; the analysis results feed just the
        # findings_count/knowledge_length fields below.
        _system_config = dict(config)
        quality_metrics = calculate_quality_metrics(
            system_config=_system_config
        )

        speed_metrics = calculate_speed_metrics(system_config=_system_config)

        resource_metrics = calculate_resource_metrics(
            system_config=_system_config
        )

        # Return comprehensive results
        return {
            "query": query,
            "config": config,
            "success": True,
            "findings_count": len(results.get("findings", [])),
            "knowledge_length": len(results.get("current_knowledge", "")),
            "quality_metrics": quality_metrics,
            "speed_metrics": speed_metrics,
            "resource_metrics": resource_metrics,
            "timing_details": speed_profiler.get_timings(),
            "resource_details": resource_monitor.get_combined_stats(),
        }

    except Exception as e:
        # Stop profiling on error. NOTE(review): if the failure happened
        # after the stop() calls in the try block, this stops a second
        # time — presumably stop() is idempotent; confirm in the profiler
        # implementations.
        speed_profiler.stop()
        resource_monitor.stop()

        # Log the error
        logger.exception("Error evaluating configuration")

        # Return error information (partial timings may still be useful)
        return {
            "query": query,
            "config": config,
            "success": False,
            "error": str(e),
            "timing_details": speed_profiler.get_timings(),
            "resource_details": resource_monitor.get_combined_stats(),
        }
    finally:
        # Best-effort cleanup of whatever resources were created; closes
        # in reverse order of construction.
        from ...utilities.resource_utils import safe_close

        safe_close(system, "evaluator system")
        safe_close(search, "evaluator search engine")
        safe_close(llm, "evaluator LLM")
343def _calculate_average_metrics(results: List[Dict[str, Any]]) -> Dict[str, Any]:
344 """
345 Calculate average metrics across multiple runs.
347 Args:
348 results: List of individual run results
350 Returns:
351 Dictionary with averaged metrics
352 """
353 # Check if there are any successful results
354 if not results:
355 return {}
357 # Initialize average metrics
358 avg_metrics: Dict[str, Any] = {
359 "quality_metrics": {},
360 "speed_metrics": {},
361 "resource_metrics": {},
362 }
364 # Quality metrics
365 quality_keys = set()
366 for result in results:
367 quality_metrics = result.get("quality_metrics", {})
368 quality_keys.update(quality_metrics.keys())
370 for key in quality_keys:
371 values = [r.get("quality_metrics", {}).get(key) for r in results]
372 values = [v for v in values if v is not None]
373 if values: 373 ↛ 370line 373 didn't jump to line 370 because the condition on line 373 was always true
374 avg_metrics["quality_metrics"][key] = sum(values) / len(values)
376 # Speed metrics
377 speed_keys = set()
378 for result in results:
379 speed_metrics = result.get("speed_metrics", {})
380 speed_keys.update(speed_metrics.keys())
382 for key in speed_keys:
383 values = [r.get("speed_metrics", {}).get(key) for r in results]
384 values = [v for v in values if v is not None]
385 if values: 385 ↛ 382line 385 didn't jump to line 382 because the condition on line 385 was always true
386 avg_metrics["speed_metrics"][key] = sum(values) / len(values)
388 # Resource metrics
389 resource_keys = set()
390 for result in results:
391 resource_metrics = result.get("resource_metrics", {})
392 resource_keys.update(resource_metrics.keys())
394 for key in resource_keys:
395 values = [r.get("resource_metrics", {}).get(key) for r in results]
396 values = [v for v in values if v is not None]
397 if values: 397 ↛ 394line 397 didn't jump to line 394 because the condition on line 397 was always true
398 avg_metrics["resource_metrics"][key] = sum(values) / len(values)
400 return avg_metrics
def _create_comparison_visualizations(
    comparison_report: Dict[str, Any], output_dir: str, timestamp: str
):
    """
    Create visualizations for the comparison results.

    Emits six charts: an overall-score bar chart, three per-category metric
    comparisons, a spider chart, and a Pareto frontier chart. Does nothing
    (beyond a warning) when no configuration succeeded.

    Args:
        comparison_report: Comparison report dictionary
        output_dir: Directory to save visualizations
        timestamp: Timestamp string for filenames
    """
    # Only successful configurations can be charted.
    successful_results = [
        entry
        for entry in comparison_report.get("results", [])
        if entry.get("success", False)
    ]

    if not successful_results:
        logger.warning("No successful configurations to visualize")
        return

    out_dir = Path(output_dir)

    # One display name per configuration, falling back to a positional name.
    config_names = []
    for idx, entry in enumerate(successful_results):
        config_names.append(entry.get("name", f"Config {idx + 1}"))

    # 1. Overall score comparison as a horizontal bar chart.
    overall_scores = [
        entry.get("overall_score", 0) for entry in successful_results
    ]
    plt.figure(figsize=(12, 6))
    plt.barh(config_names, overall_scores, color="skyblue")
    plt.xlabel("Overall Score")
    plt.ylabel("Configuration")
    plt.title("Configuration Performance Comparison")
    plt.grid(axis="x", linestyle="--", alpha=0.7)
    plt.tight_layout()
    plt.savefig(str(out_dir / f"overall_score_comparison_{timestamp}.png"))
    plt.close()

    # 2. Quality metrics comparison
    _create_metric_comparison_chart(
        successful_results,
        config_names,
        ["overall_quality", "source_count", "lexical_diversity"],
        "quality_metrics",
        "Quality Metrics Comparison",
        str(out_dir / f"quality_metrics_comparison_{timestamp}.png"),
    )

    # 3. Speed metrics comparison
    _create_metric_comparison_chart(
        successful_results,
        config_names,
        ["overall_speed", "total_duration", "duration_per_question"],
        "speed_metrics",
        "Speed Metrics Comparison",
        str(out_dir / f"speed_metrics_comparison_{timestamp}.png"),
    )

    # 4. Resource metrics comparison
    _create_metric_comparison_chart(
        successful_results,
        config_names,
        ["overall_resource", "process_memory_max_mb", "system_cpu_avg"],
        "resource_metrics",
        "Resource Usage Comparison",
        str(out_dir / f"resource_metrics_comparison_{timestamp}.png"),
    )

    # 5. Spider chart for multi-dimensional comparison
    _create_spider_chart(
        successful_results,
        config_names,
        str(out_dir / f"spider_chart_comparison_{timestamp}.png"),
    )

    # 6. Pareto frontier chart for quality vs. speed
    _create_pareto_chart(
        successful_results,
        str(out_dir / f"pareto_chart_comparison_{timestamp}.png"),
    )
def _create_metric_comparison_chart(
    results: List[Dict[str, Any]],
    config_names: List[str],
    metric_keys: List[str],
    metric_category: str,
    title: str,
    output_path: str,
):
    """
    Create a chart comparing specific metrics across configurations.

    Duration metrics are rescaled into a single unit (seconds, minutes or
    hours) chosen from the largest value, so all bars in a subplot share
    the same unit and the axis label carries exactly one unit suffix.

    Args:
        results: List of configuration results
        config_names: Names of configurations
        metric_keys: Keys of metrics to compare
        metric_category: Category of metrics (quality_metrics, speed_metrics, etc.)
        title: Chart title
        output_path: Path to save the chart
    """
    # Create figure with multiple subplots (one per metric), stacked vertically
    fig, axes = plt.subplots(
        len(metric_keys), 1, figsize=(12, 5 * len(metric_keys))
    )

    # plt.subplots returns a bare Axes (not an array) for a single subplot
    if len(metric_keys) == 1:
        axes = [axes]

    for i, metric_key in enumerate(metric_keys):
        ax = axes[i]

        # Collect the raw values for this metric (entries may be None)
        raw_values = [
            r.get("avg_metrics", {}).get(metric_category, {}).get(metric_key)
            for r in results
        ]

        # Fix vs. original: the unit suffix used to be appended to
        # ``metric_key`` once per result (compounding into e.g.
        # "total_duration (seconds) (seconds)") and each value could be
        # converted into a different unit on the same axis. Pick one scale
        # from the maximum value and apply it uniformly.
        axis_label = metric_key
        if "duration" in metric_key:
            present = [v for v in raw_values if v is not None]
            peak = max(present, default=0)
            if peak > 3600:
                scale = 3600.0  # hours
                axis_label = metric_key + " (hours)"
            elif peak > 60:
                scale = 60.0  # minutes
                axis_label = metric_key + " (minutes)"
            else:
                scale = 1.0
                axis_label = metric_key + " (seconds)"
            raw_values = [
                None if v is None else v / scale for v in raw_values
            ]

        # Missing values are drawn as zero-length bars
        metric_values = [v if v is not None else 0 for v in raw_values]

        # Create horizontal bar chart
        bars = ax.barh(config_names, metric_values, color="lightblue")
        ax.set_xlabel(axis_label.replace("_", " ").title())
        ax.set_title(axis_label.replace("_", " ").title())
        ax.grid(axis="x", linestyle="--", alpha=0.7)

        # Add value labels just past the end of each bar
        for bar in bars:
            width = bar.get_width()
            label_x_pos = width * 1.01
            ax.text(
                label_x_pos,
                bar.get_y() + bar.get_height() / 2,
                f"{width:.2f}",
                va="center",
            )

    plt.suptitle(title, fontsize=16)
    plt.tight_layout()
    plt.savefig(output_path)
    plt.close()
def _create_spider_chart(
    results: List[Dict[str, Any]], config_names: List[str], output_path: str
):
    """
    Create a spider chart comparing metrics across configurations.

    Builds a custom matplotlib "radar" polar projection on the fly, plots
    one polygon per configuration over five normalized metrics, and falls
    back to a text-only image if anything goes wrong.

    Args:
        results: List of configuration results
        config_names: Names of configurations
        output_path: Path to save the chart
    """
    # Try to import the radar chart module
    try:
        # NOTE: this Path is matplotlib.path.Path; it shadows the
        # pathlib.Path imported at module level within this function.
        from matplotlib.path import Path
        from matplotlib.projections import register_projection
        from matplotlib.projections.polar import PolarAxes
        from matplotlib.spines import Spine

        def radar_factory(num_vars, frame="circle"):
            """Create a radar chart with `num_vars` axes."""
            # Calculate evenly-spaced axis angles; captured by the nested
            # class and helper below via closure
            theta = np.linspace(0, 2 * np.pi, num_vars, endpoint=False)

            class RadarAxes(PolarAxes):
                # Projection name used via subplot_kw={"projection": "radar"}
                name = "radar"

                def __init__(self, *args, **kwargs):
                    super().__init__(*args, **kwargs)
                    # Put the first axis at the top of the chart
                    self.set_theta_zero_location("N")

                def fill(self, *args, closed=True, **kwargs):
                    # Close the polygon by default
                    return super().fill(closed=closed, *args, **kwargs)

                def plot(self, *args, **kwargs):
                    return super().plot(*args, **kwargs)

                def set_varlabels(self, labels):
                    # Label each spoke with its metric name
                    self.set_thetagrids(np.degrees(theta), labels)

                def _gen_axes_patch(self):
                    if frame == "circle":
                        return Circle((0.5, 0.5), 0.5)
                    if frame == "polygon":
                        return RegularPolygon(
                            (0.5, 0.5), num_vars, radius=0.5, edgecolor="k"
                        )
                    raise ValueError("Unknown value for 'frame': %s" % frame)  # noqa: TRY301 — inside nested method definition, not caught by enclosing try

                def _gen_axes_spines(self):  # type: ignore[misc]
                    if frame == "circle":
                        return super()._gen_axes_spines()
                    if frame == "polygon":
                        spine_type = Spine.circular_spine
                        verts = unit_poly_verts(num_vars)
                        vertices = [(0.5, 0.5)] + verts
                        codes = (
                            [Path.MOVETO]
                            + [Path.LINETO] * num_vars
                            + [Path.CLOSEPOLY]
                        )
                        path = Path(vertices, codes)
                        spine = Spine(self, spine_type, path)  # type: ignore[arg-type]
                        spine.set_transform(self.transAxes)
                        return {"polar": spine}
                    raise ValueError("Unknown value for 'frame': %s" % frame)  # noqa: TRY301 — inside nested method definition, not caught by enclosing try

            # Defined after RadarAxes but resolved lazily at call time,
            # so the forward reference above is safe
            def unit_poly_verts(num_vars):
                """Return vertices of polygon for radar chart."""
                verts = []
                for i in range(num_vars):
                    angle = theta[i]
                    verts.append(
                        (0.5 * (1 + np.cos(angle)), 0.5 * (1 + np.sin(angle)))
                    )
                return verts

            register_projection(RadarAxes)
            return theta

        # Select metrics for the spider chart. "invert" marks metrics
        # where lower raw values are better; they are flipped below so
        # that larger is better everywhere on the chart.
        metrics = [
            {"name": "Quality", "key": "quality_metrics.overall_quality"},
            {"name": "Speed", "key": "speed_metrics.overall_speed"},
            {
                "name": "Sources",
                "key": "quality_metrics.normalized_source_count",
            },
            {
                "name": "Content",
                "key": "quality_metrics.normalized_knowledge_length",
            },
            {
                "name": "Memory",
                "key": "resource_metrics.normalized_memory_usage",
                "invert": True,
            },
        ]

        # Extract metric values
        spoke_labels = [m["name"] for m in metrics]
        num_vars = len(spoke_labels)
        theta = radar_factory(num_vars)

        fig, ax = plt.subplots(
            figsize=(10, 10), subplot_kw={"projection": "radar"}
        )

        # Color map for different configurations
        colors = plt.cm.viridis(np.linspace(0, 1, len(results)))  # type: ignore[attr-defined]

        for i, result in enumerate(results):
            values = []
            for metric in metrics:
                # Extract metric value using the key path (e.g., "quality_metrics.overall_quality")
                key_parts = metric["key"].split(".")
                value: Any = result.get("avg_metrics", {})
                for part in key_parts:
                    # Missing or non-dict intermediate levels resolve to 0
                    value = value.get(part, 0) if isinstance(value, dict) else 0

                # Invert if needed (for metrics where lower is better)
                if metric.get("invert", False):
                    value = 1.0 - value

                values.append(value)

            # Plot this configuration
            ax.plot(
                theta,
                values,
                color=colors[i],
                linewidth=2,
                label=config_names[i],
            )
            ax.fill(theta, values, color=colors[i], alpha=0.25)

        # Set chart properties
        ax.set_varlabels(spoke_labels)  # type: ignore[attr-defined]
        plt.legend(loc="best", bbox_to_anchor=(0.5, 0.1))
        plt.title("Multi-Dimensional Configuration Comparison", size=16, y=1.05)
        plt.tight_layout()

        # Save chart
        plt.savefig(output_path)
        plt.close()

    except Exception as e:
        logger.exception("Error creating spider chart")
        # Create a text-based chart as fallback so a file always exists
        # at output_path
        plt.figure(figsize=(10, 6))
        plt.text(
            0.5,
            0.5,
            f"Spider chart could not be created: {e!s}",
            horizontalalignment="center",
            verticalalignment="center",
        )
        plt.axis("off")
        plt.savefig(output_path)
        plt.close()
def _create_pareto_chart(results: List[Dict[str, Any]], output_path: str):
    """
    Create a Pareto frontier chart showing quality vs. speed tradeoff.

    Args:
        results: List of configuration results
        output_path: Path to save the chart
    """
    # Build parallel lists of (quality, speed, name). Speed is the inverse
    # of total duration so that "higher is better" holds on both axes.
    quality_scores: List[float] = []
    speed_scores: List[float] = []
    names: List[str] = []

    for entry in results:
        avg = entry.get("avg_metrics", {})
        quality_scores.append(
            avg.get("quality_metrics", {}).get("overall_quality", 0)
        )
        duration = avg.get("speed_metrics", {}).get("total_duration", 1)
        speed_scores.append(1.0 / max(duration, 0.001))  # Avoid division by zero
        names.append(entry.get("name", "Configuration"))

    # Scatter plot of every configuration
    plt.figure(figsize=(10, 8))
    plt.scatter(quality_scores, speed_scores, s=100, alpha=0.7)

    # Annotate each point with its configuration name
    for idx, label in enumerate(names):
        plt.annotate(
            label,
            (quality_scores[idx], speed_scores[idx]),
            xytext=(5, 5),
            textcoords="offset points",
        )

    # A point is Pareto optimal unless some other point strictly beats it
    # on BOTH axes.
    pareto_points = [
        i
        for i, (q, s) in enumerate(
            zip(quality_scores, speed_scores, strict=False)
        )
        if not any(
            q2 > q and s2 > s
            for q2, s2 in zip(quality_scores, speed_scores, strict=False)
        )
    ]

    # Sort the frontier points (by quality, then speed, then index) so the
    # dashed line is drawn left-to-right.
    pareto_sorted = sorted(
        zip(
            [quality_scores[i] for i in pareto_points],
            [speed_scores[i] for i in pareto_points],
            pareto_points,
            strict=False,
        )
    )
    pareto_quality = [p[0] for p in pareto_sorted]
    pareto_speed = [p[1] for p in pareto_sorted]
    pareto_indices = [p[2] for p in pareto_sorted]

    # Draw Pareto frontier line
    plt.plot(pareto_quality, pareto_speed, "r--", linewidth=2)

    # Highlight Pareto optimal points with open red circles
    plt.scatter(
        [quality_scores[i] for i in pareto_indices],
        [speed_scores[i] for i in pareto_indices],
        s=150,
        facecolors="none",
        edgecolors="r",
        linewidth=2,
    )

    # Add labels for Pareto optimal configurations
    for i in pareto_indices:
        plt.annotate(
            names[i],
            (quality_scores[i], speed_scores[i]),
            xytext=(8, 8),
            textcoords="offset points",
            bbox={"boxstyle": "round,pad=0.5", "fc": "yellow", "alpha": 0.7},
        )

    # Set chart properties
    plt.xlabel("Quality Score (higher is better)")
    plt.ylabel("Speed Score (higher is better)")
    plt.title("Quality vs. Speed Tradeoff (Pareto Frontier)", size=14)
    plt.grid(True, linestyle="--", alpha=0.7)

    # Add explanation
    plt.figtext(
        0.5,
        0.01,
        "Points on the red line are Pareto optimal configurations\n"
        "(no other configuration is better in both quality and speed)",
        ha="center",
        fontsize=10,
        bbox={"boxstyle": "round", "fc": "white", "alpha": 0.7},
    )

    plt.tight_layout()
    plt.savefig(output_path)
    plt.close()