Coverage for src / local_deep_research / benchmarks / comparison / evaluator.py: 84%
301 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-25 01:07 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-25 01:07 +0000
1"""
2Configuration comparison for Local Deep Research.
4This module provides functions for comparing different parameter configurations
5and evaluating their performance across various metrics.
6"""
8import os
9from datetime import datetime, UTC
10from pathlib import Path
11from typing import Any, Dict, List, Optional
13import matplotlib.pyplot as plt
14import numpy as np
15from loguru import logger
16from matplotlib.patches import Circle, RegularPolygon
18from local_deep_research.benchmarks.efficiency.resource_monitor import (
19 ResourceMonitor,
20)
21from local_deep_research.benchmarks.efficiency.speed_profiler import (
22 SpeedProfiler,
23)
24from local_deep_research.benchmarks.optimization.metrics import (
25 calculate_combined_score,
26 calculate_quality_metrics,
27 calculate_resource_metrics,
28 calculate_speed_metrics,
29)
30from local_deep_research.config.llm_config import get_llm
31from local_deep_research.config.search_config import get_search
32from local_deep_research.search_system import AdvancedSearchSystem
def compare_configurations(
    query: str,
    configurations: List[Dict[str, Any]],
    output_dir: str = "comparison_results",
    model_name: Optional[str] = None,
    provider: Optional[str] = None,
    search_tool: Optional[str] = None,
    repetitions: int = 1,
    metric_weights: Optional[Dict[str, float]] = None,
) -> Dict[str, Any]:
    """
    Compare multiple parameter configurations.

    Each configuration is run ``repetitions`` times; successful runs are
    averaged and scored, then all configurations are ranked by that score.

    Args:
        query: Research query to use for evaluation
        configurations: List of parameter configurations to compare
        output_dir: Directory to save comparison results
        model_name: Name of the LLM model to use
        provider: LLM provider
        search_tool: Search engine to use
        repetitions: Number of repetitions for each configuration
        metric_weights: Dictionary of weights for each metric type

    Returns:
        Dictionary with comparison results (includes ``report_path``)
    """
    os.makedirs(output_dir, exist_ok=True)

    # Fall back to the default weighting scheme when none is supplied.
    if metric_weights is None:
        metric_weights = {
            "quality": 0.6,
            "speed": 0.4,
            "resource": 0.0,  # Disabled by default
        }

    # Nothing to compare — bail out early with an error payload.
    if not configurations:
        logger.error("No configurations provided for comparison")
        return {"error": "No configurations provided"}

    results = []

    for idx, config in enumerate(configurations):
        logger.info(
            f"Evaluating configuration {idx + 1}/{len(configurations)}: {config}"
        )
        config_name = config.get("name", f"Configuration {idx + 1}")

        # Collect the outcome of every repetition for this configuration.
        run_outcomes = []
        for rep in range(repetitions):
            logger.info(
                f"Starting repetition {rep + 1}/{repetitions} for {config_name}"
            )
            try:
                run_outcomes.append(
                    _evaluate_single_configuration(
                        query=query,
                        config=config,
                        model_name=model_name,
                        provider=provider,
                        search_tool=search_tool,
                    )
                )
                logger.info(f"Completed repetition {rep + 1} for {config_name}")
            except Exception as e:
                logger.exception(
                    f"Error in {config_name}, repetition {rep + 1}"
                )
                # Record the failure but keep going with remaining runs.
                run_outcomes.append({"error": str(e), "success": False})

        if run_outcomes:
            successful_runs = [
                r for r in run_outcomes if r.get("success", False)
            ]

            if successful_runs:
                # Average the per-run metrics, then collapse them into one score.
                avg_metrics = _calculate_average_metrics(successful_runs)
                overall_score = calculate_combined_score(
                    metrics={
                        "quality": avg_metrics.get("quality_metrics", {}),
                        "speed": avg_metrics.get("speed_metrics", {}),
                        "resource": avg_metrics.get("resource_metrics", {}),
                    },
                    weights=metric_weights,
                )
                summary = {
                    "name": config_name,
                    "configuration": config,
                    "success": True,
                    "runs_completed": len(successful_runs),
                    "runs_failed": len(run_outcomes) - len(successful_runs),
                    "avg_metrics": avg_metrics,
                    "overall_score": overall_score,
                    "individual_results": run_outcomes,
                }
            else:
                # Every repetition failed for this configuration.
                summary = {
                    "name": config_name,
                    "configuration": config,
                    "success": False,
                    "runs_completed": 0,
                    "runs_failed": len(run_outcomes),
                    "error": "All runs failed",
                    "individual_results": run_outcomes,
                }

            results.append(summary)

    # Rank successes by score (best first) and append the failures after them.
    successes = [r for r in results if r.get("success", False)]
    failures = [r for r in results if not r.get("success", False)]
    sorted_results = sorted(
        successes, key=lambda x: x.get("overall_score", 0), reverse=True
    )
    sorted_results.extend(failures)

    comparison_report = {
        "query": query,
        "configurations_tested": len(configurations),
        "successful_configurations": len(successes),
        "failed_configurations": len(failures),
        "repetitions": repetitions,
        "metric_weights": metric_weights,
        "timestamp": datetime.now(UTC).isoformat(),
        "results": sorted_results,
    }

    # Persist the report through the verified writer (enforces the
    # benchmark.allow_file_output security setting).
    from ...security.file_write_verifier import write_json_verified

    timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
    result_file = str(Path(output_dir) / f"comparison_results_{timestamp}.json")

    write_json_verified(
        result_file,
        comparison_report,
        "benchmark.allow_file_output",
        context="comparison results",
    )

    # Render charts alongside the JSON report.
    visualizations_dir = Path(output_dir) / "visualizations"
    visualizations_dir.mkdir(parents=True, exist_ok=True)
    visualizations_dir = str(visualizations_dir)

    _create_comparison_visualizations(
        comparison_report, output_dir=visualizations_dir, timestamp=timestamp
    )

    logger.info(f"Comparison completed. Results saved to {result_file}")

    comparison_report["report_path"] = result_file

    return comparison_report
def _evaluate_single_configuration(
    query: str,
    config: Dict[str, Any],
    model_name: Optional[str] = None,
    provider: Optional[str] = None,
    search_tool: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Evaluate a single configuration.

    Args:
        query: Research query to evaluate
        config: Configuration parameters
        model_name: Name of the LLM model to use
        provider: LLM provider
        search_tool: Search engine to use

    Returns:
        Dictionary with evaluation results; ``success`` is False on error.
    """
    # Values in the configuration dict override the shared defaults.
    chosen_model = config.get("model_name", model_name)
    chosen_provider = config.get("provider", provider)
    chosen_search_tool = config.get("search_tool", search_tool)
    iterations = config.get("iterations", 2)
    questions_per_iteration = config.get("questions_per_iteration", 2)
    search_strategy = config.get("search_strategy", "iterdrag")
    max_results = config.get("max_results", 50)
    max_filtered_results = config.get("max_filtered_results", 20)

    # Profiling starts before any setup so initialization cost is captured.
    profiler = SpeedProfiler()
    monitor = ResourceMonitor(sampling_interval=0.5)
    profiler.start()
    monitor.start()

    try:
        with profiler.timer("llm_initialization"):
            llm = get_llm(
                temperature=config.get("temperature", 0.7),
                model_name=chosen_model,
                provider=chosen_provider,
            )

        with profiler.timer("search_initialization"):
            search = None
            if chosen_search_tool:
                search = get_search(
                    chosen_search_tool,
                    llm_instance=llm,
                    max_results=max_results,
                    max_filtered_results=max_filtered_results,
                )

        # Configure the search system for this run.
        system = AdvancedSearchSystem(llm=llm, search=search)
        system.max_iterations = iterations
        system.questions_per_iteration = questions_per_iteration
        system.strategy_name = search_strategy

        with profiler.timer("analysis"):
            results = system.analyze_topic(query)

        # Stop profiling before computing metrics from the collected data.
        profiler.stop()
        monitor.stop()

        quality_metrics = calculate_quality_metrics(
            results=results,
            system_info={
                "all_links_of_system": getattr(
                    system, "all_links_of_system", []
                )
            },
        )

        speed_metrics = calculate_speed_metrics(
            timing_info=profiler.get_summary(),
            system_info={
                "iterations": iterations,
                "questions_per_iteration": questions_per_iteration,
                "results": results,
            },
        )

        resource_metrics = calculate_resource_metrics(
            resource_info=monitor.get_combined_stats(),
            system_info={
                "iterations": iterations,
                "questions_per_iteration": questions_per_iteration,
                "results": results,
            },
        )

        return {
            "query": query,
            "config": config,
            "success": True,
            "findings_count": len(results.get("findings", [])),
            "knowledge_length": len(results.get("current_knowledge", "")),
            "quality_metrics": quality_metrics,
            "speed_metrics": speed_metrics,
            "resource_metrics": resource_metrics,
            "timing_details": profiler.get_timings(),
            "resource_details": monitor.get_combined_stats(),
        }

    except Exception as e:
        # Make sure profiling is shut down even on failure, then report it.
        profiler.stop()
        monitor.stop()
        logger.exception("Error evaluating configuration")
        return {
            "query": query,
            "config": config,
            "success": False,
            "error": str(e),
            "timing_details": profiler.get_timings(),
            "resource_details": monitor.get_combined_stats(),
        }
351def _calculate_average_metrics(results: List[Dict[str, Any]]) -> Dict[str, Any]:
352 """
353 Calculate average metrics across multiple runs.
355 Args:
356 results: List of individual run results
358 Returns:
359 Dictionary with averaged metrics
360 """
361 # Check if there are any successful results
362 if not results:
363 return {}
365 # Initialize average metrics
366 avg_metrics = {
367 "quality_metrics": {},
368 "speed_metrics": {},
369 "resource_metrics": {},
370 }
372 # Quality metrics
373 quality_keys = set()
374 for result in results:
375 quality_metrics = result.get("quality_metrics", {})
376 quality_keys.update(quality_metrics.keys())
378 for key in quality_keys:
379 values = [r.get("quality_metrics", {}).get(key) for r in results]
380 values = [v for v in values if v is not None]
381 if values: 381 ↛ 378line 381 didn't jump to line 378 because the condition on line 381 was always true
382 avg_metrics["quality_metrics"][key] = sum(values) / len(values)
384 # Speed metrics
385 speed_keys = set()
386 for result in results:
387 speed_metrics = result.get("speed_metrics", {})
388 speed_keys.update(speed_metrics.keys())
390 for key in speed_keys:
391 values = [r.get("speed_metrics", {}).get(key) for r in results]
392 values = [v for v in values if v is not None]
393 if values: 393 ↛ 390line 393 didn't jump to line 390 because the condition on line 393 was always true
394 avg_metrics["speed_metrics"][key] = sum(values) / len(values)
396 # Resource metrics
397 resource_keys = set()
398 for result in results:
399 resource_metrics = result.get("resource_metrics", {})
400 resource_keys.update(resource_metrics.keys())
402 for key in resource_keys:
403 values = [r.get("resource_metrics", {}).get(key) for r in results]
404 values = [v for v in values if v is not None]
405 if values: 405 ↛ 402line 405 didn't jump to line 402 because the condition on line 405 was always true
406 avg_metrics["resource_metrics"][key] = sum(values) / len(values)
408 return avg_metrics
def _create_comparison_visualizations(
    comparison_report: Dict[str, Any], output_dir: str, timestamp: str
):
    """
    Create visualizations for the comparison results.

    Produces an overall-score bar chart, three per-category metric charts,
    a spider chart, and a Pareto frontier chart in ``output_dir``.

    Args:
        comparison_report: Comparison report dictionary
        output_dir: Directory to save visualizations
        timestamp: Timestamp string for filenames
    """
    successful_results = [
        entry
        for entry in comparison_report.get("results", [])
        if entry.get("success", False)
    ]

    # Nothing succeeded — there is nothing meaningful to plot.
    if not successful_results:
        logger.warning("No successful configurations to visualize")
        return

    config_names = [
        entry.get("name", f"Config {idx + 1}")
        for idx, entry in enumerate(successful_results)
    ]

    out = Path(output_dir)

    # 1. Overall score comparison (horizontal bars, one per configuration)
    plt.figure(figsize=(12, 6))
    scores = [entry.get("overall_score", 0) for entry in successful_results]
    plt.barh(config_names, scores, color="skyblue")
    plt.xlabel("Overall Score")
    plt.ylabel("Configuration")
    plt.title("Configuration Performance Comparison")
    plt.grid(axis="x", linestyle="--", alpha=0.7)
    plt.tight_layout()
    plt.savefig(str(out / f"overall_score_comparison_{timestamp}.png"))
    plt.close()

    # 2. Quality metrics comparison
    _create_metric_comparison_chart(
        successful_results,
        config_names,
        ["overall_quality", "source_count", "lexical_diversity"],
        "quality_metrics",
        "Quality Metrics Comparison",
        str(out / f"quality_metrics_comparison_{timestamp}.png"),
    )

    # 3. Speed metrics comparison
    _create_metric_comparison_chart(
        successful_results,
        config_names,
        ["overall_speed", "total_duration", "duration_per_question"],
        "speed_metrics",
        "Speed Metrics Comparison",
        str(out / f"speed_metrics_comparison_{timestamp}.png"),
    )

    # 4. Resource metrics comparison
    _create_metric_comparison_chart(
        successful_results,
        config_names,
        ["overall_resource", "process_memory_max_mb", "system_cpu_avg"],
        "resource_metrics",
        "Resource Usage Comparison",
        str(out / f"resource_metrics_comparison_{timestamp}.png"),
    )

    # 5. Spider chart for multi-dimensional comparison
    _create_spider_chart(
        successful_results,
        config_names,
        str(out / f"spider_chart_comparison_{timestamp}.png"),
    )

    # 6. Pareto frontier chart for quality vs. speed
    _create_pareto_chart(
        successful_results,
        str(out / f"pareto_chart_comparison_{timestamp}.png"),
    )
def _create_metric_comparison_chart(
    results: List[Dict[str, Any]],
    config_names: List[str],
    metric_keys: List[str],
    metric_category: str,
    title: str,
    output_path: str,
):
    """
    Create a chart comparing specific metrics across configurations.

    Fixes two defects in the previous version: the unit suffix for duration
    metrics was appended to ``metric_key`` once per result (yielding labels
    like "total_duration (seconds) (seconds)"), and each result's duration
    was converted according to its own magnitude, so bars within one chart
    could be in different units. The unit is now chosen once, from the
    largest value, and applied uniformly.

    Args:
        results: List of configuration results
        config_names: Names of configurations
        metric_keys: Keys of metrics to compare
        metric_category: Category of metrics (quality_metrics, speed_metrics, etc.)
        title: Chart title
        output_path: Path to save the chart
    """
    # Create figure with multiple subplots (one per metric)
    fig, axes = plt.subplots(
        len(metric_keys), 1, figsize=(12, 5 * len(metric_keys))
    )

    # plt.subplots returns a bare Axes (not an array) for a single subplot.
    if len(metric_keys) == 1:
        axes = [axes]

    for i, metric_key in enumerate(metric_keys):
        ax = axes[i]

        # Raw values per configuration (None when the metric is missing).
        raw_values = [
            result.get("avg_metrics", {})
            .get(metric_category, {})
            .get(metric_key)
            for result in results
        ]

        label = metric_key
        if "duration" in metric_key:
            # Pick ONE unit for the whole chart, based on the largest value,
            # so all bars share a scale and the label carries one suffix.
            max_value = max(
                (v for v in raw_values if v is not None), default=0
            )
            if max_value > 3600:
                divisor, label = 3600, f"{metric_key} (hours)"
            elif max_value > 60:
                divisor, label = 60, f"{metric_key} (minutes)"
            else:
                divisor, label = 1, f"{metric_key} (seconds)"
            raw_values = [
                v / divisor if v is not None else None for v in raw_values
            ]

        # Missing metrics are drawn as zero-length bars.
        metric_values = [v if v is not None else 0 for v in raw_values]

        # Create horizontal bar chart
        bars = ax.barh(config_names, metric_values, color="lightblue")
        ax.set_xlabel(label.replace("_", " ").title())
        ax.set_title(label.replace("_", " ").title())
        ax.grid(axis="x", linestyle="--", alpha=0.7)

        # Add value labels just beyond the end of each bar
        for bar in bars:
            width = bar.get_width()
            ax.text(
                width * 1.01,
                bar.get_y() + bar.get_height() / 2,
                f"{width:.2f}",
                va="center",
            )

    plt.suptitle(title, fontsize=16)
    plt.tight_layout()
    plt.savefig(output_path)
    plt.close()
def _create_spider_chart(
    results: List[Dict[str, Any]], config_names: List[str], output_path: str
):
    """
    Create a spider chart comparing metrics across configurations.

    Registers a custom matplotlib "radar" polar projection, plots one
    filled polygon per configuration over five normalized metric axes,
    and saves the figure. On any failure, a text-only placeholder image
    is written to the same path instead.

    Args:
        results: List of configuration results
        config_names: Names of configurations
        output_path: Path to save the chart
    """
    # Try to import the radar chart module
    try:
        # NOTE: this `Path` is matplotlib's drawing path and shadows the
        # module-level `pathlib.Path` within this function body.
        from matplotlib.path import Path
        from matplotlib.projections import register_projection
        from matplotlib.projections.polar import PolarAxes
        from matplotlib.spines import Spine

        def radar_factory(num_vars, frame="circle"):
            """Create a radar chart with `num_vars` axes."""
            # Calculate evenly-spaced axis angles
            theta = np.linspace(0, 2 * np.pi, num_vars, endpoint=False)

            class RadarAxes(PolarAxes):
                # Projection name used in subplot_kw=dict(projection="radar").
                name = "radar"

                def __init__(self, *args, **kwargs):
                    super().__init__(*args, **kwargs)
                    # Rotate the plot so the first axis points straight up.
                    self.set_theta_zero_location("N")

                def fill(self, *args, closed=True, **kwargs):
                    # Default to closed polygons so the shape wraps around.
                    return super().fill(closed=closed, *args, **kwargs)

                def plot(self, *args, **kwargs):
                    return super().plot(*args, **kwargs)

                def set_varlabels(self, labels):
                    # Place one label at each spoke angle.
                    self.set_thetagrids(np.degrees(theta), labels)

                def _gen_axes_patch(self):
                    # Background patch: circle or regular polygon per `frame`.
                    if frame == "circle":
                        return Circle((0.5, 0.5), 0.5)
                    elif frame == "polygon":
                        return RegularPolygon(
                            (0.5, 0.5), num_vars, radius=0.5, edgecolor="k"
                        )
                    else:
                        raise ValueError(
                            "Unknown value for 'frame': %s" % frame
                        )

                def _gen_axes_spines(self):
                    # Spine (outer border) matching the patch shape.
                    if frame == "circle":
                        return super()._gen_axes_spines()
                    elif frame == "polygon":
                        spine_type = Spine.circular_spine
                        verts = unit_poly_verts(num_vars)
                        # Close the polygon by returning to the center vertex.
                        vertices = [(0.5, 0.5)] + verts
                        codes = (
                            [Path.MOVETO]
                            + [Path.LINETO] * num_vars
                            + [Path.CLOSEPOLY]
                        )
                        path = Path(vertices, codes)
                        spine = Spine(self, spine_type, path)
                        # Draw in axes coordinates so the spine scales with the axes.
                        spine.set_transform(self.transAxes)
                        return {"polar": spine}
                    else:
                        raise ValueError(
                            "Unknown value for 'frame': %s" % frame
                        )

            def unit_poly_verts(num_vars):
                """Return vertices of polygon for radar chart."""
                # Vertices on a radius-0.5 circle centered at (0.5, 0.5).
                verts = []
                for i in range(num_vars):
                    angle = theta[i]
                    verts.append(
                        (0.5 * (1 + np.cos(angle)), 0.5 * (1 + np.sin(angle)))
                    )
                return verts

            register_projection(RadarAxes)
            return theta

        # Select metrics for the spider chart. `key` is a dotted path into
        # avg_metrics; `invert` flags metrics where lower raw values are better.
        metrics = [
            {"name": "Quality", "key": "quality_metrics.overall_quality"},
            {"name": "Speed", "key": "speed_metrics.overall_speed"},
            {
                "name": "Sources",
                "key": "quality_metrics.normalized_source_count",
            },
            {
                "name": "Content",
                "key": "quality_metrics.normalized_knowledge_length",
            },
            {
                "name": "Memory",
                "key": "resource_metrics.normalized_memory_usage",
                "invert": True,
            },
        ]

        # Extract metric values
        spoke_labels = [m["name"] for m in metrics]
        num_vars = len(spoke_labels)
        theta = radar_factory(num_vars)

        fig, ax = plt.subplots(
            figsize=(10, 10), subplot_kw=dict(projection="radar")
        )

        # Color map for different configurations
        colors = plt.cm.viridis(np.linspace(0, 1, len(results)))

        for i, result in enumerate(results):
            values = []
            for metric in metrics:
                # Extract metric value using the key path (e.g., "quality_metrics.overall_quality");
                # missing keys resolve to 0.
                key_parts = metric["key"].split(".")
                value = result.get("avg_metrics", {})
                for part in key_parts:
                    value = value.get(part, 0) if isinstance(value, dict) else 0

                # Invert if needed (for metrics where lower is better).
                # Assumes the metric is normalized to [0, 1] — TODO confirm.
                if metric.get("invert", False):
                    value = 1.0 - value

                values.append(value)

            # Plot this configuration: outline plus translucent fill.
            ax.plot(
                theta,
                values,
                color=colors[i],
                linewidth=2,
                label=config_names[i],
            )
            ax.fill(theta, values, color=colors[i], alpha=0.25)

        # Set chart properties
        ax.set_varlabels(spoke_labels)
        plt.legend(loc="best", bbox_to_anchor=(0.5, 0.1))
        plt.title("Multi-Dimensional Configuration Comparison", size=16, y=1.05)
        plt.tight_layout()

        # Save chart
        plt.savefig(output_path)
        plt.close()

    except Exception as e:
        logger.exception("Error creating spider chart")
        # Create a text-based chart as fallback so the output file still exists.
        plt.figure(figsize=(10, 6))
        plt.text(
            0.5,
            0.5,
            f"Spider chart could not be created: {e!s}",
            horizontalalignment="center",
            verticalalignment="center",
        )
        plt.axis("off")
        plt.savefig(output_path)
        plt.close()
def _create_pareto_chart(results: List[Dict[str, Any]], output_path: str):
    """
    Create a Pareto frontier chart showing quality vs. speed tradeoff.

    Args:
        results: List of configuration results
        output_path: Path to save the chart
    """
    # Build parallel lists of (quality, speed, name) per configuration.
    quality_scores = []
    speed_scores = []
    names = []
    for entry in results:
        avg = entry.get("avg_metrics", {})
        quality_scores.append(
            avg.get("quality_metrics", {}).get("overall_quality", 0)
        )
        # Speed is the inverse of duration so that higher is better on both
        # axes; the floor avoids division by zero.
        duration = avg.get("speed_metrics", {}).get("total_duration", 1)
        speed_scores.append(1.0 / max(duration, 0.001))
        names.append(entry.get("name", "Configuration"))

    # Scatter all configurations.
    plt.figure(figsize=(10, 8))
    plt.scatter(quality_scores, speed_scores, s=100, alpha=0.7)

    # Label every point with its configuration name.
    for idx, label in enumerate(names):
        plt.annotate(
            label,
            (quality_scores[idx], speed_scores[idx]),
            xytext=(5, 5),
            textcoords="offset points",
        )

    # A point is Pareto optimal unless some other point strictly beats it
    # on BOTH axes at once.
    def _dominated(q, s):
        return any(
            q2 > q and s2 > s
            for q2, s2 in zip(quality_scores, speed_scores)
        )

    pareto_points = [
        idx
        for idx, (q, s) in enumerate(zip(quality_scores, speed_scores))
        if not _dominated(q, s)
    ]

    # Sort frontier points by quality (then speed) so the dashed line is
    # drawn in a sensible left-to-right order.
    frontier = sorted(
        (quality_scores[idx], speed_scores[idx], idx)
        for idx in pareto_points
    )
    pareto_quality = [pt[0] for pt in frontier]
    pareto_speed = [pt[1] for pt in frontier]
    pareto_indices = [pt[2] for pt in frontier]

    # Draw the frontier line.
    plt.plot(pareto_quality, pareto_speed, "r--", linewidth=2)

    # Ring the Pareto-optimal points in red.
    plt.scatter(
        [quality_scores[idx] for idx in pareto_indices],
        [speed_scores[idx] for idx in pareto_indices],
        s=150,
        facecolors="none",
        edgecolors="r",
        linewidth=2,
    )

    # Highlight the names of the Pareto-optimal configurations.
    for idx in pareto_indices:
        plt.annotate(
            names[idx],
            (quality_scores[idx], speed_scores[idx]),
            xytext=(8, 8),
            textcoords="offset points",
            bbox=dict(boxstyle="round,pad=0.5", fc="yellow", alpha=0.7),
        )

    # Axis labels, title, grid.
    plt.xlabel("Quality Score (higher is better)")
    plt.ylabel("Speed Score (higher is better)")
    plt.title("Quality vs. Speed Tradeoff (Pareto Frontier)", size=14)
    plt.grid(True, linestyle="--", alpha=0.7)

    # Footer note explaining the frontier.
    plt.figtext(
        0.5,
        0.01,
        "Points on the red line are Pareto optimal configurations\n"
        "(no other configuration is better in both quality and speed)",
        ha="center",
        fontsize=10,
        bbox=dict(boxstyle="round", fc="white", alpha=0.7),
    )

    plt.tight_layout()
    plt.savefig(output_path)
    plt.close()