Coverage for src/local_deep_research/benchmarks/comparison/evaluator.py: 5%

301 statements  

coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

1""" 

2Configuration comparison for Local Deep Research. 

3 

4This module provides functions for comparing different parameter configurations 

5and evaluating their performance across various metrics. 

6""" 

7 

8import os 

9from datetime import datetime, UTC 

10from pathlib import Path 

11from typing import Any, Dict, List, Optional 

12 

13import matplotlib.pyplot as plt 

14import numpy as np 

15from loguru import logger 

16from matplotlib.patches import Circle, RegularPolygon 

17 

18from local_deep_research.benchmarks.efficiency.resource_monitor import ( 

19 ResourceMonitor, 

20) 

21from local_deep_research.benchmarks.efficiency.speed_profiler import ( 

22 SpeedProfiler, 

23) 

24from local_deep_research.benchmarks.optimization.metrics import ( 

25 calculate_combined_score, 

26 calculate_quality_metrics, 

27 calculate_resource_metrics, 

28 calculate_speed_metrics, 

29) 

30from local_deep_research.config.llm_config import get_llm 

31from local_deep_research.config.search_config import get_search 

32from local_deep_research.search_system import AdvancedSearchSystem 

33 

34 

def compare_configurations(
    query: str,
    configurations: List[Dict[str, Any]],
    output_dir: str = "comparison_results",
    model_name: Optional[str] = None,
    provider: Optional[str] = None,
    search_tool: Optional[str] = None,
    repetitions: int = 1,
    metric_weights: Optional[Dict[str, float]] = None,
) -> Dict[str, Any]:
    """
    Compare multiple parameter configurations.

    Args:
        query: Research query to use for evaluation
        configurations: List of parameter configurations to compare
        output_dir: Directory to save comparison results
        model_name: Name of the LLM model to use
        provider: LLM provider
        search_tool: Search engine to use
        repetitions: Number of repetitions for each configuration
        metric_weights: Dictionary of weights for each metric type

    Returns:
        Dictionary with comparison results
    """
    os.makedirs(output_dir, exist_ok=True)

    # Default metric weights if not provided
    if metric_weights is None:
        metric_weights = {
            "quality": 0.6,
            "speed": 0.4,
            "resource": 0.0,  # Disabled by default
        }

    # Verify valid configurations
    if not configurations:
        logger.error("No configurations provided for comparison")
        return {"error": "No configurations provided"}

    # Results storage
    results = []

    # Process each configuration
    for i, config in enumerate(configurations):
        logger.info(
            f"Evaluating configuration {i + 1}/{len(configurations)}: {config}"
        )

        # Name for this configuration
        config_name = config.get("name", f"Configuration {i + 1}")

        # Results for all repetitions of this configuration
        config_results = []

        # Run multiple repetitions
        for rep in range(repetitions):
            logger.info(
                f"Starting repetition {rep + 1}/{repetitions} for {config_name}"
            )

            try:
                # Run the configuration
                result = _evaluate_single_configuration(
                    query=query,
                    config=config,
                    model_name=model_name,
                    provider=provider,
                    search_tool=search_tool,
                )

                config_results.append(result)
                logger.info(f"Completed repetition {rep + 1} for {config_name}")

            except Exception as e:
                logger.exception(
                    f"Error in {config_name}, repetition {rep + 1}: {e!s}"
                )
                # Add error info but continue with other configurations
                config_results.append({"error": str(e), "success": False})

        # Calculate aggregate metrics across repetitions
        if config_results:
            # Filter out failed runs
            successful_runs = [
                r for r in config_results if r.get("success", False)
            ]

            if successful_runs:
                # Calculate average metrics
                avg_metrics = _calculate_average_metrics(successful_runs)

                # Calculate overall score
                overall_score = calculate_combined_score(
                    quality_metrics=avg_metrics.get("quality_metrics", {}),
                    speed_metrics=avg_metrics.get("speed_metrics", {}),
                    resource_metrics=avg_metrics.get("resource_metrics", {}),
                    weights=metric_weights,
                )

                result_summary = {
                    "name": config_name,
                    "configuration": config,
                    "success": True,
                    "runs_completed": len(successful_runs),
                    "runs_failed": len(config_results) - len(successful_runs),
                    "avg_metrics": avg_metrics,
                    "overall_score": overall_score,
                    "individual_results": config_results,
                }
            else:
                # All runs failed
                result_summary = {
                    "name": config_name,
                    "configuration": config,
                    "success": False,
                    "runs_completed": 0,
                    "runs_failed": len(config_results),
                    "error": "All runs failed",
                    "individual_results": config_results,
                }

            results.append(result_summary)

    # Sort results by overall score (if available)
    sorted_results = sorted(
        [r for r in results if r.get("success", False)],
        key=lambda x: x.get("overall_score", 0),
        reverse=True,
    )

    # Add failed configurations at the end
    sorted_results.extend([r for r in results if not r.get("success", False)])

    # Create comparison report
    comparison_report = {
        "query": query,
        "configurations_tested": len(configurations),
        "successful_configurations": len(
            [r for r in results if r.get("success", False)]
        ),
        "failed_configurations": len(
            [r for r in results if not r.get("success", False)]
        ),
        "repetitions": repetitions,
        "metric_weights": metric_weights,
        "timestamp": datetime.now(UTC).isoformat(),
        "results": sorted_results,
    }

    # Save results to file
    from ...security.file_write_verifier import write_json_verified

    timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
    result_file = str(Path(output_dir) / f"comparison_results_{timestamp}.json")

    write_json_verified(
        result_file,
        comparison_report,
        "benchmark.allow_file_output",
        context="comparison results",
    )

    # Generate visualizations
    visualizations_dir = Path(output_dir) / "visualizations"
    visualizations_dir.mkdir(parents=True, exist_ok=True)
    visualizations_dir = str(visualizations_dir)

    _create_comparison_visualizations(
        comparison_report, output_dir=visualizations_dir, timestamp=timestamp
    )

    logger.info(f"Comparison completed. Results saved to {result_file}")

    # Add report path to the result
    comparison_report["report_path"] = result_file

    return comparison_report

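# Usage sketch (added for illustration, not part of the original module): the
# query and configuration values below are assumed examples showing how
# compare_configurations might be called and how its report could be read.
#
#   example_configs = [
#       {"name": "baseline", "iterations": 1, "questions_per_iteration": 2},
#       {"name": "deeper", "iterations": 3, "questions_per_iteration": 3},
#   ]
#   report = compare_configurations(
#       query="What drives lithium battery degradation?",
#       configurations=example_configs,
#       repetitions=2,
#   )
#   for entry in report["results"]:
#       print(entry["name"], entry.get("overall_score"))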

def _evaluate_single_configuration(
    query: str,
    config: Dict[str, Any],
    model_name: Optional[str] = None,
    provider: Optional[str] = None,
    search_tool: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Evaluate a single configuration.

    Args:
        query: Research query to evaluate
        config: Configuration parameters
        model_name: Name of the LLM model to use
        provider: LLM provider
        search_tool: Search engine to use

    Returns:
        Dictionary with evaluation results
    """
    # Extract configuration parameters
    config_model_name = config.get("model_name", model_name)
    config_provider = config.get("provider", provider)
    config_search_tool = config.get("search_tool", search_tool)
    config_iterations = config.get("iterations", 2)
    config_questions_per_iteration = config.get("questions_per_iteration", 2)
    config_search_strategy = config.get("search_strategy", "iterdrag")
    config_max_results = config.get("max_results", 50)
    config_max_filtered_results = config.get("max_filtered_results", 20)

    # Initialize profiling tools
    speed_profiler = SpeedProfiler()
    resource_monitor = ResourceMonitor(sampling_interval=0.5)

    # Start profiling
    speed_profiler.start()
    resource_monitor.start()

    try:
        # Get LLM
        with speed_profiler.timer("llm_initialization"):
            llm = get_llm(
                temperature=config.get("temperature", 0.7),
                model_name=config_model_name,
                provider=config_provider,
            )

        # Set up search engine if specified
        with speed_profiler.timer("search_initialization"):
            search = None
            if config_search_tool:
                search = get_search(
                    config_search_tool,
                    llm_instance=llm,
                    max_results=config_max_results,
                    max_filtered_results=config_max_filtered_results,
                )

        # Create search system
        system = AdvancedSearchSystem(llm=llm, search=search)
        system.max_iterations = config_iterations
        system.questions_per_iteration = config_questions_per_iteration
        system.strategy_name = config_search_strategy

        # Run the analysis
        with speed_profiler.timer("analysis"):
            results = system.analyze_topic(query)

        # Stop profiling
        speed_profiler.stop()
        resource_monitor.stop()

        # Calculate metrics
        quality_metrics = calculate_quality_metrics(
            results=results,
            system_info={
                "all_links_of_system": getattr(
                    system, "all_links_of_system", []
                )
            },
        )

        speed_metrics = calculate_speed_metrics(
            timing_info=speed_profiler.get_summary(),
            system_info={
                "iterations": config_iterations,
                "questions_per_iteration": config_questions_per_iteration,
                "results": results,
            },
        )

        resource_metrics = calculate_resource_metrics(
            resource_info=resource_monitor.get_combined_stats(),
            system_info={
                "iterations": config_iterations,
                "questions_per_iteration": config_questions_per_iteration,
                "results": results,
            },
        )

        # Return comprehensive results
        return {
            "query": query,
            "config": config,
            "success": True,
            "findings_count": len(results.get("findings", [])),
            "knowledge_length": len(results.get("current_knowledge", "")),
            "quality_metrics": quality_metrics,
            "speed_metrics": speed_metrics,
            "resource_metrics": resource_metrics,
            "timing_details": speed_profiler.get_timings(),
            "resource_details": resource_monitor.get_combined_stats(),
        }

    except Exception as e:
        # Stop profiling on error
        speed_profiler.stop()
        resource_monitor.stop()

        # Log the error
        logger.exception("Error evaluating configuration")

        # Return error information
        return {
            "query": query,
            "config": config,
            "success": False,
            "error": str(e),
            "timing_details": speed_profiler.get_timings(),
            "resource_details": resource_monitor.get_combined_stats(),
        }

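# Note (added, hedged): each repetition returns a flat dict; a sketch of the
# successful shape, with purely illustrative values, is:
#
#   {
#       "query": "...", "config": {...}, "success": True,
#       "findings_count": 12, "knowledge_length": 8421,
#       "quality_metrics": {...}, "speed_metrics": {...},
#       "resource_metrics": {...},
#       "timing_details": {...}, "resource_details": {...},
#   }
#
# Failed runs instead carry "success": False plus an "error" string, which is
# why the aggregation step above filters on r.get("success", False).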

def _calculate_average_metrics(results: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Calculate average metrics across multiple runs.

    Args:
        results: List of individual run results

    Returns:
        Dictionary with averaged metrics
    """
    # Check if there are any successful results
    if not results:
        return {}

    # Initialize average metrics
    avg_metrics = {
        "quality_metrics": {},
        "speed_metrics": {},
        "resource_metrics": {},
    }

    # Quality metrics
    quality_keys = set()
    for result in results:
        quality_metrics = result.get("quality_metrics", {})
        quality_keys.update(quality_metrics.keys())

    for key in quality_keys:
        values = [r.get("quality_metrics", {}).get(key) for r in results]
        values = [v for v in values if v is not None]
        if values:
            avg_metrics["quality_metrics"][key] = sum(values) / len(values)

    # Speed metrics
    speed_keys = set()
    for result in results:
        speed_metrics = result.get("speed_metrics", {})
        speed_keys.update(speed_metrics.keys())

    for key in speed_keys:
        values = [r.get("speed_metrics", {}).get(key) for r in results]
        values = [v for v in values if v is not None]
        if values:
            avg_metrics["speed_metrics"][key] = sum(values) / len(values)

    # Resource metrics
    resource_keys = set()
    for result in results:
        resource_metrics = result.get("resource_metrics", {})
        resource_keys.update(resource_metrics.keys())

    for key in resource_keys:
        values = [r.get("resource_metrics", {}).get(key) for r in results]
        values = [v for v in values if v is not None]
        if values:
            avg_metrics["resource_metrics"][key] = sum(values) / len(values)

    return avg_metrics

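# Worked example (assumed values, added for illustration): averaging two runs
# whose quality_metrics are {"overall_quality": 0.6, "source_count": 10} and
# {"overall_quality": 0.8} yields
#   {"overall_quality": 0.7, "source_count": 10.0}
# Keys missing from a run are filtered out rather than treated as zero, so a
# metric reported by only one run is averaged over that single value.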

def _create_comparison_visualizations(
    comparison_report: Dict[str, Any], output_dir: str, timestamp: str
):
    """
    Create visualizations for the comparison results.

    Args:
        comparison_report: Comparison report dictionary
        output_dir: Directory to save visualizations
        timestamp: Timestamp string for filenames
    """
    # Check if there are successful results
    successful_results = [
        r
        for r in comparison_report.get("results", [])
        if r.get("success", False)
    ]

    if not successful_results:
        logger.warning("No successful configurations to visualize")
        return

    # Extract configuration names
    config_names = [
        r.get("name", f"Config {i + 1}")
        for i, r in enumerate(successful_results)
    ]

    # 1. Overall score comparison
    plt.figure(figsize=(12, 6))
    scores = [r.get("overall_score", 0) for r in successful_results]

    # Create horizontal bar chart
    plt.barh(config_names, scores, color="skyblue")
    plt.xlabel("Overall Score")
    plt.ylabel("Configuration")
    plt.title("Configuration Performance Comparison")
    plt.grid(axis="x", linestyle="--", alpha=0.7)
    plt.tight_layout()
    plt.savefig(
        str(Path(output_dir) / f"overall_score_comparison_{timestamp}.png")
    )
    plt.close()

    # 2. Quality metrics comparison
    quality_metrics = ["overall_quality", "source_count", "lexical_diversity"]
    _create_metric_comparison_chart(
        successful_results,
        config_names,
        quality_metrics,
        "quality_metrics",
        "Quality Metrics Comparison",
        str(Path(output_dir) / f"quality_metrics_comparison_{timestamp}.png"),
    )

    # 3. Speed metrics comparison
    speed_metrics = ["overall_speed", "total_duration", "duration_per_question"]
    _create_metric_comparison_chart(
        successful_results,
        config_names,
        speed_metrics,
        "speed_metrics",
        "Speed Metrics Comparison",
        str(Path(output_dir) / f"speed_metrics_comparison_{timestamp}.png"),
    )

    # 4. Resource metrics comparison
    resource_metrics = [
        "overall_resource",
        "process_memory_max_mb",
        "system_cpu_avg",
    ]
    _create_metric_comparison_chart(
        successful_results,
        config_names,
        resource_metrics,
        "resource_metrics",
        "Resource Usage Comparison",
        str(Path(output_dir) / f"resource_metrics_comparison_{timestamp}.png"),
    )

    # 5. Spider chart for multi-dimensional comparison
    _create_spider_chart(
        successful_results,
        config_names,
        str(Path(output_dir) / f"spider_chart_comparison_{timestamp}.png"),
    )

    # 6. Pareto frontier chart for quality vs. speed
    _create_pareto_chart(
        successful_results,
        str(Path(output_dir) / f"pareto_chart_comparison_{timestamp}.png"),
    )


def _create_metric_comparison_chart(
    results: List[Dict[str, Any]],
    config_names: List[str],
    metric_keys: List[str],
    metric_category: str,
    title: str,
    output_path: str,
):
    """
    Create a chart comparing specific metrics across configurations.

    Args:
        results: List of configuration results
        config_names: Names of configurations
        metric_keys: Keys of metrics to compare
        metric_category: Category of metrics (quality_metrics, speed_metrics, etc.)
        title: Chart title
        output_path: Path to save the chart
    """
    # Create figure with multiple subplots (one per metric)
    fig, axes = plt.subplots(
        len(metric_keys), 1, figsize=(12, 5 * len(metric_keys))
    )

    # Handle case with only one metric
    if len(metric_keys) == 1:
        axes = [axes]

    for i, metric_key in enumerate(metric_keys):
        ax = axes[i]
        # Axis label may gain a unit suffix below without affecting dict lookups
        display_key = metric_key

        # Get metric values
        metric_values = []
        for result in results:
            metrics = result.get("avg_metrics", {}).get(metric_category, {})
            value = metrics.get(metric_key)

            # Scale time values for better visualization
            if "duration" in metric_key and value is not None:
                # Convert to hours if > 1 hour, minutes if > 1 minute,
                # otherwise keep seconds
                if value > 3600:
                    value = value / 3600
                    display_key = f"{metric_key} (hours)"
                elif value > 60:
                    value = value / 60
                    display_key = f"{metric_key} (minutes)"
                else:
                    display_key = f"{metric_key} (seconds)"

            metric_values.append(value if value is not None else 0)

        # Create horizontal bar chart
        bars = ax.barh(config_names, metric_values, color="lightblue")
        ax.set_xlabel(display_key.replace("_", " ").title())
        ax.set_title(display_key.replace("_", " ").title())
        ax.grid(axis="x", linestyle="--", alpha=0.7)

        # Add value labels to bars
        for bar in bars:
            width = bar.get_width()
            label_x_pos = width * 1.01
            ax.text(
                label_x_pos,
                bar.get_y() + bar.get_height() / 2,
                f"{width:.2f}",
                va="center",
            )

    plt.suptitle(title, fontsize=16)
    plt.tight_layout()
    plt.savefig(output_path)
    plt.close()

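# Worked example of the duration scaling above (assumed inputs):
#   5400 s -> 5400 / 3600 = 1.5, plotted with an "(hours)" label
#     90 s ->   90 / 60   = 1.5, plotted with a "(minutes)" label
#     45 s ->   45        = 45,  plotted with a "(seconds)" label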

def _create_spider_chart(
    results: List[Dict[str, Any]], config_names: List[str], output_path: str
):
    """
    Create a spider chart comparing metrics across configurations.

    Args:
        results: List of configuration results
        config_names: Names of configurations
        output_path: Path to save the chart
    """
    # Try to import the radar chart module
    try:
        from matplotlib.path import Path
        from matplotlib.projections import register_projection
        from matplotlib.projections.polar import PolarAxes
        from matplotlib.spines import Spine

        def radar_factory(num_vars, frame="circle"):
            """Create a radar chart with `num_vars` axes."""
            # Calculate evenly-spaced axis angles
            theta = np.linspace(0, 2 * np.pi, num_vars, endpoint=False)

            class RadarAxes(PolarAxes):
                name = "radar"

                def __init__(self, *args, **kwargs):
                    super().__init__(*args, **kwargs)
                    self.set_theta_zero_location("N")

                def fill(self, *args, closed=True, **kwargs):
                    return super().fill(closed=closed, *args, **kwargs)

                def plot(self, *args, **kwargs):
                    return super().plot(*args, **kwargs)

                def set_varlabels(self, labels):
                    self.set_thetagrids(np.degrees(theta), labels)

                def _gen_axes_patch(self):
                    if frame == "circle":
                        return Circle((0.5, 0.5), 0.5)
                    elif frame == "polygon":
                        return RegularPolygon(
                            (0.5, 0.5), num_vars, radius=0.5, edgecolor="k"
                        )
                    else:
                        raise ValueError(
                            "Unknown value for 'frame': %s" % frame
                        )

                def _gen_axes_spines(self):
                    if frame == "circle":
                        return super()._gen_axes_spines()
                    elif frame == "polygon":
                        spine_type = Spine.circular_spine
                        verts = unit_poly_verts(num_vars)
                        vertices = [(0.5, 0.5)] + verts
                        codes = (
                            [Path.MOVETO]
                            + [Path.LINETO] * num_vars
                            + [Path.CLOSEPOLY]
                        )
                        path = Path(vertices, codes)
                        spine = Spine(self, spine_type, path)
                        spine.set_transform(self.transAxes)
                        return {"polar": spine}
                    else:
                        raise ValueError(
                            "Unknown value for 'frame': %s" % frame
                        )

            def unit_poly_verts(num_vars):
                """Return vertices of polygon for radar chart."""
                verts = []
                for i in range(num_vars):
                    angle = theta[i]
                    verts.append(
                        (0.5 * (1 + np.cos(angle)), 0.5 * (1 + np.sin(angle)))
                    )
                return verts

            register_projection(RadarAxes)
            return theta

        # Select metrics for the spider chart
        metrics = [
            {"name": "Quality", "key": "quality_metrics.overall_quality"},
            {"name": "Speed", "key": "speed_metrics.overall_speed"},
            {
                "name": "Sources",
                "key": "quality_metrics.normalized_source_count",
            },
            {
                "name": "Content",
                "key": "quality_metrics.normalized_knowledge_length",
            },
            {
                "name": "Memory",
                "key": "resource_metrics.normalized_memory_usage",
                "invert": True,
            },
        ]

        # Extract metric values
        spoke_labels = [m["name"] for m in metrics]
        num_vars = len(spoke_labels)
        theta = radar_factory(num_vars)

        fig, ax = plt.subplots(
            figsize=(10, 10), subplot_kw=dict(projection="radar")
        )

        # Color map for different configurations
        colors = plt.cm.viridis(np.linspace(0, 1, len(results)))

        for i, result in enumerate(results):
            values = []
            for metric in metrics:
                # Extract metric value using the key path (e.g., "quality_metrics.overall_quality")
                key_parts = metric["key"].split(".")
                value = result.get("avg_metrics", {})
                for part in key_parts:
                    value = value.get(part, 0) if isinstance(value, dict) else 0

                # Invert if needed (for metrics where lower is better)
                if metric.get("invert", False):
                    value = 1.0 - value

                values.append(value)

            # Plot this configuration
            ax.plot(
                theta,
                values,
                color=colors[i],
                linewidth=2,
                label=config_names[i],
            )
            ax.fill(theta, values, color=colors[i], alpha=0.25)

        # Set chart properties
        ax.set_varlabels(spoke_labels)
        plt.legend(loc="best", bbox_to_anchor=(0.5, 0.1))
        plt.title("Multi-Dimensional Configuration Comparison", size=16, y=1.05)
        plt.tight_layout()

        # Save chart
        plt.savefig(output_path)
        plt.close()

    except Exception as e:
        logger.exception("Error creating spider chart")
        # Create a text-based chart as fallback
        plt.figure(figsize=(10, 6))
        plt.text(
            0.5,
            0.5,
            f"Spider chart could not be created: {e!s}",
            horizontalalignment="center",
            verticalalignment="center",
        )
        plt.axis("off")
        plt.savefig(output_path)
        plt.close()

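# Example (assumed data, added for illustration): the dotted key lookup above
# resolves "quality_metrics.overall_quality" against avg_metrics by walking one
# dict level per segment:
#
#   avg = {"quality_metrics": {"overall_quality": 0.72}}
#   value = avg
#   for part in "quality_metrics.overall_quality".split("."):
#       value = value.get(part, 0) if isinstance(value, dict) else 0
#   # value == 0.72; a missing segment yields 0 instead of raising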

def _create_pareto_chart(results: List[Dict[str, Any]], output_path: str):
    """
    Create a Pareto frontier chart showing quality vs. speed tradeoff.

    Args:
        results: List of configuration results
        output_path: Path to save the chart
    """
    # Extract quality and speed metrics
    quality_scores = []
    speed_scores = []
    names = []

    for result in results:
        metrics = result.get("avg_metrics", {})
        quality = metrics.get("quality_metrics", {}).get("overall_quality", 0)

        # For speed, we use inverse of duration (so higher is better)
        duration = metrics.get("speed_metrics", {}).get("total_duration", 1)
        speed = 1.0 / max(duration, 0.001)  # Avoid division by zero

        quality_scores.append(quality)
        speed_scores.append(speed)
        names.append(result.get("name", "Configuration"))

    # Create scatter plot
    plt.figure(figsize=(10, 8))
    plt.scatter(quality_scores, speed_scores, s=100, alpha=0.7)

    # Add labels for each point
    for i, name in enumerate(names):
        plt.annotate(
            name,
            (quality_scores[i], speed_scores[i]),
            xytext=(5, 5),
            textcoords="offset points",
        )

    # Identify Pareto frontier
    pareto_points = []
    for i, (q, s) in enumerate(zip(quality_scores, speed_scores, strict=False)):
        is_pareto = True
        for q2, s2 in zip(quality_scores, speed_scores, strict=False):
            if q2 > q and s2 > s:  # Dominated
                is_pareto = False
                break
        if is_pareto:
            pareto_points.append(i)

    # Highlight Pareto frontier
    pareto_quality = [quality_scores[i] for i in pareto_points]
    pareto_speed = [speed_scores[i] for i in pareto_points]

    # Sort pareto points for line drawing
    pareto_sorted = sorted(
        zip(pareto_quality, pareto_speed, pareto_points, strict=False)
    )
    pareto_quality = [p[0] for p in pareto_sorted]
    pareto_speed = [p[1] for p in pareto_sorted]
    pareto_indices = [p[2] for p in pareto_sorted]

    # Draw Pareto frontier line
    plt.plot(pareto_quality, pareto_speed, "r--", linewidth=2)

    # Highlight Pareto optimal points
    plt.scatter(
        [quality_scores[i] for i in pareto_indices],
        [speed_scores[i] for i in pareto_indices],
        s=150,
        facecolors="none",
        edgecolors="r",
        linewidth=2,
    )

    # Add labels for Pareto optimal configurations
    for i in pareto_indices:
        plt.annotate(
            names[i],
            (quality_scores[i], speed_scores[i]),
            xytext=(8, 8),
            textcoords="offset points",
            bbox=dict(boxstyle="round,pad=0.5", fc="yellow", alpha=0.7),
        )

    # Set chart properties
    plt.xlabel("Quality Score (higher is better)")
    plt.ylabel("Speed Score (higher is better)")
    plt.title("Quality vs. Speed Tradeoff (Pareto Frontier)", size=14)
    plt.grid(True, linestyle="--", alpha=0.7)

    # Add explanation
    plt.figtext(
        0.5,
        0.01,
        "Points on the red line are Pareto optimal configurations\n"
        "(no other configuration is better in both quality and speed)",
        ha="center",
        fontsize=10,
        bbox=dict(boxstyle="round", fc="white", alpha=0.7),
    )

    plt.tight_layout()
    plt.savefig(output_path)
    plt.close()
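# Worked example of the dominance test above (assumed points): with
# (quality, speed) pairs A=(0.9, 0.2), B=(0.5, 0.8), and C=(0.4, 0.1), point C
# is dominated because B has both higher quality and higher speed, so only A
# and B end up on the red Pareto frontier line.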