Coverage for src / local_deep_research / benchmarks / comparison / evaluator.py: 89%

304 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1""" 

2Configuration comparison for Local Deep Research. 

3 

4This module provides functions for comparing different parameter configurations 

5and evaluating their performance across various metrics. 

6""" 

7 

8import os 

9from datetime import datetime, UTC 

10from pathlib import Path 

11from typing import Any, Dict, List, Optional 

12 

13import matplotlib.pyplot as plt 

14import numpy as np 

15from loguru import logger 

16from matplotlib.patches import Circle, RegularPolygon 

17 

18from local_deep_research.benchmarks.efficiency.resource_monitor import ( 

19 ResourceMonitor, 

20) 

21from local_deep_research.benchmarks.efficiency.speed_profiler import ( 

22 SpeedProfiler, 

23) 

24from local_deep_research.benchmarks.optimization.metrics import ( 

25 calculate_combined_score, 

26 calculate_quality_metrics, 

27 calculate_resource_metrics, 

28 calculate_speed_metrics, 

29) 

30from local_deep_research.config.llm_config import get_llm 

31from local_deep_research.config.search_config import get_search 

32from local_deep_research.search_system import AdvancedSearchSystem 

33 

34 

def compare_configurations(
    query: str,
    configurations: List[Dict[str, Any]],
    output_dir: str = "comparison_results",
    model_name: Optional[str] = None,
    provider: Optional[str] = None,
    search_tool: Optional[str] = None,
    repetitions: int = 1,
    metric_weights: Optional[Dict[str, float]] = None,
) -> Dict[str, Any]:
    """
    Compare multiple parameter configurations.

    Each configuration is evaluated ``repetitions`` times; successful runs
    are averaged and combined into a single score, configurations are ranked
    by that score, and a JSON report plus charts are written to
    ``output_dir``.

    Args:
        query: Research query to use for evaluation
        configurations: List of parameter configurations to compare
        output_dir: Directory to save comparison results
        model_name: Name of the LLM model to use
        provider: LLM provider
        search_tool: Search engine to use
        repetitions: Number of repetitions for each configuration
        metric_weights: Dictionary of weights for each metric type

    Returns:
        Dictionary with comparison results (includes "report_path")
    """
    os.makedirs(output_dir, exist_ok=True)

    # Default weighting scheme: quality-dominated, resource usage disabled.
    if metric_weights is None:
        metric_weights = {
            "quality": 0.6,
            "speed": 0.4,
            "resource": 0.0,  # Disabled by default
        }

    # Nothing to compare — bail out with an error payload.
    if not configurations:
        logger.error("No configurations provided for comparison")
        return {"error": "No configurations provided"}

    summaries: List[Dict[str, Any]] = []

    for idx, cfg in enumerate(configurations):
        logger.info(
            f"Evaluating configuration {idx + 1}/{len(configurations)}: {cfg}"
        )

        cfg_label = cfg.get("name", f"Configuration {idx + 1}")
        rep_outcomes: List[Dict[str, Any]] = []

        # Run every repetition; a failing run is recorded but does not
        # abort the remaining runs or configurations.
        for rep in range(repetitions):
            logger.info(
                f"Starting repetition {rep + 1}/{repetitions} for {cfg_label}"
            )
            try:
                outcome = _evaluate_single_configuration(
                    query=query,
                    config=cfg,
                    model_name=model_name,
                    provider=provider,
                    search_tool=search_tool,
                )
            except Exception as e:
                logger.exception(
                    f"Error in {cfg_label}, repetition {rep + 1}"
                )
                rep_outcomes.append({"error": str(e), "success": False})
            else:
                rep_outcomes.append(outcome)
                logger.info(f"Completed repetition {rep + 1} for {cfg_label}")

        if rep_outcomes:
            succeeded = [r for r in rep_outcomes if r.get("success", False)]

            if succeeded:
                # Average metrics over successful runs, then collapse them
                # into one combined score using the caller's weights.
                averaged = _calculate_average_metrics(succeeded)
                combined = calculate_combined_score(
                    metrics={
                        "quality": averaged.get("quality_metrics", {}),
                        "speed": averaged.get("speed_metrics", {}),
                        "resource": averaged.get("resource_metrics", {}),
                    },
                    weights=metric_weights,
                )
                summaries.append(
                    {
                        "name": cfg_label,
                        "configuration": cfg,
                        "success": True,
                        "runs_completed": len(succeeded),
                        "runs_failed": len(rep_outcomes) - len(succeeded),
                        "avg_metrics": averaged,
                        "overall_score": combined,
                        "individual_results": rep_outcomes,
                    }
                )
            else:
                # Every repetition failed for this configuration.
                summaries.append(
                    {
                        "name": cfg_label,
                        "configuration": cfg,
                        "success": False,
                        "runs_completed": 0,
                        "runs_failed": len(rep_outcomes),
                        "error": "All runs failed",
                        "individual_results": rep_outcomes,
                    }
                )

    # Rank successes by combined score (best first); failures trail behind.
    ranked = sorted(
        [s for s in summaries if s.get("success", False)],
        key=lambda s: s.get("overall_score", 0),
        reverse=True,
    )
    ranked.extend(s for s in summaries if not s.get("success", False))

    comparison_report = {
        "query": query,
        "configurations_tested": len(configurations),
        "successful_configurations": len(
            [s for s in summaries if s.get("success", False)]
        ),
        "failed_configurations": len(
            [s for s in summaries if not s.get("success", False)]
        ),
        "repetitions": repetitions,
        "metric_weights": metric_weights,
        "timestamp": datetime.now(UTC).isoformat(),
        "results": ranked,
    }

    # Persist the report through the verified writer (respects the
    # benchmark.allow_file_output setting).
    from ...security.file_write_verifier import write_json_verified

    stamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
    result_file = str(Path(output_dir) / f"comparison_results_{stamp}.json")

    write_json_verified(
        result_file,
        comparison_report,
        "benchmark.allow_file_output",
        context="comparison results",
    )

    # Charts go into a dedicated subdirectory next to the JSON report.
    viz_dir = Path(output_dir) / "visualizations"
    viz_dir.mkdir(parents=True, exist_ok=True)

    _create_comparison_visualizations(
        comparison_report, output_dir=str(viz_dir), timestamp=stamp
    )

    logger.info(f"Comparison completed. Results saved to {result_file}")

    comparison_report["report_path"] = result_file

    return comparison_report

216 

217 

def _evaluate_single_configuration(
    query: str,
    config: Dict[str, Any],
    model_name: Optional[str] = None,
    provider: Optional[str] = None,
    search_tool: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Run one configuration end-to-end and collect its metrics.

    Args:
        query: Research query to evaluate
        config: Configuration parameters
        model_name: Name of the LLM model to use
        provider: LLM provider
        search_tool: Search engine to use

    Returns:
        Dictionary with evaluation results; "success" is False on error.
    """
    # Per-run parameters — values in the config dict override the
    # function-level defaults.
    chosen_model = config.get("model_name", model_name)
    chosen_provider = config.get("provider", provider)
    chosen_search = config.get("search_tool", search_tool)
    iterations = config.get("iterations", 2)
    questions_per_iteration = config.get("questions_per_iteration", 2)
    strategy = config.get("search_strategy", "iterdrag")

    # Profiling starts before any expensive work so setup cost is captured.
    speed_profiler = SpeedProfiler()
    resource_monitor = ResourceMonitor(sampling_interval=0.5)
    speed_profiler.start()
    resource_monitor.start()

    llm = None
    search = None
    system = None
    try:
        # LLM construction is timed separately from the analysis itself.
        with speed_profiler.timer("llm_initialization"):
            llm = get_llm(
                temperature=config.get("temperature", 0.7),
                model_name=chosen_model,
                provider=chosen_provider,
            )

        # Optional explicit search engine; None lets the system choose.
        with speed_profiler.timer("search_initialization"):
            search = None
            if chosen_search:
                search = get_search(
                    chosen_search,
                    llm_instance=llm,
                )

        system = AdvancedSearchSystem(  # type: ignore[call-arg]
            llm=llm,
            search=search,  # type: ignore[arg-type]
            max_iterations=iterations,
            questions_per_iteration=questions_per_iteration,
            strategy_name=strategy,
        )

        with speed_profiler.timer("analysis"):
            results = system.analyze_topic(query)

        # Stop the profilers before computing metrics from their data.
        speed_profiler.stop()
        resource_monitor.stop()

        run_config = dict(config)
        quality_metrics = calculate_quality_metrics(system_config=run_config)
        speed_metrics = calculate_speed_metrics(system_config=run_config)
        resource_metrics = calculate_resource_metrics(
            system_config=run_config
        )

        return {
            "query": query,
            "config": config,
            "success": True,
            "findings_count": len(results.get("findings", [])),
            "knowledge_length": len(results.get("current_knowledge", "")),
            "quality_metrics": quality_metrics,
            "speed_metrics": speed_metrics,
            "resource_metrics": resource_metrics,
            "timing_details": speed_profiler.get_timings(),
            "resource_details": resource_monitor.get_combined_stats(),
        }

    except Exception as e:
        # Make sure profiling is shut down even on failure, then report
        # whatever timing/resource data was gathered up to the error.
        speed_profiler.stop()
        resource_monitor.stop()

        logger.exception("Error evaluating configuration")

        return {
            "query": query,
            "config": config,
            "success": False,
            "error": str(e),
            "timing_details": speed_profiler.get_timings(),
            "resource_details": resource_monitor.get_combined_stats(),
        }
    finally:
        # Best-effort cleanup of whatever got constructed, in reverse
        # dependency order (system -> search -> llm).
        from ...utilities.resource_utils import safe_close

        safe_close(system, "evaluator system")
        safe_close(search, "evaluator search engine")
        safe_close(llm, "evaluator LLM")

341 

342 

343def _calculate_average_metrics(results: List[Dict[str, Any]]) -> Dict[str, Any]: 

344 """ 

345 Calculate average metrics across multiple runs. 

346 

347 Args: 

348 results: List of individual run results 

349 

350 Returns: 

351 Dictionary with averaged metrics 

352 """ 

353 # Check if there are any successful results 

354 if not results: 

355 return {} 

356 

357 # Initialize average metrics 

358 avg_metrics: Dict[str, Any] = { 

359 "quality_metrics": {}, 

360 "speed_metrics": {}, 

361 "resource_metrics": {}, 

362 } 

363 

364 # Quality metrics 

365 quality_keys = set() 

366 for result in results: 

367 quality_metrics = result.get("quality_metrics", {}) 

368 quality_keys.update(quality_metrics.keys()) 

369 

370 for key in quality_keys: 

371 values = [r.get("quality_metrics", {}).get(key) for r in results] 

372 values = [v for v in values if v is not None] 

373 if values: 373 ↛ 370line 373 didn't jump to line 370 because the condition on line 373 was always true

374 avg_metrics["quality_metrics"][key] = sum(values) / len(values) 

375 

376 # Speed metrics 

377 speed_keys = set() 

378 for result in results: 

379 speed_metrics = result.get("speed_metrics", {}) 

380 speed_keys.update(speed_metrics.keys()) 

381 

382 for key in speed_keys: 

383 values = [r.get("speed_metrics", {}).get(key) for r in results] 

384 values = [v for v in values if v is not None] 

385 if values: 385 ↛ 382line 385 didn't jump to line 382 because the condition on line 385 was always true

386 avg_metrics["speed_metrics"][key] = sum(values) / len(values) 

387 

388 # Resource metrics 

389 resource_keys = set() 

390 for result in results: 

391 resource_metrics = result.get("resource_metrics", {}) 

392 resource_keys.update(resource_metrics.keys()) 

393 

394 for key in resource_keys: 

395 values = [r.get("resource_metrics", {}).get(key) for r in results] 

396 values = [v for v in values if v is not None] 

397 if values: 397 ↛ 394line 397 didn't jump to line 394 because the condition on line 397 was always true

398 avg_metrics["resource_metrics"][key] = sum(values) / len(values) 

399 

400 return avg_metrics 

401 

402 

def _create_comparison_visualizations(
    comparison_report: Dict[str, Any], output_dir: str, timestamp: str
):
    """
    Create visualizations for the comparison results.

    Produces an overall-score bar chart, per-category metric charts,
    a spider chart, and a Pareto frontier chart in ``output_dir``.

    Args:
        comparison_report: Comparison report dictionary
        output_dir: Directory to save visualizations
        timestamp: Timestamp string for filenames
    """
    # Only successful configurations carry averaged metrics to plot.
    ok_results = [
        entry
        for entry in comparison_report.get("results", [])
        if entry.get("success", False)
    ]

    if not ok_results:
        logger.warning("No successful configurations to visualize")
        return

    labels = [
        entry.get("name", f"Config {pos + 1}")
        for pos, entry in enumerate(ok_results)
    ]

    out = Path(output_dir)

    # 1. Overall score comparison (horizontal bar chart).
    score_values = [entry.get("overall_score", 0) for entry in ok_results]
    plt.figure(figsize=(12, 6))
    plt.barh(labels, score_values, color="skyblue")
    plt.xlabel("Overall Score")
    plt.ylabel("Configuration")
    plt.title("Configuration Performance Comparison")
    plt.grid(axis="x", linestyle="--", alpha=0.7)
    plt.tight_layout()
    plt.savefig(str(out / f"overall_score_comparison_{timestamp}.png"))
    plt.close()

    # 2. Quality metrics comparison.
    _create_metric_comparison_chart(
        ok_results,
        labels,
        ["overall_quality", "source_count", "lexical_diversity"],
        "quality_metrics",
        "Quality Metrics Comparison",
        str(out / f"quality_metrics_comparison_{timestamp}.png"),
    )

    # 3. Speed metrics comparison.
    _create_metric_comparison_chart(
        ok_results,
        labels,
        ["overall_speed", "total_duration", "duration_per_question"],
        "speed_metrics",
        "Speed Metrics Comparison",
        str(out / f"speed_metrics_comparison_{timestamp}.png"),
    )

    # 4. Resource metrics comparison.
    _create_metric_comparison_chart(
        ok_results,
        labels,
        ["overall_resource", "process_memory_max_mb", "system_cpu_avg"],
        "resource_metrics",
        "Resource Usage Comparison",
        str(out / f"resource_metrics_comparison_{timestamp}.png"),
    )

    # 5. Spider chart for multi-dimensional comparison.
    _create_spider_chart(
        ok_results,
        labels,
        str(out / f"spider_chart_comparison_{timestamp}.png"),
    )

    # 6. Pareto frontier chart for quality vs. speed.
    _create_pareto_chart(
        ok_results,
        str(out / f"pareto_chart_comparison_{timestamp}.png"),
    )

496 

497 

def _create_metric_comparison_chart(
    results: List[Dict[str, Any]],
    config_names: List[str],
    metric_keys: List[str],
    metric_category: str,
    title: str,
    output_path: str,
):
    """
    Create a chart comparing specific metrics across configurations.

    Args:
        results: List of configuration results
        config_names: Names of configurations
        metric_keys: Keys of metrics to compare
        metric_category: Category of metrics (quality_metrics, speed_metrics, etc.)
        title: Chart title
        output_path: Path to save the chart
    """
    # One subplot per metric, stacked vertically.
    fig, axes = plt.subplots(
        len(metric_keys), 1, figsize=(12, 5 * len(metric_keys))
    )

    # plt.subplots returns a bare Axes (not an array) for a single subplot.
    if len(metric_keys) == 1:
        axes = [axes]

    for i, metric_key in enumerate(metric_keys):
        ax = axes[i]

        # Collect the raw metric values; missing values are plotted as 0.
        metric_values = []
        for result in results:
            metrics = result.get("avg_metrics", {}).get(metric_category, {})
            value = metrics.get(metric_key)
            metric_values.append(value if value is not None else 0)

        # For duration metrics, pick ONE unit from the largest value and
        # apply it to every bar. (The previous implementation mutated
        # metric_key and rescaled per configuration inside the value loop,
        # so the unit suffix accumulated once per configuration and bars
        # on the same axis could end up in different units.)
        display_key = metric_key
        if "duration" in metric_key and metric_values:
            peak = max(metric_values)
            if peak > 3600:
                metric_values = [v / 3600 for v in metric_values]
                display_key += " (hours)"
            elif peak > 60:
                metric_values = [v / 60 for v in metric_values]
                display_key += " (minutes)"
            else:
                display_key += " (seconds)"

        # Horizontal bar chart for this metric.
        bars = ax.barh(config_names, metric_values, color="lightblue")
        ax.set_xlabel(display_key.replace("_", " ").title())
        ax.set_title(f"{display_key.replace('_', ' ').title()}")
        ax.grid(axis="x", linestyle="--", alpha=0.7)

        # Annotate each bar with its numeric value just past its end.
        for bar in bars:
            width = bar.get_width()
            label_x_pos = width * 1.01
            ax.text(
                label_x_pos,
                bar.get_y() + bar.get_height() / 2,
                f"{width:.2f}",
                va="center",
            )

    plt.suptitle(title, fontsize=16)
    plt.tight_layout()
    plt.savefig(output_path)
    plt.close()

570 

571 

def _create_spider_chart(
    results: List[Dict[str, Any]], config_names: List[str], output_path: str
):
    """
    Create a spider chart comparing metrics across configurations.

    Builds a custom matplotlib "radar" polar projection on the fly,
    plots one filled polygon per configuration over five normalized
    metric axes, and saves the figure to ``output_path``. On any
    failure a placeholder image containing the error text is saved
    instead, so the caller always gets a file at ``output_path``.

    Args:
        results: List of configuration results
        config_names: Names of configurations
        output_path: Path to save the chart
    """
    # Try to import the radar chart module
    try:
        # Local imports: matplotlib internals needed only for the custom
        # radar projection. NOTE: this `Path` shadows any pathlib.Path in
        # the module namespace for the rest of this function.
        from matplotlib.path import Path
        from matplotlib.projections import register_projection
        from matplotlib.projections.polar import PolarAxes
        from matplotlib.spines import Spine

        def radar_factory(num_vars, frame="circle"):
            """Create a radar chart with `num_vars` axes."""
            # Calculate evenly-spaced axis angles
            theta = np.linspace(0, 2 * np.pi, num_vars, endpoint=False)

            class RadarAxes(PolarAxes):
                # Registered projection name, used via
                # subplot_kw={"projection": "radar"} below.
                name = "radar"

                def __init__(self, *args, **kwargs):
                    super().__init__(*args, **kwargs)
                    # Put the first axis at the top of the chart.
                    self.set_theta_zero_location("N")

                def fill(self, *args, closed=True, **kwargs):
                    # Default to closed polygons so the area is shaded.
                    return super().fill(closed=closed, *args, **kwargs)

                def plot(self, *args, **kwargs):
                    return super().plot(*args, **kwargs)

                def set_varlabels(self, labels):
                    # Place one tick label at each spoke angle.
                    self.set_thetagrids(np.degrees(theta), labels)

                def _gen_axes_patch(self):
                    # Background patch: circle or regular polygon,
                    # depending on the closed-over `frame` argument.
                    if frame == "circle":
                        return Circle((0.5, 0.5), 0.5)
                    if frame == "polygon":
                        return RegularPolygon(
                            (0.5, 0.5), num_vars, radius=0.5, edgecolor="k"
                        )
                    raise ValueError("Unknown value for 'frame': %s" % frame)  # noqa: TRY301 — inside nested method definition, not caught by enclosing try

                def _gen_axes_spines(self):  # type: ignore[misc]
                    # Matching spine (outline) for the chosen frame shape.
                    if frame == "circle":
                        return super()._gen_axes_spines()
                    if frame == "polygon":
                        spine_type = Spine.circular_spine
                        verts = unit_poly_verts(num_vars)
                        vertices = [(0.5, 0.5)] + verts
                        codes = (
                            [Path.MOVETO]
                            + [Path.LINETO] * num_vars
                            + [Path.CLOSEPOLY]
                        )
                        path = Path(vertices, codes)
                        spine = Spine(self, spine_type, path)  # type: ignore[arg-type]
                        spine.set_transform(self.transAxes)
                        return {"polar": spine}
                    raise ValueError("Unknown value for 'frame': %s" % frame)  # noqa: TRY301 — inside nested method definition, not caught by enclosing try

            def unit_poly_verts(num_vars):
                """Return vertices of polygon for radar chart."""
                # Spoke endpoints on the unit circle, mapped into the
                # [0, 1] x [0, 1] axes-fraction coordinate square.
                verts = []
                for i in range(num_vars):
                    angle = theta[i]
                    verts.append(
                        (0.5 * (1 + np.cos(angle)), 0.5 * (1 + np.sin(angle)))
                    )
                return verts

            register_projection(RadarAxes)
            return theta

        # Metrics shown on the chart; "key" is a dotted path into
        # avg_metrics, and "invert" flips metrics where lower is better.
        metrics = [
            {"name": "Quality", "key": "quality_metrics.overall_quality"},
            {"name": "Speed", "key": "speed_metrics.overall_speed"},
            {
                "name": "Sources",
                "key": "quality_metrics.normalized_source_count",
            },
            {
                "name": "Content",
                "key": "quality_metrics.normalized_knowledge_length",
            },
            {
                "name": "Memory",
                "key": "resource_metrics.normalized_memory_usage",
                "invert": True,
            },
        ]

        # Build the radar projection with one spoke per metric.
        spoke_labels = [m["name"] for m in metrics]
        num_vars = len(spoke_labels)
        theta = radar_factory(num_vars)

        fig, ax = plt.subplots(
            figsize=(10, 10), subplot_kw={"projection": "radar"}
        )

        # One distinct color per configuration.
        colors = plt.cm.viridis(np.linspace(0, 1, len(results)))  # type: ignore[attr-defined]

        for i, result in enumerate(results):
            values = []
            for metric in metrics:
                # Extract metric value using the key path (e.g., "quality_metrics.overall_quality")
                # Missing or non-dict intermediate values fall back to 0.
                key_parts = metric["key"].split(".")
                value: Any = result.get("avg_metrics", {})
                for part in key_parts:
                    value = value.get(part, 0) if isinstance(value, dict) else 0

                # Invert if needed (for metrics where lower is better)
                if metric.get("invert", False):
                    value = 1.0 - value

                values.append(value)

            # Plot this configuration
            ax.plot(
                theta,
                values,
                color=colors[i],
                linewidth=2,
                label=config_names[i],
            )
            ax.fill(theta, values, color=colors[i], alpha=0.25)

        # Set chart properties
        ax.set_varlabels(spoke_labels)  # type: ignore[attr-defined]
        plt.legend(loc="best", bbox_to_anchor=(0.5, 0.1))
        plt.title("Multi-Dimensional Configuration Comparison", size=16, y=1.05)
        plt.tight_layout()

        # Save chart
        plt.savefig(output_path)
        plt.close()

    except Exception as e:
        logger.exception("Error creating spider chart")
        # Create a text-based chart as fallback so output_path still exists.
        plt.figure(figsize=(10, 6))
        plt.text(
            0.5,
            0.5,
            f"Spider chart could not be created: {e!s}",
            horizontalalignment="center",
            verticalalignment="center",
        )
        plt.axis("off")
        plt.savefig(output_path)
        plt.close()

731 

732 

def _create_pareto_chart(results: List[Dict[str, Any]], output_path: str):
    """
    Create a Pareto frontier chart showing quality vs. speed tradeoff.

    Args:
        results: List of configuration results
        output_path: Path to save the chart
    """
    # Gather the two axes for every configuration. Speed is plotted as the
    # inverse of total duration so that higher is better on both axes.
    qualities = []
    speeds = []
    labels = []

    for entry in results:
        avg = entry.get("avg_metrics", {})
        q_val = avg.get("quality_metrics", {}).get("overall_quality", 0)
        duration = avg.get("speed_metrics", {}).get("total_duration", 1)
        qualities.append(q_val)
        speeds.append(1.0 / max(duration, 0.001))  # guard against zero duration
        labels.append(entry.get("name", "Configuration"))

    # Scatter all configurations.
    plt.figure(figsize=(10, 8))
    plt.scatter(qualities, speeds, s=100, alpha=0.7)

    # Name each point.
    for idx, label in enumerate(labels):
        plt.annotate(
            label,
            (qualities[idx], speeds[idx]),
            xytext=(5, 5),
            textcoords="offset points",
        )

    # A point is Pareto-optimal when no other point strictly beats it on
    # BOTH axes at once.
    frontier = [
        idx
        for idx, (q, s) in enumerate(zip(qualities, speeds, strict=False))
        if not any(
            q2 > q and s2 > s
            for q2, s2 in zip(qualities, speeds, strict=False)
        )
    ]

    # Order the frontier points by (quality, speed) so the dashed line is
    # drawn left-to-right without crossing itself.
    ordered = sorted(
        zip(
            [qualities[i] for i in frontier],
            [speeds[i] for i in frontier],
            frontier,
            strict=False,
        )
    )
    line_q = [p[0] for p in ordered]
    line_s = [p[1] for p in ordered]
    frontier_order = [p[2] for p in ordered]

    # Dashed red line through the frontier.
    plt.plot(line_q, line_s, "r--", linewidth=2)

    # Hollow red circles around the Pareto-optimal points.
    plt.scatter(
        [qualities[i] for i in frontier_order],
        [speeds[i] for i in frontier_order],
        s=150,
        facecolors="none",
        edgecolors="r",
        linewidth=2,
    )

    # Highlighted labels for the Pareto-optimal configurations.
    for i in frontier_order:
        plt.annotate(
            labels[i],
            (qualities[i], speeds[i]),
            xytext=(8, 8),
            textcoords="offset points",
            bbox={"boxstyle": "round,pad=0.5", "fc": "yellow", "alpha": 0.7},
        )

    plt.xlabel("Quality Score (higher is better)")
    plt.ylabel("Speed Score (higher is better)")
    plt.title("Quality vs. Speed Tradeoff (Pareto Frontier)", size=14)
    plt.grid(True, linestyle="--", alpha=0.7)

    # Caption explaining how to read the chart.
    plt.figtext(
        0.5,
        0.01,
        "Points on the red line are Pareto optimal configurations\n"
        "(no other configuration is better in both quality and speed)",
        ha="center",
        fontsize=10,
        bbox={"boxstyle": "round", "fc": "white", "alpha": 0.7},
    )

    plt.tight_layout()
    plt.savefig(output_path)
    plt.close()