Coverage for src / local_deep_research / benchmarks / comparison / evaluator.py: 84%

301 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-02-25 01:07 +0000

1""" 

2Configuration comparison for Local Deep Research. 

3 

4This module provides functions for comparing different parameter configurations 

5and evaluating their performance across various metrics. 

6""" 

7 

8import os 

9from datetime import datetime, UTC 

10from pathlib import Path 

11from typing import Any, Dict, List, Optional 

12 

13import matplotlib.pyplot as plt 

14import numpy as np 

15from loguru import logger 

16from matplotlib.patches import Circle, RegularPolygon 

17 

18from local_deep_research.benchmarks.efficiency.resource_monitor import ( 

19 ResourceMonitor, 

20) 

21from local_deep_research.benchmarks.efficiency.speed_profiler import ( 

22 SpeedProfiler, 

23) 

24from local_deep_research.benchmarks.optimization.metrics import ( 

25 calculate_combined_score, 

26 calculate_quality_metrics, 

27 calculate_resource_metrics, 

28 calculate_speed_metrics, 

29) 

30from local_deep_research.config.llm_config import get_llm 

31from local_deep_research.config.search_config import get_search 

32from local_deep_research.search_system import AdvancedSearchSystem 

33 

34 

def compare_configurations(
    query: str,
    configurations: List[Dict[str, Any]],
    output_dir: str = "comparison_results",
    model_name: Optional[str] = None,
    provider: Optional[str] = None,
    search_tool: Optional[str] = None,
    repetitions: int = 1,
    metric_weights: Optional[Dict[str, float]] = None,
) -> Dict[str, Any]:
    """
    Compare multiple parameter configurations.

    Each configuration is evaluated ``repetitions`` times; successful runs
    are averaged with ``_calculate_average_metrics`` and scored via
    ``calculate_combined_score``, then configurations are ranked by that
    score (failures are appended at the end).  A JSON report and a set of
    comparison charts are written under ``output_dir``.

    Args:
        query: Research query to use for evaluation
        configurations: List of parameter configurations to compare
        output_dir: Directory to save comparison results
        model_name: Name of the LLM model to use
        provider: LLM provider
        search_tool: Search engine to use
        repetitions: Number of repetitions for each configuration
        metric_weights: Dictionary of weights for each metric type

    Returns:
        Dictionary with comparison results (including "report_path", the
        path of the saved JSON report), or ``{"error": ...}`` when no
        configurations were supplied.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Default metric weights if not provided
    if metric_weights is None:
        metric_weights = {
            "quality": 0.6,
            "speed": 0.4,
            "resource": 0.0,  # Disabled by default
        }

    # Verify valid configurations
    if not configurations:
        logger.error("No configurations provided for comparison")
        return {"error": "No configurations provided"}

    # Results storage
    results = []

    # Process each configuration
    for i, config in enumerate(configurations):
        logger.info(
            f"Evaluating configuration {i + 1}/{len(configurations)}: {config}"
        )

        # Name for this configuration
        config_name = config.get("name", f"Configuration {i + 1}")

        # Results for all repetitions of this configuration
        config_results = []

        # Run multiple repetitions
        for rep in range(repetitions):
            logger.info(
                f"Starting repetition {rep + 1}/{repetitions} for {config_name}"
            )

            try:
                # Run the configuration
                result = _evaluate_single_configuration(
                    query=query,
                    config=config,
                    model_name=model_name,
                    provider=provider,
                    search_tool=search_tool,
                )

                config_results.append(result)
                logger.info(f"Completed repetition {rep + 1} for {config_name}")

            except Exception as e:
                logger.exception(
                    f"Error in {config_name}, repetition {rep + 1}"
                )
                # Add error info but continue with other configurations
                config_results.append({"error": str(e), "success": False})

        # Calculate aggregate metrics across repetitions
        if config_results:
            # Filter out failed runs
            successful_runs = [
                r for r in config_results if r.get("success", False)
            ]

            if successful_runs:
                # Calculate average metrics
                avg_metrics = _calculate_average_metrics(successful_runs)

                # Calculate overall score
                overall_score = calculate_combined_score(
                    metrics={
                        "quality": avg_metrics.get("quality_metrics", {}),
                        "speed": avg_metrics.get("speed_metrics", {}),
                        "resource": avg_metrics.get("resource_metrics", {}),
                    },
                    weights=metric_weights,
                )

                result_summary = {
                    "name": config_name,
                    "configuration": config,
                    "success": True,
                    "runs_completed": len(successful_runs),
                    "runs_failed": len(config_results) - len(successful_runs),
                    "avg_metrics": avg_metrics,
                    "overall_score": overall_score,
                    "individual_results": config_results,
                }
            else:
                # All runs failed
                result_summary = {
                    "name": config_name,
                    "configuration": config,
                    "success": False,
                    "runs_completed": 0,
                    "runs_failed": len(config_results),
                    "error": "All runs failed",
                    "individual_results": config_results,
                }

            results.append(result_summary)

    # Sort results by overall score (if available); successful configs
    # come first, best score at the top.
    sorted_results = sorted(
        [r for r in results if r.get("success", False)],
        key=lambda x: x.get("overall_score", 0),
        reverse=True,
    )

    # Add failed configurations at the end
    sorted_results.extend([r for r in results if not r.get("success", False)])

    # Create comparison report
    comparison_report = {
        "query": query,
        "configurations_tested": len(configurations),
        "successful_configurations": len(
            [r for r in results if r.get("success", False)]
        ),
        "failed_configurations": len(
            [r for r in results if not r.get("success", False)]
        ),
        "repetitions": repetitions,
        "metric_weights": metric_weights,
        "timestamp": datetime.now(UTC).isoformat(),
        "results": sorted_results,
    }

    # Save results to file.  Imported locally: the verified writer checks
    # the "benchmark.allow_file_output" setting before touching disk.
    from ...security.file_write_verifier import write_json_verified

    timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
    result_file = str(Path(output_dir) / f"comparison_results_{timestamp}.json")

    write_json_verified(
        result_file,
        comparison_report,
        "benchmark.allow_file_output",
        context="comparison results",
    )

    # Generate visualizations
    visualizations_dir = Path(output_dir) / "visualizations"
    visualizations_dir.mkdir(parents=True, exist_ok=True)
    visualizations_dir = str(visualizations_dir)

    _create_comparison_visualizations(
        comparison_report, output_dir=visualizations_dir, timestamp=timestamp
    )

    logger.info(f"Comparison completed. Results saved to {result_file}")

    # Add report path to the result
    comparison_report["report_path"] = result_file

    return comparison_report

216 

217 

def _evaluate_single_configuration(
    query: str,
    config: Dict[str, Any],
    model_name: Optional[str] = None,
    provider: Optional[str] = None,
    search_tool: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Evaluate a single configuration.

    Runs one complete research pass for ``query`` using the parameters in
    ``config`` while profiling wall-clock timing and resource usage.

    Args:
        query: Research query to evaluate
        config: Configuration parameters (entries here override the
            keyword arguments below when both are present)
        model_name: Name of the LLM model to use
        provider: LLM provider
        search_tool: Search engine to use

    Returns:
        Dictionary with evaluation results; on failure "success" is False
        and an "error" message is included, with whatever timing/resource
        details were collected before the failure.
    """
    # Configuration entries take precedence over the function arguments.
    config_model_name = config.get("model_name", model_name)
    config_provider = config.get("provider", provider)
    config_search_tool = config.get("search_tool", search_tool)
    config_iterations = config.get("iterations", 2)
    config_questions_per_iteration = config.get("questions_per_iteration", 2)
    config_search_strategy = config.get("search_strategy", "iterdrag")
    config_max_results = config.get("max_results", 50)
    config_max_filtered_results = config.get("max_filtered_results", 20)

    # Initialize profiling tools
    speed_profiler = SpeedProfiler()
    resource_monitor = ResourceMonitor(sampling_interval=0.5)

    # Guard so the profilers are stopped exactly once.  Without it, an
    # exception raised after the success path has already stopped them
    # (e.g. during metric calculation) would trigger a second stop() in
    # the except handler.
    profiling_stopped = False

    def _stop_profiling() -> None:
        nonlocal profiling_stopped
        if not profiling_stopped:
            speed_profiler.stop()
            resource_monitor.stop()
            profiling_stopped = True

    # Start profiling
    speed_profiler.start()
    resource_monitor.start()

    try:
        # Get LLM
        with speed_profiler.timer("llm_initialization"):
            llm = get_llm(
                temperature=config.get("temperature", 0.7),
                model_name=config_model_name,
                provider=config_provider,
            )

        # Set up search engine if specified
        with speed_profiler.timer("search_initialization"):
            search = None
            if config_search_tool:
                search = get_search(
                    config_search_tool,
                    llm_instance=llm,
                    max_results=config_max_results,
                    max_filtered_results=config_max_filtered_results,
                )

        # Create search system
        system = AdvancedSearchSystem(llm=llm, search=search)
        system.max_iterations = config_iterations
        system.questions_per_iteration = config_questions_per_iteration
        system.strategy_name = config_search_strategy

        # Run the analysis
        with speed_profiler.timer("analysis"):
            results = system.analyze_topic(query)

        # Stop profiling before computing metrics so the metric
        # calculation itself is excluded from the measured timings.
        _stop_profiling()

        # Calculate metrics
        quality_metrics = calculate_quality_metrics(
            results=results,
            system_info={
                "all_links_of_system": getattr(
                    system, "all_links_of_system", []
                )
            },
        )

        speed_metrics = calculate_speed_metrics(
            timing_info=speed_profiler.get_summary(),
            system_info={
                "iterations": config_iterations,
                "questions_per_iteration": config_questions_per_iteration,
                "results": results,
            },
        )

        resource_metrics = calculate_resource_metrics(
            resource_info=resource_monitor.get_combined_stats(),
            system_info={
                "iterations": config_iterations,
                "questions_per_iteration": config_questions_per_iteration,
                "results": results,
            },
        )

        # Return comprehensive results
        return {
            "query": query,
            "config": config,
            "success": True,
            "findings_count": len(results.get("findings", [])),
            "knowledge_length": len(results.get("current_knowledge", "")),
            "quality_metrics": quality_metrics,
            "speed_metrics": speed_metrics,
            "resource_metrics": resource_metrics,
            "timing_details": speed_profiler.get_timings(),
            "resource_details": resource_monitor.get_combined_stats(),
        }

    except Exception as e:
        # Stop profiling on error (no-op if already stopped)
        _stop_profiling()

        # Log the error
        logger.exception("Error evaluating configuration")

        # Return error information
        return {
            "query": query,
            "config": config,
            "success": False,
            "error": str(e),
            "timing_details": speed_profiler.get_timings(),
            "resource_details": resource_monitor.get_combined_stats(),
        }

349 

350 

351def _calculate_average_metrics(results: List[Dict[str, Any]]) -> Dict[str, Any]: 

352 """ 

353 Calculate average metrics across multiple runs. 

354 

355 Args: 

356 results: List of individual run results 

357 

358 Returns: 

359 Dictionary with averaged metrics 

360 """ 

361 # Check if there are any successful results 

362 if not results: 

363 return {} 

364 

365 # Initialize average metrics 

366 avg_metrics = { 

367 "quality_metrics": {}, 

368 "speed_metrics": {}, 

369 "resource_metrics": {}, 

370 } 

371 

372 # Quality metrics 

373 quality_keys = set() 

374 for result in results: 

375 quality_metrics = result.get("quality_metrics", {}) 

376 quality_keys.update(quality_metrics.keys()) 

377 

378 for key in quality_keys: 

379 values = [r.get("quality_metrics", {}).get(key) for r in results] 

380 values = [v for v in values if v is not None] 

381 if values: 381 ↛ 378line 381 didn't jump to line 378 because the condition on line 381 was always true

382 avg_metrics["quality_metrics"][key] = sum(values) / len(values) 

383 

384 # Speed metrics 

385 speed_keys = set() 

386 for result in results: 

387 speed_metrics = result.get("speed_metrics", {}) 

388 speed_keys.update(speed_metrics.keys()) 

389 

390 for key in speed_keys: 

391 values = [r.get("speed_metrics", {}).get(key) for r in results] 

392 values = [v for v in values if v is not None] 

393 if values: 393 ↛ 390line 393 didn't jump to line 390 because the condition on line 393 was always true

394 avg_metrics["speed_metrics"][key] = sum(values) / len(values) 

395 

396 # Resource metrics 

397 resource_keys = set() 

398 for result in results: 

399 resource_metrics = result.get("resource_metrics", {}) 

400 resource_keys.update(resource_metrics.keys()) 

401 

402 for key in resource_keys: 

403 values = [r.get("resource_metrics", {}).get(key) for r in results] 

404 values = [v for v in values if v is not None] 

405 if values: 405 ↛ 402line 405 didn't jump to line 402 because the condition on line 405 was always true

406 avg_metrics["resource_metrics"][key] = sum(values) / len(values) 

407 

408 return avg_metrics 

409 

410 

def _create_comparison_visualizations(
    comparison_report: Dict[str, Any], output_dir: str, timestamp: str
):
    """
    Render the standard set of comparison charts for a report.

    Produces, in ``output_dir``: an overall-score bar chart, one grouped
    chart per metric category (quality, speed, resource), a spider chart,
    and a quality-vs-speed Pareto chart.  Filenames embed ``timestamp``.
    Does nothing when no configuration succeeded.

    Args:
        comparison_report: Comparison report dictionary
        output_dir: Directory to save visualizations
        timestamp: Timestamp string for filenames
    """
    successful_results = [
        entry
        for entry in comparison_report.get("results", [])
        if entry.get("success", False)
    ]

    if not successful_results:
        logger.warning("No successful configurations to visualize")
        return

    out = Path(output_dir)

    config_names = [
        entry.get("name", f"Config {idx + 1}")
        for idx, entry in enumerate(successful_results)
    ]

    # 1. Overall score comparison (horizontal bars for readable labels)
    scores = [entry.get("overall_score", 0) for entry in successful_results]
    plt.figure(figsize=(12, 6))
    plt.barh(config_names, scores, color="skyblue")
    plt.xlabel("Overall Score")
    plt.ylabel("Configuration")
    plt.title("Configuration Performance Comparison")
    plt.grid(axis="x", linestyle="--", alpha=0.7)
    plt.tight_layout()
    plt.savefig(str(out / f"overall_score_comparison_{timestamp}.png"))
    plt.close()

    # 2-4. One grouped chart per metric category.
    category_charts = [
        (
            ["overall_quality", "source_count", "lexical_diversity"],
            "quality_metrics",
            "Quality Metrics Comparison",
            f"quality_metrics_comparison_{timestamp}.png",
        ),
        (
            ["overall_speed", "total_duration", "duration_per_question"],
            "speed_metrics",
            "Speed Metrics Comparison",
            f"speed_metrics_comparison_{timestamp}.png",
        ),
        (
            ["overall_resource", "process_memory_max_mb", "system_cpu_avg"],
            "resource_metrics",
            "Resource Usage Comparison",
            f"resource_metrics_comparison_{timestamp}.png",
        ),
    ]
    for metric_keys, category, chart_title, filename in category_charts:
        _create_metric_comparison_chart(
            successful_results,
            config_names,
            metric_keys,
            category,
            chart_title,
            str(out / filename),
        )

    # 5. Spider chart for multi-dimensional comparison
    _create_spider_chart(
        successful_results,
        config_names,
        str(out / f"spider_chart_comparison_{timestamp}.png"),
    )

    # 6. Pareto frontier chart for quality vs. speed
    _create_pareto_chart(
        successful_results,
        str(out / f"pareto_chart_comparison_{timestamp}.png"),
    )

504 

505 

def _create_metric_comparison_chart(
    results: List[Dict[str, Any]],
    config_names: List[str],
    metric_keys: List[str],
    metric_category: str,
    title: str,
    output_path: str,
):
    """
    Create a chart comparing specific metrics across configurations.

    One horizontal-bar subplot is drawn per metric key; missing metric
    values are plotted as 0.  Duration metrics are rescaled to a single
    time unit (seconds/minutes/hours) chosen from the largest value.

    Args:
        results: List of configuration results
        config_names: Names of configurations
        metric_keys: Keys of metrics to compare
        metric_category: Category of metrics (quality_metrics, speed_metrics, etc.)
        title: Chart title
        output_path: Path to save the chart
    """
    # Create figure with multiple subplots (one per metric)
    fig, axes = plt.subplots(
        len(metric_keys), 1, figsize=(12, 5 * len(metric_keys))
    )

    # Handle case with only one metric (plt.subplots returns a bare Axes)
    if len(metric_keys) == 1:
        axes = [axes]

    for i, metric_key in enumerate(metric_keys):
        ax = axes[i]

        # Collect raw values; keep None for missing metrics for now so
        # the unit selection below only looks at real measurements.
        raw_values = [
            result.get("avg_metrics", {}).get(metric_category, {}).get(metric_key)
            for result in results
        ]

        label = metric_key
        if "duration" in metric_key:
            # BUGFIX: the unit suffix and conversion were previously
            # applied inside the per-result loop, so with several
            # configurations the label accumulated suffixes
            # ("total_duration (seconds) (seconds) ...") and values on
            # the same axis could be scaled with different units.
            # Choose one unit for the whole subplot from the largest value.
            max_value = max(
                (v for v in raw_values if v is not None), default=0
            )
            if max_value > 3600:
                divisor, unit = 3600, " (hours)"
            elif max_value > 60:
                divisor, unit = 60, " (minutes)"
            else:
                divisor, unit = 1, " (seconds)"
            raw_values = [
                v / divisor if v is not None else None for v in raw_values
            ]
            label += unit

        # Missing values plot as zero-length bars.
        metric_values = [v if v is not None else 0 for v in raw_values]

        # Create horizontal bar chart
        bars = ax.barh(config_names, metric_values, color="lightblue")
        ax.set_xlabel(label.replace("_", " ").title())
        ax.set_title(f"{label.replace('_', ' ').title()}")
        ax.grid(axis="x", linestyle="--", alpha=0.7)

        # Add value labels just past the end of each bar
        for bar in bars:
            width = bar.get_width()
            label_x_pos = width * 1.01
            ax.text(
                label_x_pos,
                bar.get_y() + bar.get_height() / 2,
                f"{width:.2f}",
                va="center",
            )

    plt.suptitle(title, fontsize=16)
    plt.tight_layout()
    plt.savefig(output_path)
    plt.close()

578 

579 

def _create_spider_chart(
    results: List[Dict[str, Any]], config_names: List[str], output_path: str
):
    """
    Create a spider (radar) chart comparing metrics across configurations.

    Registers a custom matplotlib "radar" polar projection, plots one
    filled polygon per configuration over five normalized dimensions, and
    saves the figure to ``output_path``.  Any failure falls back to a
    plain text image explaining the error.

    Args:
        results: List of configuration results
        config_names: Names of configurations
        output_path: Path to save the chart
    """
    # Try to import the radar chart module
    try:
        # NOTE: this Path is matplotlib's drawing path, deliberately
        # shadowing pathlib.Path inside this function only.
        from matplotlib.path import Path
        from matplotlib.projections import register_projection
        from matplotlib.projections.polar import PolarAxes
        from matplotlib.spines import Spine

        def radar_factory(num_vars, frame="circle"):
            """Create a radar chart with `num_vars` axes."""
            # Calculate evenly-spaced axis angles
            theta = np.linspace(0, 2 * np.pi, num_vars, endpoint=False)

            class RadarAxes(PolarAxes):
                name = "radar"

                def __init__(self, *args, **kwargs):
                    super().__init__(*args, **kwargs)
                    # First axis points straight up ("N" = north).
                    self.set_theta_zero_location("N")

                def fill(self, *args, closed=True, **kwargs):
                    # Close the polygon by default.
                    return super().fill(closed=closed, *args, **kwargs)

                def plot(self, *args, **kwargs):
                    return super().plot(*args, **kwargs)

                def set_varlabels(self, labels):
                    # Place one tick label at each spoke angle.
                    self.set_thetagrids(np.degrees(theta), labels)

                def _gen_axes_patch(self):
                    # Background patch: circle or regular polygon.
                    if frame == "circle":
                        return Circle((0.5, 0.5), 0.5)
                    elif frame == "polygon":
                        return RegularPolygon(
                            (0.5, 0.5), num_vars, radius=0.5, edgecolor="k"
                        )
                    else:
                        raise ValueError(
                            "Unknown value for 'frame': %s" % frame
                        )

                def _gen_axes_spines(self):
                    if frame == "circle":
                        return super()._gen_axes_spines()
                    elif frame == "polygon":
                        # Build a closed polygonal spine through the
                        # unit-circle spoke vertices.
                        spine_type = Spine.circular_spine
                        verts = unit_poly_verts(num_vars)
                        vertices = [(0.5, 0.5)] + verts
                        codes = (
                            [Path.MOVETO]
                            + [Path.LINETO] * num_vars
                            + [Path.CLOSEPOLY]
                        )
                        path = Path(vertices, codes)
                        spine = Spine(self, spine_type, path)
                        spine.set_transform(self.transAxes)
                        return {"polar": spine}
                    else:
                        raise ValueError(
                            "Unknown value for 'frame': %s" % frame
                        )

            def unit_poly_verts(num_vars):
                """Return vertices of polygon for radar chart."""
                verts = []
                for i in range(num_vars):
                    angle = theta[i]
                    verts.append(
                        (0.5 * (1 + np.cos(angle)), 0.5 * (1 + np.sin(angle)))
                    )
                return verts

            register_projection(RadarAxes)
            return theta

        # Select metrics for the spider chart.  "invert" marks metrics
        # where lower raw values are better (plotted as 1 - value).
        metrics = [
            {"name": "Quality", "key": "quality_metrics.overall_quality"},
            {"name": "Speed", "key": "speed_metrics.overall_speed"},
            {
                "name": "Sources",
                "key": "quality_metrics.normalized_source_count",
            },
            {
                "name": "Content",
                "key": "quality_metrics.normalized_knowledge_length",
            },
            {
                "name": "Memory",
                "key": "resource_metrics.normalized_memory_usage",
                "invert": True,
            },
        ]

        # Extract metric values
        spoke_labels = [m["name"] for m in metrics]
        num_vars = len(spoke_labels)
        theta = radar_factory(num_vars)

        fig, ax = plt.subplots(
            figsize=(10, 10), subplot_kw=dict(projection="radar")
        )

        # Color map for different configurations
        colors = plt.cm.viridis(np.linspace(0, 1, len(results)))

        for i, result in enumerate(results):
            values = []
            for metric in metrics:
                # Extract metric value using the key path (e.g., "quality_metrics.overall_quality");
                # any missing level in the nested dict yields 0.
                key_parts = metric["key"].split(".")
                value = result.get("avg_metrics", {})
                for part in key_parts:
                    value = value.get(part, 0) if isinstance(value, dict) else 0

                # Invert if needed (for metrics where lower is better)
                if metric.get("invert", False):
                    value = 1.0 - value

                values.append(value)

            # Plot this configuration
            ax.plot(
                theta,
                values,
                color=colors[i],
                linewidth=2,
                label=config_names[i],
            )
            ax.fill(theta, values, color=colors[i], alpha=0.25)

        # Set chart properties
        ax.set_varlabels(spoke_labels)
        plt.legend(loc="best", bbox_to_anchor=(0.5, 0.1))
        plt.title("Multi-Dimensional Configuration Comparison", size=16, y=1.05)
        plt.tight_layout()

        # Save chart
        plt.savefig(output_path)
        plt.close()

    except Exception as e:
        logger.exception("Error creating spider chart")
        # Create a text-based chart as fallback
        plt.figure(figsize=(10, 6))
        plt.text(
            0.5,
            0.5,
            f"Spider chart could not be created: {e!s}",
            horizontalalignment="center",
            verticalalignment="center",
        )
        plt.axis("off")
        plt.savefig(output_path)
        plt.close()

745 

746 

def _create_pareto_chart(results: List[Dict[str, Any]], output_path: str):
    """
    Create a Pareto frontier chart showing quality vs. speed tradeoff.

    Speed is plotted as the inverse of total duration so that "up and to
    the right" is better on both axes.  Pareto-optimal configurations
    (not strictly dominated on both dimensions) are circled in red and
    joined by a dashed frontier line.

    Args:
        results: List of configuration results
        output_path: Path to save the chart
    """
    quality_scores = []
    speed_scores = []
    names = []

    for entry in results:
        avg = entry.get("avg_metrics", {})
        quality_scores.append(
            avg.get("quality_metrics", {}).get("overall_quality", 0)
        )
        # Inverse duration so higher == faster; clamp to avoid division
        # by zero for missing/zero durations.
        total_duration = avg.get("speed_metrics", {}).get("total_duration", 1)
        speed_scores.append(1.0 / max(total_duration, 0.001))
        names.append(entry.get("name", "Configuration"))

    # Scatter plot of all configurations.
    plt.figure(figsize=(10, 8))
    plt.scatter(quality_scores, speed_scores, s=100, alpha=0.7)

    # Plain label next to every point.
    for label, q, s in zip(names, quality_scores, speed_scores):
        plt.annotate(
            label,
            (q, s),
            xytext=(5, 5),
            textcoords="offset points",
        )

    # A point is Pareto optimal unless some other point beats it
    # strictly on BOTH axes.
    def _dominated(q, s):
        return any(
            other_q > q and other_s > s
            for other_q, other_s in zip(quality_scores, speed_scores)
        )

    pareto_points = [
        idx
        for idx, (q, s) in enumerate(zip(quality_scores, speed_scores))
        if not _dominated(q, s)
    ]

    # Sort frontier points by quality (then speed) so the dashed line
    # is drawn left-to-right.
    pareto_sorted = sorted(
        (quality_scores[idx], speed_scores[idx], idx) for idx in pareto_points
    )
    pareto_quality = [point[0] for point in pareto_sorted]
    pareto_speed = [point[1] for point in pareto_sorted]
    pareto_indices = [point[2] for point in pareto_sorted]

    # Dashed frontier line through the Pareto-optimal points.
    plt.plot(pareto_quality, pareto_speed, "r--", linewidth=2)

    # Hollow red circles highlight the frontier points.
    plt.scatter(
        [quality_scores[idx] for idx in pareto_indices],
        [speed_scores[idx] for idx in pareto_indices],
        s=150,
        facecolors="none",
        edgecolors="r",
        linewidth=2,
    )

    # Emphasized labels for the Pareto-optimal configurations.
    for idx in pareto_indices:
        plt.annotate(
            names[idx],
            (quality_scores[idx], speed_scores[idx]),
            xytext=(8, 8),
            textcoords="offset points",
            bbox=dict(boxstyle="round,pad=0.5", fc="yellow", alpha=0.7),
        )

    plt.xlabel("Quality Score (higher is better)")
    plt.ylabel("Speed Score (higher is better)")
    plt.title("Quality vs. Speed Tradeoff (Pareto Frontier)", size=14)
    plt.grid(True, linestyle="--", alpha=0.7)

    # Footer note explaining the frontier.
    plt.figtext(
        0.5,
        0.01,
        "Points on the red line are Pareto optimal configurations\n"
        "(no other configuration is better in both quality and speed)",
        ha="center",
        fontsize=10,
        bbox=dict(boxstyle="round", fc="white", alpha=0.7),
    )

    plt.tight_layout()
    plt.savefig(output_path)
    plt.close()