Coverage for src / local_deep_research / benchmarks / comparison / evaluator.py: 89%

304 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1""" 

2Configuration comparison for Local Deep Research. 

3 

4This module provides functions for comparing different parameter configurations 

5and evaluating their performance across various metrics. 

6""" 

7 

8import os 

9from datetime import datetime, UTC 

10from pathlib import Path 

11from typing import Any, Dict, List, Optional 

12 

13import matplotlib.pyplot as plt 

14import numpy as np 

15from loguru import logger 

16from matplotlib.patches import Circle, RegularPolygon 

17 

18from local_deep_research.benchmarks.efficiency.resource_monitor import ( 

19 ResourceMonitor, 

20) 

21from local_deep_research.benchmarks.efficiency.speed_profiler import ( 

22 SpeedProfiler, 

23) 

24from local_deep_research.benchmarks.optimization.metrics import ( 

25 calculate_combined_score, 

26 calculate_quality_metrics, 

27 calculate_resource_metrics, 

28 calculate_speed_metrics, 

29) 

30from local_deep_research.config.llm_config import get_llm 

31from local_deep_research.config.search_config import get_search 

32from local_deep_research.search_system import AdvancedSearchSystem 

33 

34 

def compare_configurations(
    query: str,
    configurations: List[Dict[str, Any]],
    output_dir: str = "comparison_results",
    model_name: Optional[str] = None,
    provider: Optional[str] = None,
    search_tool: Optional[str] = None,
    repetitions: int = 1,
    metric_weights: Optional[Dict[str, float]] = None,
) -> Dict[str, Any]:
    """
    Compare multiple parameter configurations.

    Each configuration is evaluated ``repetitions`` times; successful runs
    are averaged and combined into a single score, configurations are ranked
    by that score, and a JSON report plus charts are written to
    ``output_dir``.

    Args:
        query: Research query to use for evaluation
        configurations: List of parameter configurations to compare
        output_dir: Directory to save comparison results
        model_name: Name of the LLM model to use
        provider: LLM provider
        search_tool: Search engine to use
        repetitions: Number of repetitions for each configuration
        metric_weights: Dictionary of weights for each metric type

    Returns:
        Dictionary with comparison results (includes "report_path")
    """
    os.makedirs(output_dir, exist_ok=True)

    # Default weighting scheme: quality-dominated, resource usage disabled.
    if metric_weights is None:
        metric_weights = {
            "quality": 0.6,
            "speed": 0.4,
            "resource": 0.0,  # Disabled by default
        }

    # Nothing to compare — bail out with an error payload.
    if not configurations:
        logger.error("No configurations provided for comparison")
        return {"error": "No configurations provided"}

    summaries: List[Dict[str, Any]] = []

    for idx, cfg in enumerate(configurations):
        logger.info(
            f"Evaluating configuration {idx + 1}/{len(configurations)}: {cfg}"
        )

        cfg_label = cfg.get("name", f"Configuration {idx + 1}")
        rep_outcomes: List[Dict[str, Any]] = []

        # Run every repetition; a failing run is recorded but does not
        # abort the remaining runs or configurations.
        for rep in range(repetitions):
            logger.info(
                f"Starting repetition {rep + 1}/{repetitions} for {cfg_label}"
            )
            try:
                outcome = _evaluate_single_configuration(
                    query=query,
                    config=cfg,
                    model_name=model_name,
                    provider=provider,
                    search_tool=search_tool,
                )
            except Exception as e:
                logger.exception(
                    f"Error in {cfg_label}, repetition {rep + 1}"
                )
                rep_outcomes.append({"error": str(e), "success": False})
            else:
                rep_outcomes.append(outcome)
                logger.info(f"Completed repetition {rep + 1} for {cfg_label}")

        if rep_outcomes:
            succeeded = [r for r in rep_outcomes if r.get("success", False)]

            if succeeded:
                # Average metrics over successful runs, then collapse them
                # into one combined score using the caller's weights.
                averaged = _calculate_average_metrics(succeeded)
                combined = calculate_combined_score(
                    metrics={
                        "quality": averaged.get("quality_metrics", {}),
                        "speed": averaged.get("speed_metrics", {}),
                        "resource": averaged.get("resource_metrics", {}),
                    },
                    weights=metric_weights,
                )
                summaries.append(
                    {
                        "name": cfg_label,
                        "configuration": cfg,
                        "success": True,
                        "runs_completed": len(succeeded),
                        "runs_failed": len(rep_outcomes) - len(succeeded),
                        "avg_metrics": averaged,
                        "overall_score": combined,
                        "individual_results": rep_outcomes,
                    }
                )
            else:
                # Every repetition failed for this configuration.
                summaries.append(
                    {
                        "name": cfg_label,
                        "configuration": cfg,
                        "success": False,
                        "runs_completed": 0,
                        "runs_failed": len(rep_outcomes),
                        "error": "All runs failed",
                        "individual_results": rep_outcomes,
                    }
                )

    # Rank successes by combined score (best first); failures trail behind.
    ranked = sorted(
        [s for s in summaries if s.get("success", False)],
        key=lambda s: s.get("overall_score", 0),
        reverse=True,
    )
    ranked.extend(s for s in summaries if not s.get("success", False))

    comparison_report = {
        "query": query,
        "configurations_tested": len(configurations),
        "successful_configurations": len(
            [s for s in summaries if s.get("success", False)]
        ),
        "failed_configurations": len(
            [s for s in summaries if not s.get("success", False)]
        ),
        "repetitions": repetitions,
        "metric_weights": metric_weights,
        "timestamp": datetime.now(UTC).isoformat(),
        "results": ranked,
    }

    # Persist the report through the verified writer (respects the
    # benchmark.allow_file_output setting).
    from ...security.file_write_verifier import write_json_verified

    stamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
    result_file = str(Path(output_dir) / f"comparison_results_{stamp}.json")

    write_json_verified(
        result_file,
        comparison_report,
        "benchmark.allow_file_output",
        context="comparison results",
    )

    # Charts go into a dedicated subdirectory next to the JSON report.
    viz_dir = Path(output_dir) / "visualizations"
    viz_dir.mkdir(parents=True, exist_ok=True)

    _create_comparison_visualizations(
        comparison_report, output_dir=str(viz_dir), timestamp=stamp
    )

    logger.info(f"Comparison completed. Results saved to {result_file}")

    comparison_report["report_path"] = result_file

    return comparison_report

216 

217 

def _evaluate_single_configuration(
    query: str,
    config: Dict[str, Any],
    model_name: Optional[str] = None,
    provider: Optional[str] = None,
    search_tool: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Run one configuration end-to-end and collect its metrics.

    Args:
        query: Research query to evaluate
        config: Configuration parameters
        model_name: Name of the LLM model to use
        provider: LLM provider
        search_tool: Search engine to use

    Returns:
        Dictionary with evaluation results; "success" is False on error.
    """
    # Per-run parameters — values in the config dict override the
    # function-level defaults.
    chosen_model = config.get("model_name", model_name)
    chosen_provider = config.get("provider", provider)
    chosen_search = config.get("search_tool", search_tool)
    iterations = config.get("iterations", 2)
    questions_per_iteration = config.get("questions_per_iteration", 2)
    strategy = config.get("search_strategy", "iterdrag")

    # Profiling starts before any expensive work so setup cost is captured.
    speed_profiler = SpeedProfiler()
    resource_monitor = ResourceMonitor(sampling_interval=0.5)
    speed_profiler.start()
    resource_monitor.start()

    llm = None
    search = None
    system = None
    try:
        # LLM construction is timed separately from the analysis itself.
        with speed_profiler.timer("llm_initialization"):
            llm = get_llm(
                temperature=config.get("temperature", 0.7),
                model_name=chosen_model,
                provider=chosen_provider,
            )

        # Optional explicit search engine; None lets the system choose.
        with speed_profiler.timer("search_initialization"):
            search = None
            if chosen_search:
                search = get_search(
                    chosen_search,
                    llm_instance=llm,
                )

        system = AdvancedSearchSystem(  # type: ignore[call-arg]
            llm=llm,
            search=search,  # type: ignore[arg-type]
            max_iterations=iterations,
            questions_per_iteration=questions_per_iteration,
            strategy_name=strategy,
        )

        with speed_profiler.timer("analysis"):
            results = system.analyze_topic(query)

        # Stop the profilers before computing metrics from their data.
        speed_profiler.stop()
        resource_monitor.stop()

        run_config = dict(config)
        quality_metrics = calculate_quality_metrics(system_config=run_config)
        speed_metrics = calculate_speed_metrics(system_config=run_config)
        resource_metrics = calculate_resource_metrics(
            system_config=run_config
        )

        return {
            "query": query,
            "config": config,
            "success": True,
            "findings_count": len(results.get("findings", [])),
            "knowledge_length": len(results.get("current_knowledge", "")),
            "quality_metrics": quality_metrics,
            "speed_metrics": speed_metrics,
            "resource_metrics": resource_metrics,
            "timing_details": speed_profiler.get_timings(),
            "resource_details": resource_monitor.get_combined_stats(),
        }

    except Exception as e:
        # Make sure profiling is shut down even on failure, then report
        # whatever timing/resource data was gathered up to the error.
        speed_profiler.stop()
        resource_monitor.stop()

        logger.exception("Error evaluating configuration")

        return {
            "query": query,
            "config": config,
            "success": False,
            "error": str(e),
            "timing_details": speed_profiler.get_timings(),
            "resource_details": resource_monitor.get_combined_stats(),
        }
    finally:
        # Best-effort cleanup of whatever got constructed, in reverse
        # dependency order (system -> search -> llm).
        from ...utilities.resource_utils import safe_close

        safe_close(system, "evaluator system")
        safe_close(search, "evaluator search engine")
        safe_close(llm, "evaluator LLM")

341 

342 

343def _calculate_average_metrics(results: List[Dict[str, Any]]) -> Dict[str, Any]: 

344 """ 

345 Calculate average metrics across multiple runs. 

346 

347 Args: 

348 results: List of individual run results 

349 

350 Returns: 

351 Dictionary with averaged metrics 

352 """ 

353 # Check if there are any successful results 

354 if not results: 

355 return {} 

356 

357 # Initialize average metrics 

358 avg_metrics: Dict[str, Any] = { 

359 "quality_metrics": {}, 

360 "speed_metrics": {}, 

361 "resource_metrics": {}, 

362 } 

363 

364 # Quality metrics 

365 quality_keys = set() 

366 for result in results: 

367 quality_metrics = result.get("quality_metrics", {}) 

368 quality_keys.update(quality_metrics.keys()) 

369 

370 for key in quality_keys: 

371 values = [r.get("quality_metrics", {}).get(key) for r in results] 

372 values = [v for v in values if v is not None] 

373 if values: 373 ↛ 370line 373 didn't jump to line 370 because the condition on line 373 was always true

374 avg_metrics["quality_metrics"][key] = sum(values) / len(values) 

375 

376 # Speed metrics 

377 speed_keys = set() 

378 for result in results: 

379 speed_metrics = result.get("speed_metrics", {}) 

380 speed_keys.update(speed_metrics.keys()) 

381 

382 for key in speed_keys: 

383 values = [r.get("speed_metrics", {}).get(key) for r in results] 

384 values = [v for v in values if v is not None] 

385 if values: 385 ↛ 382line 385 didn't jump to line 382 because the condition on line 385 was always true

386 avg_metrics["speed_metrics"][key] = sum(values) / len(values) 

387 

388 # Resource metrics 

389 resource_keys = set() 

390 for result in results: 

391 resource_metrics = result.get("resource_metrics", {}) 

392 resource_keys.update(resource_metrics.keys()) 

393 

394 for key in resource_keys: 

395 values = [r.get("resource_metrics", {}).get(key) for r in results] 

396 values = [v for v in values if v is not None] 

397 if values: 397 ↛ 394line 397 didn't jump to line 394 because the condition on line 397 was always true

398 avg_metrics["resource_metrics"][key] = sum(values) / len(values) 

399 

400 return avg_metrics 

401 

402 

def _create_comparison_visualizations(
    comparison_report: Dict[str, Any], output_dir: str, timestamp: str
):
    """
    Create visualizations for the comparison results.

    Produces an overall-score bar chart, per-category metric charts,
    a spider chart, and a Pareto frontier chart in ``output_dir``.

    Args:
        comparison_report: Comparison report dictionary
        output_dir: Directory to save visualizations
        timestamp: Timestamp string for filenames
    """
    # Only successful configurations carry averaged metrics to plot.
    ok_results = [
        entry
        for entry in comparison_report.get("results", [])
        if entry.get("success", False)
    ]

    if not ok_results:
        logger.warning("No successful configurations to visualize")
        return

    labels = [
        entry.get("name", f"Config {pos + 1}")
        for pos, entry in enumerate(ok_results)
    ]

    out = Path(output_dir)

    # 1. Overall score comparison (horizontal bar chart).
    score_values = [entry.get("overall_score", 0) for entry in ok_results]
    plt.figure(figsize=(12, 6))
    plt.barh(labels, score_values, color="skyblue")
    plt.xlabel("Overall Score")
    plt.ylabel("Configuration")
    plt.title("Configuration Performance Comparison")
    plt.grid(axis="x", linestyle="--", alpha=0.7)
    plt.tight_layout()
    plt.savefig(str(out / f"overall_score_comparison_{timestamp}.png"))
    plt.close()

    # 2. Quality metrics comparison.
    _create_metric_comparison_chart(
        ok_results,
        labels,
        ["overall_quality", "source_count", "lexical_diversity"],
        "quality_metrics",
        "Quality Metrics Comparison",
        str(out / f"quality_metrics_comparison_{timestamp}.png"),
    )

    # 3. Speed metrics comparison.
    _create_metric_comparison_chart(
        ok_results,
        labels,
        ["overall_speed", "total_duration", "duration_per_question"],
        "speed_metrics",
        "Speed Metrics Comparison",
        str(out / f"speed_metrics_comparison_{timestamp}.png"),
    )

    # 4. Resource metrics comparison.
    _create_metric_comparison_chart(
        ok_results,
        labels,
        ["overall_resource", "process_memory_max_mb", "system_cpu_avg"],
        "resource_metrics",
        "Resource Usage Comparison",
        str(out / f"resource_metrics_comparison_{timestamp}.png"),
    )

    # 5. Spider chart for multi-dimensional comparison.
    _create_spider_chart(
        ok_results,
        labels,
        str(out / f"spider_chart_comparison_{timestamp}.png"),
    )

    # 6. Pareto frontier chart for quality vs. speed.
    _create_pareto_chart(
        ok_results,
        str(out / f"pareto_chart_comparison_{timestamp}.png"),
    )

496 

497 

def _create_metric_comparison_chart(
    results: List[Dict[str, Any]],
    config_names: List[str],
    metric_keys: List[str],
    metric_category: str,
    title: str,
    output_path: str,
):
    """
    Create a chart comparing specific metrics across configurations.

    Args:
        results: List of configuration results
        config_names: Names of configurations
        metric_keys: Keys of metrics to compare
        metric_category: Category of metrics (quality_metrics, speed_metrics, etc.)
        title: Chart title
        output_path: Path to save the chart
    """
    # One subplot per metric, stacked vertically.
    fig, axes = plt.subplots(
        len(metric_keys), 1, figsize=(12, 5 * len(metric_keys))
    )

    # plt.subplots returns a bare Axes (not an array) for a single subplot.
    if len(metric_keys) == 1:
        axes = [axes]

    for i, metric_key in enumerate(metric_keys):
        ax = axes[i]

        # Collect the raw metric values; missing values are plotted as 0.
        metric_values = []
        for result in results:
            metrics = result.get("avg_metrics", {}).get(metric_category, {})
            value = metrics.get(metric_key)
            metric_values.append(value if value is not None else 0)

        # For duration metrics, pick ONE unit from the largest value and
        # apply it to every bar. (The previous implementation mutated
        # metric_key and rescaled per configuration inside the value loop,
        # so the unit suffix accumulated once per configuration and bars
        # on the same axis could end up in different units.)
        display_key = metric_key
        if "duration" in metric_key and metric_values:
            peak = max(metric_values)
            if peak > 3600:
                metric_values = [v / 3600 for v in metric_values]
                display_key += " (hours)"
            elif peak > 60:
                metric_values = [v / 60 for v in metric_values]
                display_key += " (minutes)"
            else:
                display_key += " (seconds)"

        # Horizontal bar chart for this metric.
        bars = ax.barh(config_names, metric_values, color="lightblue")
        ax.set_xlabel(display_key.replace("_", " ").title())
        ax.set_title(f"{display_key.replace('_', ' ').title()}")
        ax.grid(axis="x", linestyle="--", alpha=0.7)

        # Annotate each bar with its numeric value just past its end.
        for bar in bars:
            width = bar.get_width()
            label_x_pos = width * 1.01
            ax.text(
                label_x_pos,
                bar.get_y() + bar.get_height() / 2,
                f"{width:.2f}",
                va="center",
            )

    plt.suptitle(title, fontsize=16)
    plt.tight_layout()
    plt.savefig(output_path)
    plt.close()

570 

571 

def _create_spider_chart(
    results: List[Dict[str, Any]], config_names: List[str], output_path: str
):
    """
    Create a spider chart comparing metrics across configurations.

    Builds a custom matplotlib "radar" polar projection on the fly,
    plots one filled polygon per configuration over five normalized
    metric axes, and saves the figure to ``output_path``. On any
    failure a placeholder image containing the error text is saved
    instead, so the caller always gets a file at ``output_path``.

    Args:
        results: List of configuration results
        config_names: Names of configurations
        output_path: Path to save the chart
    """
    # Try to import the radar chart module
    try:
        # Local imports: matplotlib internals needed only for the custom
        # radar projection. NOTE: this `Path` shadows any pathlib.Path in
        # the module namespace for the rest of this function.
        from matplotlib.path import Path
        from matplotlib.projections import register_projection
        from matplotlib.projections.polar import PolarAxes
        from matplotlib.spines import Spine

        def radar_factory(num_vars, frame="circle"):
            """Create a radar chart with `num_vars` axes."""
            # Calculate evenly-spaced axis angles
            theta = np.linspace(0, 2 * np.pi, num_vars, endpoint=False)

            class RadarAxes(PolarAxes):
                # Registered projection name, used via
                # subplot_kw={"projection": "radar"} below.
                name = "radar"

                def __init__(self, *args, **kwargs):
                    super().__init__(*args, **kwargs)
                    # Put the first axis at the top of the chart.
                    self.set_theta_zero_location("N")

                def fill(self, *args, closed=True, **kwargs):
                    # Default to closed polygons so the area is shaded.
                    return super().fill(closed=closed, *args, **kwargs)

                def plot(self, *args, **kwargs):
                    return super().plot(*args, **kwargs)

                def set_varlabels(self, labels):
                    # Place one tick label at each spoke angle.
                    self.set_thetagrids(np.degrees(theta), labels)

                def _gen_axes_patch(self):
                    # Background patch: circle or regular polygon,
                    # depending on the closed-over `frame` argument.
                    if frame == "circle":
                        return Circle((0.5, 0.5), 0.5)
                    if frame == "polygon":
                        return RegularPolygon(
                            (0.5, 0.5), num_vars, radius=0.5, edgecolor="k"
                        )
                    raise ValueError("Unknown value for 'frame': %s" % frame)  # noqa: TRY301 — inside nested method definition, not caught by enclosing try

                def _gen_axes_spines(self):  # type: ignore[misc]
                    # Matching spine (outline) for the chosen frame shape.
                    if frame == "circle":
                        return super()._gen_axes_spines()
                    if frame == "polygon":
                        spine_type = Spine.circular_spine
                        verts = unit_poly_verts(num_vars)
                        vertices = [(0.5, 0.5)] + verts
                        codes = (
                            [Path.MOVETO]
                            + [Path.LINETO] * num_vars
                            + [Path.CLOSEPOLY]
                        )
                        path = Path(vertices, codes)
                        spine = Spine(self, spine_type, path)  # type: ignore[arg-type]
                        spine.set_transform(self.transAxes)
                        return {"polar": spine}
                    raise ValueError("Unknown value for 'frame': %s" % frame)  # noqa: TRY301 — inside nested method definition, not caught by enclosing try

            def unit_poly_verts(num_vars):
                """Return vertices of polygon for radar chart."""
                # Spoke endpoints on the unit circle, mapped into the
                # [0, 1] x [0, 1] axes-fraction coordinate square.
                verts = []
                for i in range(num_vars):
                    angle = theta[i]
                    verts.append(
                        (0.5 * (1 + np.cos(angle)), 0.5 * (1 + np.sin(angle)))
                    )
                return verts

            register_projection(RadarAxes)
            return theta

        # Metrics shown on the chart; "key" is a dotted path into
        # avg_metrics, and "invert" flips metrics where lower is better.
        metrics = [
            {"name": "Quality", "key": "quality_metrics.overall_quality"},
            {"name": "Speed", "key": "speed_metrics.overall_speed"},
            {
                "name": "Sources",
                "key": "quality_metrics.normalized_source_count",
            },
            {
                "name": "Content",
                "key": "quality_metrics.normalized_knowledge_length",
            },
            {
                "name": "Memory",
                "key": "resource_metrics.normalized_memory_usage",
                "invert": True,
            },
        ]

        # Build the radar projection with one spoke per metric.
        spoke_labels = [m["name"] for m in metrics]
        num_vars = len(spoke_labels)
        theta = radar_factory(num_vars)

        fig, ax = plt.subplots(
            figsize=(10, 10), subplot_kw={"projection": "radar"}
        )

        # One distinct color per configuration.
        colors = plt.cm.viridis(np.linspace(0, 1, len(results)))  # type: ignore[attr-defined]

        for i, result in enumerate(results):
            values = []
            for metric in metrics:
                # Extract metric value using the key path (e.g., "quality_metrics.overall_quality")
                # Missing or non-dict intermediate values fall back to 0.
                key_parts = metric["key"].split(".")
                value: Any = result.get("avg_metrics", {})
                for part in key_parts:
                    value = value.get(part, 0) if isinstance(value, dict) else 0

                # Invert if needed (for metrics where lower is better)
                if metric.get("invert", False):
                    value = 1.0 - value

                values.append(value)

            # Plot this configuration
            ax.plot(
                theta,
                values,
                color=colors[i],
                linewidth=2,
                label=config_names[i],
            )
            ax.fill(theta, values, color=colors[i], alpha=0.25)

        # Set chart properties
        ax.set_varlabels(spoke_labels)  # type: ignore[attr-defined]
        plt.legend(loc="best", bbox_to_anchor=(0.5, 0.1))
        plt.title("Multi-Dimensional Configuration Comparison", size=16, y=1.05)
        plt.tight_layout()

        # Save chart
        plt.savefig(output_path)
        plt.close()

    except Exception as e:
        logger.exception("Error creating spider chart")
        # Create a text-based chart as fallback so output_path still exists.
        plt.figure(figsize=(10, 6))
        plt.text(
            0.5,
            0.5,
            f"Spider chart could not be created: {e!s}",
            horizontalalignment="center",
            verticalalignment="center",
        )
        plt.axis("off")
        plt.savefig(output_path)
        plt.close()

731 

732 

def _create_pareto_chart(results: List[Dict[str, Any]], output_path: str):
    """
    Create a Pareto frontier chart showing quality vs. speed tradeoff.

    Args:
        results: List of configuration results
        output_path: Path to save the chart
    """
    # Gather the two axes for every configuration. Speed is plotted as the
    # inverse of total duration so that higher is better on both axes.
    qualities = []
    speeds = []
    labels = []

    for entry in results:
        avg = entry.get("avg_metrics", {})
        q_val = avg.get("quality_metrics", {}).get("overall_quality", 0)
        duration = avg.get("speed_metrics", {}).get("total_duration", 1)
        qualities.append(q_val)
        speeds.append(1.0 / max(duration, 0.001))  # guard against zero duration
        labels.append(entry.get("name", "Configuration"))

    # Scatter all configurations.
    plt.figure(figsize=(10, 8))
    plt.scatter(qualities, speeds, s=100, alpha=0.7)

    # Name each point.
    for idx, label in enumerate(labels):
        plt.annotate(
            label,
            (qualities[idx], speeds[idx]),
            xytext=(5, 5),
            textcoords="offset points",
        )

    # A point is Pareto-optimal when no other point strictly beats it on
    # BOTH axes at once.
    frontier = [
        idx
        for idx, (q, s) in enumerate(zip(qualities, speeds, strict=False))
        if not any(
            q2 > q and s2 > s
            for q2, s2 in zip(qualities, speeds, strict=False)
        )
    ]

    # Order the frontier points by (quality, speed) so the dashed line is
    # drawn left-to-right without crossing itself.
    ordered = sorted(
        zip(
            [qualities[i] for i in frontier],
            [speeds[i] for i in frontier],
            frontier,
            strict=False,
        )
    )
    line_q = [p[0] for p in ordered]
    line_s = [p[1] for p in ordered]
    frontier_order = [p[2] for p in ordered]

    # Dashed red line through the frontier.
    plt.plot(line_q, line_s, "r--", linewidth=2)

    # Hollow red circles around the Pareto-optimal points.
    plt.scatter(
        [qualities[i] for i in frontier_order],
        [speeds[i] for i in frontier_order],
        s=150,
        facecolors="none",
        edgecolors="r",
        linewidth=2,
    )

    # Highlighted labels for the Pareto-optimal configurations.
    for i in frontier_order:
        plt.annotate(
            labels[i],
            (qualities[i], speeds[i]),
            xytext=(8, 8),
            textcoords="offset points",
            bbox={"boxstyle": "round,pad=0.5", "fc": "yellow", "alpha": 0.7},
        )

    plt.xlabel("Quality Score (higher is better)")
    plt.ylabel("Speed Score (higher is better)")
    plt.title("Quality vs. Speed Tradeoff (Pareto Frontier)", size=14)
    plt.grid(True, linestyle="--", alpha=0.7)

    # Caption explaining how to read the chart.
    plt.figtext(
        0.5,
        0.01,
        "Points on the red line are Pareto optimal configurations\n"
        "(no other configuration is better in both quality and speed)",
        ha="center",
        fontsize=10,
        bbox={"boxstyle": "round", "fc": "white", "alpha": 0.7},
    )

    plt.tight_layout()
    plt.savefig(output_path)
    plt.close()