Coverage for src/local_deep_research/benchmarks/optimization/optuna_optimizer.py: 78%
357 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
1"""
2Optuna-based parameter optimizer for Local Deep Research.
4This module provides the core optimization functionality using Optuna
5to find optimal parameters for the research system, balancing quality
6and performance metrics.
7"""
9import os
10from pathlib import Path
11import time
12from datetime import datetime, UTC
13from functools import partial
14from typing import Any, Callable, Dict, List, Optional, Tuple
16import joblib
17import numpy as np
18import optuna
19from optuna.visualization import (
20 plot_contour,
21 plot_optimization_history,
22 plot_param_importances,
23 plot_slice,
24)
26from local_deep_research.benchmarks.efficiency.speed_profiler import (
27 SpeedProfiler,
28)
29from local_deep_research.security import sanitize_data
30from loguru import logger
32from local_deep_research.benchmarks.evaluators import (
33 CompositeBenchmarkEvaluator,
34)
36# Import benchmark evaluator components
38# Try to import visualization libraries, but don't fail if not available
# Try to import visualization libraries, but don't fail if not available.
# PLOTTING_AVAILABLE gates every matplotlib-based visualization method below;
# the Optuna plotly-based plots are unaffected by this flag.
try:
    import matplotlib.pyplot as plt
    from matplotlib.lines import Line2D

    # We'll use matplotlib for plotting visualization results
    PLOTTING_AVAILABLE = True
except ImportError:
    PLOTTING_AVAILABLE = False
    logger.warning("Matplotlib not available, visualization will be limited")
51class OptunaOptimizer:
52 """
53 Optimize parameters for Local Deep Research using Optuna.
55 This class provides functionality to:
56 1. Define search spaces for parameter optimization
57 2. Evaluate parameter combinations using objective functions
58 3. Find optimal parameters via Optuna
59 4. Visualize and analyze optimization results
60 """
62 def __init__(
63 self,
64 base_query: str,
65 output_dir: str = "optimization_results",
66 model_name: Optional[str] = None,
67 provider: Optional[str] = None,
68 search_tool: Optional[str] = None,
69 temperature: float = 0.7,
70 n_trials: int = 30,
71 timeout: Optional[int] = None,
72 n_jobs: int = 1,
73 study_name: Optional[str] = None,
74 optimization_metrics: Optional[List[str]] = None,
75 metric_weights: Optional[Dict[str, float]] = None,
76 progress_callback: Optional[Callable[[int, int, Dict], None]] = None,
77 benchmark_weights: Optional[Dict[str, float]] = None,
78 ):
79 """
80 Initialize the optimizer.
82 Args:
83 base_query: The research query to use for all experiments
84 output_dir: Directory to save optimization results
85 model_name: Name of the LLM model to use
86 provider: LLM provider
87 search_tool: Search engine to use
88 temperature: LLM temperature
89 n_trials: Number of parameter combinations to try
90 timeout: Maximum seconds to run optimization (None for no limit)
91 n_jobs: Number of parallel jobs for optimization
92 study_name: Name of the Optuna study
93 optimization_metrics: List of metrics to optimize (default: ["quality", "speed"])
94 metric_weights: Dictionary of weights for each metric (e.g., {"quality": 0.6, "speed": 0.4})
95 progress_callback: Optional callback for progress updates
96 benchmark_weights: Dictionary mapping benchmark types to weights
97 (e.g., {"simpleqa": 0.6, "browsecomp": 0.4})
98 If None, only SimpleQA is used with weight 1.0
99 """
100 self.base_query = base_query
101 self.output_dir = output_dir
102 self.model_name = model_name
103 self.provider = provider
104 self.search_tool = search_tool
105 self.temperature = temperature
106 self.n_trials = n_trials
107 self.timeout = timeout
108 self.n_jobs = n_jobs
109 self.optimization_metrics = optimization_metrics or ["quality", "speed"]
110 self.metric_weights = metric_weights or {"quality": 0.6, "speed": 0.4}
111 self.progress_callback = progress_callback
113 # Initialize benchmark evaluator with weights
114 self.benchmark_weights = benchmark_weights or {"simpleqa": 1.0}
115 self.benchmark_evaluator = CompositeBenchmarkEvaluator(
116 self.benchmark_weights
117 )
119 # Normalize weights to sum to 1.0
120 total_weight = sum(self.metric_weights.values())
121 if total_weight > 0:
122 self.metric_weights = {
123 k: v / total_weight for k, v in self.metric_weights.items()
124 }
126 # Generate a unique study name if not provided
127 timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
128 self.study_name = study_name or f"ldr_opt_{timestamp}"
130 # Create output directory
131 os.makedirs(output_dir, exist_ok=True)
133 # Store the trial history for analysis
134 self.trials_history: List[Dict[str, Any]] = []
136 # Storage for the best parameters and study
137 self.best_params: Optional[Dict[str, Any]] = None
138 self.study: Optional[optuna.Study] = None
140 def optimize(
141 self, param_space: Optional[Dict[str, Any]] = None
142 ) -> Tuple[Dict[str, Any], float]:
143 """
144 Run the optimization process using Optuna.
146 Args:
147 param_space: Dictionary defining parameter search spaces
148 (if None, use default spaces)
150 Returns:
151 Tuple containing (best_parameters, best_score)
152 """
153 param_space = param_space or self._get_default_param_space()
155 # Create a study object
156 storage_name = f"sqlite:///{self.output_dir}/{self.study_name}.db"
157 self.study = optuna.create_study(
158 study_name=self.study_name,
159 storage=storage_name,
160 load_if_exists=True,
161 direction="maximize",
162 sampler=optuna.samplers.TPESampler(seed=42),
163 )
165 # Create partial function with param_space
166 objective = partial(self._objective, param_space=param_space)
168 # Log optimization start
169 logger.info(
170 f"Starting optimization with {self.n_trials} trials, {self.n_jobs} parallel jobs"
171 )
172 logger.info(f"Parameter space: {param_space}")
173 logger.info(f"Metric weights: {self.metric_weights}")
174 logger.info(f"Benchmark weights: {self.benchmark_weights}")
176 # Initialize progress tracking
177 if self.progress_callback:
178 self.progress_callback(
179 0,
180 self.n_trials,
181 {
182 "status": "starting",
183 "stage": "initialization",
184 "trials_completed": 0,
185 "total_trials": self.n_trials,
186 },
187 )
189 try:
190 # Run optimization
191 self.study.optimize(
192 objective,
193 n_trials=self.n_trials,
194 timeout=self.timeout,
195 n_jobs=self.n_jobs,
196 callbacks=[self._optimization_callback],
197 show_progress_bar=True,
198 )
200 # Store best parameters
201 if self.study is None: 201 ↛ 202line 201 didn't jump to line 202 because the condition on line 201 was never true
202 raise RuntimeError("Study was not created")
203 _completed_study = self.study
204 self.best_params = _completed_study.best_params
206 # Save the results
207 self._save_results()
209 # Create visualizations
210 self._create_visualizations()
212 if self.best_params is None: 212 ↛ 213line 212 didn't jump to line 213 because the condition on line 212 was never true
213 raise RuntimeError("No best parameters found")
214 logger.info(
215 f"Optimization complete. Best parameters: {self.best_params}"
216 )
217 logger.info(f"Best value: {_completed_study.best_value}")
219 # Report completion
220 if self.progress_callback:
221 self.progress_callback(
222 self.n_trials,
223 self.n_trials,
224 {
225 "status": "completed",
226 "stage": "finished",
227 "trials_completed": len(_completed_study.trials),
228 "total_trials": self.n_trials,
229 "best_params": self.best_params,
230 "best_value": _completed_study.best_value,
231 },
232 )
234 return self.best_params, _completed_study.best_value
236 except KeyboardInterrupt:
237 logger.info("Optimization interrupted by user")
238 # Still save what we have
239 self._save_results()
240 self._create_visualizations()
242 if self.study is None: 242 ↛ 243line 242 didn't jump to line 243 because the condition on line 242 was never true
243 raise RuntimeError("Study was not created")
244 _interrupted_study = self.study
245 # Report interruption
246 if self.progress_callback:
247 self.progress_callback(
248 len(_interrupted_study.trials),
249 self.n_trials,
250 {
251 "status": "interrupted",
252 "stage": "interrupted",
253 "trials_completed": len(_interrupted_study.trials),
254 "total_trials": self.n_trials,
255 "best_params": _interrupted_study.best_params,
256 "best_value": _interrupted_study.best_value,
257 },
258 )
260 return _interrupted_study.best_params, _interrupted_study.best_value
262 def _get_default_param_space(self) -> Dict[str, Any]:
263 """
264 Get default parameter search space.
266 Returns:
267 Dictionary defining the default parameter search spaces
268 """
269 return {
270 "iterations": {
271 "type": "int",
272 "low": 1,
273 "high": 5,
274 "step": 1,
275 },
276 "questions_per_iteration": {
277 "type": "int",
278 "low": 1,
279 "high": 5,
280 "step": 1,
281 },
282 "search_strategy": {
283 "type": "categorical",
284 "choices": [
285 "iterdrag",
286 "standard",
287 "rapid",
288 "parallel",
289 "source_based",
290 ],
291 },
292 "max_results": {
293 "type": "int",
294 "low": 10,
295 "high": 100,
296 "step": 10,
297 },
298 }
300 def _objective(
301 self, trial: optuna.Trial, param_space: Dict[str, Any]
302 ) -> float:
303 """
304 Objective function for Optuna optimization.
306 Args:
307 trial: Optuna trial object
308 param_space: Dictionary defining parameter search spaces
310 Returns:
311 Score to maximize
312 """
313 # Generate parameters for this trial
314 params: Dict[str, Any] = {}
315 for param_name, param_config in param_space.items():
316 param_type = param_config["type"]
318 if param_type == "int":
319 params[param_name] = trial.suggest_int(
320 param_name,
321 param_config["low"],
322 param_config["high"],
323 step=param_config.get("step", 1),
324 )
325 elif param_type == "float":
326 params[param_name] = trial.suggest_float(
327 param_name,
328 param_config["low"],
329 param_config["high"],
330 step=param_config.get("step"),
331 log=param_config.get("log", False),
332 )
333 elif param_type == "categorical":
334 params[param_name] = trial.suggest_categorical(
335 param_name, param_config["choices"]
336 )
338 # Log the trial parameters
339 logger.info(f"Trial {trial.number}: {params}")
341 # Update progress callback if available
342 if self.progress_callback:
343 self.progress_callback(
344 trial.number,
345 self.n_trials,
346 {
347 "status": "running",
348 "stage": "trial_started",
349 "trial_number": trial.number,
350 "params": params,
351 "trials_completed": trial.number,
352 "total_trials": self.n_trials,
353 },
354 )
356 # Run an experiment with these parameters
357 try:
358 start_time = time.time()
359 result = self._run_experiment(params)
360 duration = time.time() - start_time
362 # Store details about the trial
363 trial_info = {
364 "trial_number": trial.number,
365 "params": params,
366 "result": result,
367 "score": result.get("score", 0),
368 "duration": duration,
369 "timestamp": datetime.now(UTC).isoformat(),
370 }
371 self.trials_history.append(trial_info)
373 # Update callback with results
374 if self.progress_callback:
375 self.progress_callback(
376 trial.number,
377 self.n_trials,
378 {
379 "status": "completed",
380 "stage": "trial_completed",
381 "trial_number": trial.number,
382 "params": params,
383 "score": result.get("score", 0),
384 "trials_completed": trial.number + 1,
385 "total_trials": self.n_trials,
386 },
387 )
389 logger.info(
390 f"Trial {trial.number} completed: {params}, score: {result['score']:.4f}"
391 )
393 return float(result["score"])
394 except Exception as e:
395 logger.exception(f"Error in trial {trial.number}")
397 # Update callback with error
398 if self.progress_callback:
399 self.progress_callback(
400 trial.number,
401 self.n_trials,
402 {
403 "status": "error",
404 "stage": "trial_error",
405 "trial_number": trial.number,
406 "params": params,
407 "error": str(e),
408 "trials_completed": trial.number,
409 "total_trials": self.n_trials,
410 },
411 )
413 return float("-inf") # Return a very low score for failed trials
415 def _run_experiment(self, params: Dict[str, Any]) -> Dict[str, Any]:
416 """
417 Run a single experiment with the given parameters.
419 Args:
420 params: Dictionary of parameters to test
422 Returns:
423 Results dictionary with metrics and score
424 """
425 # Extract parameters
426 iterations = params.get("iterations", 2)
427 questions_per_iteration = params.get("questions_per_iteration", 2)
428 search_strategy = params.get("search_strategy", "iterdrag")
429 max_results = params.get("max_results", 50)
431 # Initialize profiling tools
432 speed_profiler = SpeedProfiler()
434 # Start profiling
435 speed_profiler.start()
437 try:
438 # Create system configuration
439 system_config = {
440 "iterations": iterations,
441 "questions_per_iteration": questions_per_iteration,
442 "search_strategy": search_strategy,
443 "search_tool": self.search_tool,
444 "max_results": max_results,
445 "model_name": self.model_name,
446 "provider": self.provider,
447 }
449 # Evaluate quality using composite benchmark evaluator
450 # Use a small number of examples for efficiency
451 benchmark_dir = str(Path(self.output_dir) / "benchmark_temp")
452 quality_results = self.benchmark_evaluator.evaluate(
453 system_config=system_config,
454 num_examples=5, # Small number for optimization efficiency
455 output_dir=benchmark_dir,
456 )
458 # Stop timing
459 speed_profiler.stop()
460 timing_results = speed_profiler.get_summary()
462 # Extract key metrics
463 quality_score = quality_results.get("quality_score", 0.0)
464 benchmark_results = quality_results.get("benchmark_results", {})
466 # Speed score: convert duration to a 0-1 score where faster is better
467 # Using a reasonable threshold (e.g., 180 seconds for 5 examples)
468 # Below this threshold: high score, above it: declining score
469 total_duration = timing_results.get("total_duration", 180)
470 speed_score = max(0.0, min(1.0, 1.0 - (total_duration - 60) / 180))
472 # Calculate combined score based on weights
473 combined_score = (
474 self.metric_weights.get("quality", 0.6) * quality_score
475 + self.metric_weights.get("speed", 0.4) * speed_score
476 )
478 # Return streamlined results
479 return {
480 "quality_score": quality_score,
481 "benchmark_results": benchmark_results,
482 "speed_score": speed_score,
483 "total_duration": total_duration,
484 "score": combined_score,
485 "success": True,
486 }
488 except Exception as e:
489 # Stop profiling on error
490 speed_profiler.stop()
492 # Log error
493 logger.exception("Error in experiment")
495 # Return error information
496 return {"error": str(e), "score": 0.0, "success": False}
498 def _optimization_callback(self, study: optuna.Study, trial: optuna.Trial):
499 """
500 Callback for the Optuna optimization process.
502 Args:
503 study: Optuna study object
504 trial: Current trial
505 """
506 # Save intermediate results periodically
507 if trial.number % 10 == 0 and trial.number > 0:
508 self._save_results()
509 self._create_quick_visualizations()
511 def _save_results(self):
512 """Save the optimization results to disk."""
513 # Create a timestamp for filenames
514 timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
516 # Save trial history
517 from ...security.file_write_verifier import write_json_verified
519 history_file = str(
520 Path(self.output_dir) / f"{self.study_name}_history.json"
521 )
523 # Convert numpy values to native Python types for JSON serialization
524 clean_history: List[Dict[str, Any]] = []
525 for trial in self.trials_history:
526 clean_trial: Dict[str, Any] = {}
527 for k, v in trial.items():
528 if isinstance(v, dict):
529 clean_trial[k] = {
530 dk: (float(dv) if isinstance(dv, np.number) else dv)
531 for dk, dv in v.items()
532 }
533 elif isinstance(v, np.number):
534 clean_trial[k] = float(v)
535 else:
536 clean_trial[k] = v
537 clean_history.append(clean_trial)
539 # Sanitize sensitive data before writing to disk
540 sanitized_history = sanitize_data(clean_history)
542 write_json_verified(
543 history_file,
544 sanitized_history,
545 "benchmark.allow_file_output",
546 context="optimization history",
547 )
549 # Save current best parameters
550 if (
551 self.study
552 and hasattr(self.study, "best_params")
553 and self.study.best_params
554 ):
555 best_params_file = str(
556 Path(self.output_dir) / f"{self.study_name}_best_params.json"
557 )
559 best_params_data = {
560 "best_params": self.study.best_params,
561 "best_value": float(self.study.best_value),
562 "n_trials": len(self.study.trials),
563 "timestamp": timestamp,
564 "base_query": self.base_query,
565 "model_name": self.model_name,
566 "provider": self.provider,
567 "search_tool": self.search_tool,
568 "metric_weights": self.metric_weights,
569 "benchmark_weights": self.benchmark_weights,
570 }
572 # Sanitize sensitive data before writing to disk
573 sanitized_best_params = sanitize_data(best_params_data)
575 write_json_verified(
576 best_params_file,
577 sanitized_best_params,
578 "benchmark.allow_file_output",
579 context="optimization best params",
580 )
582 # Save the Optuna study
583 if self.study:
584 study_file = str(
585 Path(self.output_dir) / f"{self.study_name}_study.pkl"
586 )
587 joblib.dump(self.study, study_file)
589 logger.info(f"Results saved to {self.output_dir}")
591 def _create_visualizations(self):
592 """Create and save comprehensive visualizations of the optimization results."""
593 if not PLOTTING_AVAILABLE:
594 logger.warning(
595 "Matplotlib not available, skipping visualization creation"
596 )
597 return
599 if not self.study or len(self.study.trials) < 2:
600 logger.warning("Not enough trials to create visualizations")
601 return
603 # Create directory for visualizations
604 _viz_dir_path = Path(self.output_dir) / "visualizations"
605 _viz_dir_path.mkdir(parents=True, exist_ok=True)
606 viz_dir = str(_viz_dir_path)
608 # Create Optuna visualizations
609 self._create_optuna_visualizations(viz_dir)
611 # Create custom visualizations
612 self._create_custom_visualizations(viz_dir)
614 logger.info(f"Visualizations saved to {viz_dir}")
616 def _create_quick_visualizations(self):
617 """Create a smaller set of visualizations for intermediate progress."""
618 if (
619 not PLOTTING_AVAILABLE
620 or not self.study
621 or len(self.study.trials) < 2
622 ):
623 return
625 # Create directory for visualizations
626 _quick_viz_dir_path = Path(self.output_dir) / "visualizations"
627 _quick_viz_dir_path.mkdir(parents=True, exist_ok=True)
628 viz_dir = str(_quick_viz_dir_path)
630 # Create optimization history only (faster than full visualization)
631 try:
632 fig = plot_optimization_history(self.study)
633 fig.write_image(
634 str(
635 Path(viz_dir)
636 / f"{self.study_name}_optimization_history_current.png"
637 )
638 )
639 except Exception:
640 logger.exception("Error creating optimization history plot")
642 def _create_optuna_visualizations(self, viz_dir: str):
643 """
644 Create and save Optuna's built-in visualizations.
646 Args:
647 viz_dir: Directory to save visualizations
648 """
649 if not self.study: 649 ↛ 650line 649 didn't jump to line 650 because the condition on line 649 was never true
650 return
651 study = self.study
652 timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
654 # 1. Optimization history
655 try:
656 fig = plot_optimization_history(study)
657 fig.write_image(
658 str(
659 Path(viz_dir)
660 / f"{self.study_name}_optimization_history_{timestamp}.png"
661 )
662 )
663 except Exception:
664 logger.exception("Error creating optimization history plot")
666 # 2. Parameter importances
667 try:
668 fig = plot_param_importances(study)
669 fig.write_image(
670 str(
671 Path(viz_dir)
672 / f"{self.study_name}_param_importances_{timestamp}.png"
673 )
674 )
675 except Exception:
676 logger.exception("Error creating parameter importances plot")
678 # 3. Slice plot for each parameter
679 try:
680 for param_name in study.best_params.keys():
681 fig = plot_slice(study, [param_name])
682 fig.write_image(
683 str(
684 Path(viz_dir)
685 / f"{self.study_name}_slice_{param_name}_{timestamp}.png"
686 )
687 )
688 except Exception:
689 logger.exception("Error creating slice plots")
691 # 4. Contour plots for important parameter pairs
692 try:
693 # Get all parameter names
694 param_names = list(study.best_params.keys())
696 # Create contour plots for each pair
697 for i in range(len(param_names)):
698 for j in range(i + 1, len(param_names)): 698 ↛ 699line 698 didn't jump to line 699 because the loop on line 698 never started
699 try:
700 fig = plot_contour(
701 study, params=[param_names[i], param_names[j]]
702 )
703 fig.write_image(
704 str(
705 Path(viz_dir)
706 / f"{self.study_name}_contour_{param_names[i]}_{param_names[j]}_{timestamp}.png"
707 )
708 )
709 except Exception:
710 logger.warning(
711 f"Error creating contour plot for {param_names[i]} vs {param_names[j]}"
712 )
713 except Exception:
714 logger.exception("Error creating contour plots")
716 def _create_custom_visualizations(self, viz_dir: str):
717 """
718 Create custom visualizations based on trial history.
720 Args:
721 viz_dir: Directory to save visualizations
722 """
723 if not self.trials_history:
724 return
726 timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
728 # Create quality vs speed plot
729 self._create_quality_vs_speed_plot(viz_dir, timestamp)
731 # Create parameter evolution plots
732 self._create_parameter_evolution_plots(viz_dir, timestamp)
734 # Create trial duration vs score plot
735 self._create_duration_vs_score_plot(viz_dir, timestamp)
737 def _create_quality_vs_speed_plot(self, viz_dir: str, timestamp: str):
738 """Create a plot showing quality vs. speed trade-off."""
739 if not self.trials_history: 739 ↛ 740line 739 didn't jump to line 740 because the condition on line 739 was never true
740 return
742 # Extract data from successful trials
743 successful_trials = [
744 t
745 for t in self.trials_history
746 if t.get("result", {}).get("success", False)
747 ]
749 if not successful_trials:
750 logger.warning("No successful trials for visualization")
751 return
753 try:
754 plt.figure(figsize=(10, 8))
756 # Extract metrics
757 quality_scores = []
758 speed_scores = []
759 labels = []
760 iterations_values = []
761 questions_values = []
763 for trial in successful_trials: 763 ↛ 777line 763 didn't jump to line 777 because the loop on line 763 didn't complete
764 result = trial["result"]
765 quality = result.get("quality_score", 0)
766 speed = result.get("speed_score", 0)
767 iterations = trial["params"].get("iterations", 0)
768 questions = trial["params"].get("questions_per_iteration", 0)
770 quality_scores.append(quality)
771 speed_scores.append(speed)
772 labels.append(f"Trial {trial['trial_number']}")
773 iterations_values.append(iterations)
774 questions_values.append(questions)
776 # Create scatter plot with size based on iterations*questions
777 sizes = [
778 i * q * 5
779 for i, q in zip(
780 iterations_values, questions_values, strict=False
781 )
782 ]
783 scatter = plt.scatter(
784 quality_scores,
785 speed_scores,
786 s=sizes,
787 alpha=0.7,
788 c=range(len(quality_scores)),
789 cmap="viridis",
790 )
792 # Highlight best trial
793 best_trial = max(
794 successful_trials,
795 key=lambda x: x.get("result", {}).get("score", 0),
796 )
797 best_quality = best_trial["result"].get("quality_score", 0)
798 best_speed = best_trial["result"].get("speed_score", 0)
799 best_iter = best_trial["params"].get("iterations", 0)
800 best_questions = best_trial["params"].get(
801 "questions_per_iteration", 0
802 )
804 plt.scatter(
805 [best_quality],
806 [best_speed],
807 s=200,
808 facecolors="none",
809 edgecolors="red",
810 linewidth=2,
811 label=f"Best: {best_iter}×{best_questions}",
812 )
814 # Add annotations for key points
815 for i, (q, s, label) in enumerate(
816 zip(quality_scores, speed_scores, labels, strict=False)
817 ):
818 if i % max(1, len(quality_scores) // 5) == 0: # Label ~5 points
819 plt.annotate(
820 f"{iterations_values[i]}×{questions_values[i]}",
821 (q, s),
822 xytext=(5, 5),
823 textcoords="offset points",
824 )
826 # Add colorbar and labels
827 cbar = plt.colorbar(scatter)
828 cbar.set_label("Trial Progression")
830 # Add benchmark weight information
831 weights_str = ", ".join(
832 [f"{k}:{v:.1f}" for k, v in self.benchmark_weights.items()]
833 )
834 plt.title(
835 f"Quality vs. Speed Trade-off\nBenchmark Weights: {weights_str}"
836 )
837 plt.xlabel("Quality Score (Benchmark Accuracy)")
838 plt.ylabel("Speed Score")
839 plt.grid(True, linestyle="--", alpha=0.7)
841 # Add legend explaining size
842 legend_elements = [
843 Line2D(
844 [0],
845 [0],
846 marker="o",
847 color="w",
848 markerfacecolor="gray",
849 markersize=np.sqrt(n * 5 / np.pi),
850 label=f"{n} Total Questions",
851 )
852 for n in [5, 10, 15, 20, 25]
853 ]
854 plt.legend(handles=legend_elements, title="Workload")
856 # Save the figure
857 plt.tight_layout()
858 plt.savefig(
859 str(
860 Path(viz_dir)
861 / f"{self.study_name}_quality_vs_speed_{timestamp}.png"
862 )
863 )
864 plt.close()
865 except Exception:
866 logger.exception("Error creating quality vs speed plot")
868 def _create_parameter_evolution_plots(self, viz_dir: str, timestamp: str):
869 """Create plots showing how parameter values evolve over trials."""
870 try:
871 successful_trials = [
872 t
873 for t in self.trials_history
874 if t.get("result", {}).get("success", False)
875 ]
877 if not successful_trials or len(successful_trials) < 5: 877 ↛ 881line 877 didn't jump to line 881 because the condition on line 877 was always true
878 return
880 # Get key parameters
881 main_params = list(successful_trials[0]["params"].keys())
883 # For each parameter, plot its values over trials
884 for param_name in main_params:
885 plt.figure(figsize=(12, 6))
887 trial_numbers = []
888 param_values = []
889 scores = []
891 for trial in self.trials_history:
892 if "params" in trial and param_name in trial["params"]:
893 trial_numbers.append(trial["trial_number"])
894 param_values.append(trial["params"][param_name])
895 scores.append(trial.get("score", 0))
897 # Create evolution plot
898 scatter = plt.scatter(
899 trial_numbers,
900 param_values,
901 c=scores,
902 cmap="plasma",
903 alpha=0.8,
904 s=80,
905 )
907 # Add best trial marker
908 best_trial_idx = scores.index(max(scores))
909 plt.scatter(
910 [trial_numbers[best_trial_idx]],
911 [param_values[best_trial_idx]],
912 s=150,
913 facecolors="none",
914 edgecolors="red",
915 linewidth=2,
916 label=f"Best Value: {param_values[best_trial_idx]}",
917 )
919 # Add colorbar
920 cbar = plt.colorbar(scatter)
921 cbar.set_label("Score")
923 # Set chart properties
924 plt.title(f"Evolution of {param_name} Values")
925 plt.xlabel("Trial Number")
926 plt.ylabel(param_name)
927 plt.grid(True, linestyle="--", alpha=0.7)
928 plt.legend()
930 # For categorical parameters, adjust y-axis
931 if isinstance(param_values[0], str):
932 unique_values = sorted(set(param_values))
933 plt.yticks(range(len(unique_values)), unique_values)
935 # Save the figure
936 plt.tight_layout()
937 plt.savefig(
938 str(
939 Path(viz_dir)
940 / f"{self.study_name}_param_evolution_{param_name}_{timestamp}.png"
941 )
942 )
943 plt.close()
944 except Exception:
945 logger.exception("Error creating parameter evolution plots")
947 def _create_duration_vs_score_plot(self, viz_dir: str, timestamp: str):
948 """Create a plot showing trial duration vs score."""
949 try:
950 plt.figure(figsize=(10, 6))
952 successful_trials = [
953 t
954 for t in self.trials_history
955 if t.get("result", {}).get("success", False)
956 ]
958 if not successful_trials: 958 ↛ 959line 958 didn't jump to line 959 because the condition on line 958 was never true
959 return
961 trial_durations = []
962 trial_scores = []
963 trial_iterations = []
964 trial_questions = []
966 for trial in successful_trials:
967 duration = trial.get("duration", 0)
968 score = trial.get("score", 0)
969 iterations = trial.get("params", {}).get("iterations", 1)
970 questions = trial.get("params", {}).get(
971 "questions_per_iteration", 1
972 )
974 trial_durations.append(duration)
975 trial_scores.append(score)
976 trial_iterations.append(iterations)
977 trial_questions.append(questions)
979 # Total questions per trial
980 total_questions = [
981 i * q
982 for i, q in zip(trial_iterations, trial_questions, strict=False)
983 ]
985 # Create scatter plot with size based on total questions
986 plt.scatter(
987 trial_durations,
988 trial_scores,
989 s=[
990 q * 5 for q in total_questions
991 ], # Size based on total questions
992 alpha=0.7,
993 c=range(len(trial_durations)),
994 cmap="viridis",
995 )
997 # Add labels
998 plt.xlabel("Trial Duration (seconds)")
999 plt.ylabel("Score")
1000 plt.title("Trial Duration vs. Score")
1001 plt.grid(True, linestyle="--", alpha=0.7)
1003 # Add trial number annotations for selected points
1004 for i, (d, s) in enumerate(
1005 zip(trial_durations, trial_scores, strict=False)
1006 ):
1007 if ( 1007 ↛ 1004line 1007 didn't jump to line 1004 because the condition on line 1007 was always true
1008 i % max(1, len(trial_durations) // 5) == 0
1009 ): # Annotate ~5 points
1010 plt.annotate(
1011 f"{trial_iterations[i]}×{trial_questions[i]}",
1012 (d, s),
1013 xytext=(5, 5),
1014 textcoords="offset points",
1015 )
1017 # Save the figure
1018 plt.tight_layout()
1019 plt.savefig(
1020 str(
1021 Path(viz_dir)
1022 / f"{self.study_name}_duration_vs_score_{timestamp}.png"
1023 )
1024 )
1025 plt.close()
1026 except Exception:
1027 logger.exception("Error creating duration vs score plot")
def optimize_parameters(
    query: str,
    param_space: Optional[Dict[str, Any]] = None,
    output_dir: str = str(Path("data") / "optimization_results"),
    model_name: Optional[str] = None,
    provider: Optional[str] = None,
    search_tool: Optional[str] = None,
    temperature: float = 0.7,
    n_trials: int = 30,
    timeout: Optional[int] = None,
    n_jobs: int = 1,
    study_name: Optional[str] = None,
    optimization_metrics: Optional[List[str]] = None,
    metric_weights: Optional[Dict[str, float]] = None,
    progress_callback: Optional[Callable[[int, int, Dict], None]] = None,
    benchmark_weights: Optional[Dict[str, float]] = None,
) -> Tuple[Dict[str, Any], float]:
    """
    Optimize parameters for Local Deep Research.

    Convenience wrapper: builds an :class:`OptunaOptimizer` from the given
    settings and runs the optimization once.

    Args:
        query: The research query to use for all experiments
        param_space: Dictionary defining parameter search spaces (optional)
        output_dir: Directory to save optimization results
        model_name: Name of the LLM model to use
        provider: LLM provider
        search_tool: Search engine to use
        temperature: LLM temperature
        n_trials: Number of parameter combinations to try
        timeout: Maximum seconds to run optimization (None for no limit)
        n_jobs: Number of parallel jobs for optimization
        study_name: Name of the Optuna study
        optimization_metrics: List of metrics to optimize (default: ["quality", "speed"])
        metric_weights: Dictionary of weights for each metric (e.g., {"quality": 0.6, "speed": 0.4})
        progress_callback: Optional callback for progress updates
        benchmark_weights: Dictionary mapping benchmark types to weights
            (e.g., {"simpleqa": 0.6, "browsecomp": 0.4})
            If None, only SimpleQA is used with weight 1.0

    Returns:
        Tuple of (best_parameters, best_score)
    """
    tuner = OptunaOptimizer(
        base_query=query,
        output_dir=output_dir,
        model_name=model_name,
        provider=provider,
        search_tool=search_tool,
        temperature=temperature,
        n_trials=n_trials,
        timeout=timeout,
        n_jobs=n_jobs,
        study_name=study_name,
        optimization_metrics=optimization_metrics,
        metric_weights=metric_weights,
        progress_callback=progress_callback,
        benchmark_weights=benchmark_weights,
    )
    return tuner.optimize(param_space)
def optimize_for_speed(
    query: str,
    n_trials: int = 20,
    output_dir: str = str(Path("data") / "optimization_results"),
    model_name: Optional[str] = None,
    provider: Optional[str] = None,
    search_tool: Optional[str] = None,
    progress_callback: Optional[Callable[[int, int, Dict], None]] = None,
    benchmark_weights: Optional[Dict[str, float]] = None,
) -> Tuple[Dict[str, Any], float]:
    """
    Optimize parameters with a focus on speed performance.

    Uses a trimmed search space (fewer iterations/questions, only the fast
    search strategies) and weights speed 0.8 vs quality 0.2.

    Args:
        query: The research query to use for all experiments
        n_trials: Number of parameter combinations to try
        output_dir: Directory to save optimization results
        model_name: Name of the LLM model to use
        provider: LLM provider
        search_tool: Search engine to use
        progress_callback: Optional callback for progress updates
        benchmark_weights: Dictionary mapping benchmark types to weights
            (e.g., {"simpleqa": 0.6, "browsecomp": 0.4})
            If None, only SimpleQA is used with weight 1.0

    Returns:
        Tuple of (best_parameters, best_score)
    """

    def small_int_range() -> Dict[str, Any]:
        # Shared 1..3 integer range for the reduced search space.
        return {"type": "int", "low": 1, "high": 3, "step": 1}

    param_space = {
        "iterations": small_int_range(),
        "questions_per_iteration": small_int_range(),
        "search_strategy": {
            "type": "categorical",
            "choices": ["rapid", "parallel", "source_based"],
        },
    }

    return optimize_parameters(
        query=query,
        param_space=param_space,
        output_dir=output_dir,
        model_name=model_name,
        provider=provider,
        search_tool=search_tool,
        n_trials=n_trials,
        metric_weights={"speed": 0.8, "quality": 0.2},
        optimization_metrics=["speed", "quality"],
        progress_callback=progress_callback,
        benchmark_weights=benchmark_weights,
    )
def optimize_for_quality(
    query: str,
    n_trials: int = 30,
    output_dir: str = str(Path("data") / "optimization_results"),
    model_name: Optional[str] = None,
    provider: Optional[str] = None,
    search_tool: Optional[str] = None,
    progress_callback: Optional[Callable[[int, int, Dict], None]] = None,
    benchmark_weights: Optional[Dict[str, float]] = None,
) -> Tuple[Dict[str, Any], float]:
    """
    Optimize parameters with a focus on result quality.

    Args:
        query: The research query to use for all experiments
        n_trials: Number of parameter combinations to try
        output_dir: Directory to save optimization results
        model_name: Name of the LLM model to use
        provider: LLM provider
        search_tool: Search engine to use
        progress_callback: Optional callback for progress updates
        benchmark_weights: Dictionary mapping benchmark types to weights
                          (e.g., {"simpleqa": 0.6, "browsecomp": 0.4})
                          If None, only SimpleQA is used with weight 1.0

    Returns:
        Tuple of (best_parameters, best_score)
    """
    # Use the default (full) parameter space; only the metric weighting
    # differs here — quality dominates, speed is a minor tie-breaker.
    quality_weights = {"quality": 0.9, "speed": 0.1}

    return optimize_parameters(
        query=query,
        output_dir=output_dir,
        model_name=model_name,
        provider=provider,
        search_tool=search_tool,
        n_trials=n_trials,
        metric_weights=quality_weights,
        optimization_metrics=["quality", "speed"],
        progress_callback=progress_callback,
        benchmark_weights=benchmark_weights,
    )
def optimize_for_efficiency(
    query: str,
    n_trials: int = 25,
    output_dir: str = str(Path("data") / "optimization_results"),
    model_name: Optional[str] = None,
    provider: Optional[str] = None,
    search_tool: Optional[str] = None,
    progress_callback: Optional[Callable[[int, int, Dict], None]] = None,
    benchmark_weights: Optional[Dict[str, float]] = None,
) -> Tuple[Dict[str, Any], float]:
    """
    Optimize parameters with a focus on resource efficiency.

    Args:
        query: The research query to use for all experiments
        n_trials: Number of parameter combinations to try
        output_dir: Directory to save optimization results
        model_name: Name of the LLM model to use
        provider: LLM provider
        search_tool: Search engine to use
        progress_callback: Optional callback for progress updates
        benchmark_weights: Dictionary mapping benchmark types to weights
                          (e.g., {"simpleqa": 0.6, "browsecomp": 0.4})
                          If None, only SimpleQA is used with weight 1.0

    Returns:
        Tuple of (best_parameters, best_score)
    """
    # Balanced objective: quality leads slightly, with speed and
    # resource usage sharing the remainder equally.
    efficiency_weights = {"quality": 0.4, "speed": 0.3, "resource": 0.3}

    return optimize_parameters(
        query=query,
        output_dir=output_dir,
        model_name=model_name,
        provider=provider,
        search_tool=search_tool,
        n_trials=n_trials,
        metric_weights=efficiency_weights,
        optimization_metrics=["quality", "speed", "resource"],
        progress_callback=progress_callback,
        benchmark_weights=benchmark_weights,
    )