Coverage for src/local_deep_research/benchmarks/optimization/optuna_optimizer.py: 9%

346 statements  

coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

1""" 

2Optuna-based parameter optimizer for Local Deep Research. 

3 

4This module provides the core optimization functionality using Optuna 

5to find optimal parameters for the research system, balancing quality 

6and performance metrics. 

7""" 

8 

9import os 

10from pathlib import Path 

11import time 

12from datetime import datetime, UTC 

13from functools import partial 

14from typing import Any, Callable, Dict, List, Optional, Tuple 

15 

16import joblib 

17import numpy as np 

18import optuna 

19from optuna.visualization import ( 

20 plot_contour, 

21 plot_optimization_history, 

22 plot_param_importances, 

23 plot_slice, 

24) 

25 

26from local_deep_research.benchmarks.efficiency.speed_profiler import ( 

27 SpeedProfiler, 

28) 

29from local_deep_research.security import sanitize_data 

30from loguru import logger 

31 

32from local_deep_research.benchmarks.evaluators import ( 

33 CompositeBenchmarkEvaluator, 

34) 

35 

36# Import benchmark evaluator components 

37 

38# Try to import visualization libraries, but don't fail if not available 

39try: 

40 import matplotlib.pyplot as plt 

41 from matplotlib.lines import Line2D 

42 

43 # We'll use matplotlib for plotting visualization results 

44 

45 PLOTTING_AVAILABLE = True 

46except ImportError: 

47 PLOTTING_AVAILABLE = False 

48 logger.warning("Matplotlib not available, visualization will be limited") 

49 

50 

class OptunaOptimizer:
    """
    Optimize parameters for Local Deep Research using Optuna.

    This class provides functionality to:
    1. Define search spaces for parameter optimization
    2. Evaluate parameter combinations using objective functions
    3. Find optimal parameters via Optuna
    4. Visualize and analyze optimization results
    """

    def __init__(
        self,
        base_query: str,
        output_dir: str = "optimization_results",
        model_name: Optional[str] = None,
        provider: Optional[str] = None,
        search_tool: Optional[str] = None,
        temperature: float = 0.7,
        n_trials: int = 30,
        timeout: Optional[int] = None,
        n_jobs: int = 1,
        study_name: Optional[str] = None,
        optimization_metrics: Optional[List[str]] = None,
        metric_weights: Optional[Dict[str, float]] = None,
        progress_callback: Optional[Callable[[int, int, Dict], None]] = None,
        benchmark_weights: Optional[Dict[str, float]] = None,
    ):
        """
        Initialize the optimizer.

        Args:
            base_query: The research query to use for all experiments
            output_dir: Directory to save optimization results
            model_name: Name of the LLM model to use
            provider: LLM provider
            search_tool: Search engine to use
            temperature: LLM temperature
            n_trials: Number of parameter combinations to try
            timeout: Maximum seconds to run optimization (None for no limit)
            n_jobs: Number of parallel jobs for optimization
            study_name: Name of the Optuna study
            optimization_metrics: List of metrics to optimize (default: ["quality", "speed"])
            metric_weights: Dictionary of weights for each metric (e.g., {"quality": 0.6, "speed": 0.4})
            progress_callback: Optional callback for progress updates
            benchmark_weights: Dictionary mapping benchmark types to weights
                (e.g., {"simpleqa": 0.6, "browsecomp": 0.4})
                If None, only SimpleQA is used with weight 1.0
        """
        self.base_query = base_query
        self.output_dir = output_dir
        self.model_name = model_name
        self.provider = provider
        self.search_tool = search_tool
        self.temperature = temperature
        self.n_trials = n_trials
        self.timeout = timeout
        self.n_jobs = n_jobs
        self.optimization_metrics = optimization_metrics or ["quality", "speed"]
        self.metric_weights = metric_weights or {"quality": 0.6, "speed": 0.4}
        self.progress_callback = progress_callback

        # Initialize benchmark evaluator with weights
        self.benchmark_weights = benchmark_weights or {"simpleqa": 1.0}
        self.benchmark_evaluator = CompositeBenchmarkEvaluator(
            self.benchmark_weights
        )

        # Normalize weights to sum to 1.0
        total_weight = sum(self.metric_weights.values())
        if total_weight > 0:
            self.metric_weights = {
                k: v / total_weight for k, v in self.metric_weights.items()
            }

        # Generate a unique study name if not provided
        timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
        self.study_name = study_name or f"ldr_opt_{timestamp}"

        # Create output directory
        os.makedirs(output_dir, exist_ok=True)

        # Store the trial history for analysis
        self.trials_history = []

        # Storage for the best parameters and study
        self.best_params = None
        self.study = None

    def optimize(
        self, param_space: Optional[Dict[str, Any]] = None
    ) -> Tuple[Dict[str, Any], float]:
        """
        Run the optimization process using Optuna.

        Args:
            param_space: Dictionary defining parameter search spaces
                (if None, use default spaces)

        Returns:
            Tuple containing (best_parameters, best_score)
        """
        param_space = param_space or self._get_default_param_space()

        # Create a study object
        storage_name = f"sqlite:///{self.output_dir}/{self.study_name}.db"
        self.study = optuna.create_study(
            study_name=self.study_name,
            storage=storage_name,
            load_if_exists=True,
            direction="maximize",
            sampler=optuna.samplers.TPESampler(seed=42),
        )

        # Create partial function with param_space
        objective = partial(self._objective, param_space=param_space)

        # Log optimization start
        logger.info(
            f"Starting optimization with {self.n_trials} trials, {self.n_jobs} parallel jobs"
        )
        logger.info(f"Parameter space: {param_space}")
        logger.info(f"Metric weights: {self.metric_weights}")
        logger.info(f"Benchmark weights: {self.benchmark_weights}")

        # Initialize progress tracking
        if self.progress_callback:
            self.progress_callback(
                0,
                self.n_trials,
                {
                    "status": "starting",
                    "stage": "initialization",
                    "trials_completed": 0,
                    "total_trials": self.n_trials,
                },
            )

        try:
            # Run optimization
            self.study.optimize(
                objective,
                n_trials=self.n_trials,
                timeout=self.timeout,
                n_jobs=self.n_jobs,
                callbacks=[self._optimization_callback],
                show_progress_bar=True,
            )

            # Store best parameters
            self.best_params = self.study.best_params

            # Save the results
            self._save_results()

            # Create visualizations
            self._create_visualizations()

            logger.info(
                f"Optimization complete. Best parameters: {self.best_params}"
            )
            logger.info(f"Best value: {self.study.best_value}")

            # Report completion
            if self.progress_callback:
                self.progress_callback(
                    self.n_trials,
                    self.n_trials,
                    {
                        "status": "completed",
                        "stage": "finished",
                        "trials_completed": len(self.study.trials),
                        "total_trials": self.n_trials,
                        "best_params": self.best_params,
                        "best_value": self.study.best_value,
                    },
                )

            return self.best_params, self.study.best_value

        except KeyboardInterrupt:
            logger.info("Optimization interrupted by user")
            # Still save what we have
            self._save_results()
            self._create_visualizations()

            # Report interruption
            if self.progress_callback:
                self.progress_callback(
                    len(self.study.trials),
                    self.n_trials,
                    {
                        "status": "interrupted",
                        "stage": "interrupted",
                        "trials_completed": len(self.study.trials),
                        "total_trials": self.n_trials,
                        "best_params": self.study.best_params,
                        "best_value": self.study.best_value,
                    },
                )

            return self.study.best_params, self.study.best_value
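    # Note (illustrative, not part of the original module): because the study is
    # created with SQLite storage and load_if_exists=True, re-running with the
    # same study_name and output_dir should resume the existing study, e.g.:
    #
    #     optimizer = OptunaOptimizer("my query", study_name="ldr_opt_resume")
    #     optimizer.optimize()   # run again later to keep adding trials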

    def _get_default_param_space(self) -> Dict[str, Any]:
        """
        Get default parameter search space.

        Returns:
            Dictionary defining the default parameter search spaces
        """
        return {
            "iterations": {
                "type": "int",
                "low": 1,
                "high": 5,
                "step": 1,
            },
            "questions_per_iteration": {
                "type": "int",
                "low": 1,
                "high": 5,
                "step": 1,
            },
            "search_strategy": {
                "type": "categorical",
                "choices": [
                    "iterdrag",
                    "standard",
                    "rapid",
                    "parallel",
                    "source_based",
                ],
            },
            "max_results": {
                "type": "int",
                "low": 10,
                "high": 100,
                "step": 10,
            },
        }
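    # Illustrative sketch (not part of the original module): _objective also
    # understands "float" entries with optional log scaling, so a custom search
    # space passed to optimize() could look like the dict below. The float
    # parameter name is hypothetical and would still need to be consumed by
    # _run_experiment to have any effect.
    #
    #     custom_space = {
    #         "iterations": {"type": "int", "low": 1, "high": 3, "step": 1},
    #         "hypothetical_float_param": {
    #             "type": "float",
    #             "low": 0.01,
    #             "high": 1.0,
    #             "log": True,
    #         },
    #     }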

    def _objective(
        self, trial: optuna.Trial, param_space: Dict[str, Any]
    ) -> float:
        """
        Objective function for Optuna optimization.

        Args:
            trial: Optuna trial object
            param_space: Dictionary defining parameter search spaces

        Returns:
            Score to maximize
        """
        # Generate parameters for this trial
        params = {}
        for param_name, param_config in param_space.items():
            param_type = param_config["type"]

            if param_type == "int":
                params[param_name] = trial.suggest_int(
                    param_name,
                    param_config["low"],
                    param_config["high"],
                    step=param_config.get("step", 1),
                )
            elif param_type == "float":
                params[param_name] = trial.suggest_float(
                    param_name,
                    param_config["low"],
                    param_config["high"],
                    step=param_config.get("step"),
                    log=param_config.get("log", False),
                )
            elif param_type == "categorical":
                params[param_name] = trial.suggest_categorical(
                    param_name, param_config["choices"]
                )

        # Log the trial parameters
        logger.info(f"Trial {trial.number}: {params}")

        # Update progress callback if available
        if self.progress_callback:
            self.progress_callback(
                trial.number,
                self.n_trials,
                {
                    "status": "running",
                    "stage": "trial_started",
                    "trial_number": trial.number,
                    "params": params,
                    "trials_completed": trial.number,
                    "total_trials": self.n_trials,
                },
            )

        # Run an experiment with these parameters
        try:
            start_time = time.time()
            result = self._run_experiment(params)
            duration = time.time() - start_time

            # Store details about the trial
            trial_info = {
                "trial_number": trial.number,
                "params": params,
                "result": result,
                "score": result.get("score", 0),
                "duration": duration,
                "timestamp": datetime.now(UTC).isoformat(),
            }
            self.trials_history.append(trial_info)

            # Update callback with results
            if self.progress_callback:
                self.progress_callback(
                    trial.number,
                    self.n_trials,
                    {
                        "status": "completed",
                        "stage": "trial_completed",
                        "trial_number": trial.number,
                        "params": params,
                        "score": result.get("score", 0),
                        "trials_completed": trial.number + 1,
                        "total_trials": self.n_trials,
                    },
                )

            logger.info(
                f"Trial {trial.number} completed: {params}, score: {result['score']:.4f}"
            )

            return result["score"]
        except Exception as e:
            logger.exception(f"Error in trial {trial.number}: {e!s}")

            # Update callback with error
            if self.progress_callback:
                self.progress_callback(
                    trial.number,
                    self.n_trials,
                    {
                        "status": "error",
                        "stage": "trial_error",
                        "trial_number": trial.number,
                        "params": params,
                        "error": str(e),
                        "trials_completed": trial.number,
                        "total_trials": self.n_trials,
                    },
                )

            return float("-inf")  # Return a very low score for failed trials

    def _run_experiment(self, params: Dict[str, Any]) -> Dict[str, Any]:
        """
        Run a single experiment with the given parameters.

        Args:
            params: Dictionary of parameters to test

        Returns:
            Results dictionary with metrics and score
        """
        # Extract parameters
        iterations = params.get("iterations", 2)
        questions_per_iteration = params.get("questions_per_iteration", 2)
        search_strategy = params.get("search_strategy", "iterdrag")
        max_results = params.get("max_results", 50)

        # Initialize profiling tools
        speed_profiler = SpeedProfiler()

        # Start profiling
        speed_profiler.start()

        try:
            # Create system configuration
            system_config = {
                "iterations": iterations,
                "questions_per_iteration": questions_per_iteration,
                "search_strategy": search_strategy,
                "search_tool": self.search_tool,
                "max_results": max_results,
                "model_name": self.model_name,
                "provider": self.provider,
            }

            # Evaluate quality using the composite benchmark evaluator.
            # Use a small number of examples for efficiency.
            benchmark_dir = str(Path(self.output_dir) / "benchmark_temp")
            quality_results = self.benchmark_evaluator.evaluate(
                system_config=system_config,
                num_examples=5,  # Small number for optimization efficiency
                output_dir=benchmark_dir,
            )

            # Stop timing
            speed_profiler.stop()
            timing_results = speed_profiler.get_summary()

            # Extract key metrics
            quality_score = quality_results.get("quality_score", 0.0)
            benchmark_results = quality_results.get("benchmark_results", {})

            # Speed score: map total duration to a 0-1 score where faster is
            # better. Durations of 60 seconds or less score 1.0; the score then
            # declines linearly and reaches 0.0 at 240 seconds (for 5 examples).
            total_duration = timing_results.get("total_duration", 180)
            speed_score = max(0.0, min(1.0, 1.0 - (total_duration - 60) / 180))

            # Calculate combined score based on weights
            combined_score = (
                self.metric_weights.get("quality", 0.6) * quality_score
                + self.metric_weights.get("speed", 0.4) * speed_score
            )

            # Return streamlined results
            return {
                "quality_score": quality_score,
                "benchmark_results": benchmark_results,
                "speed_score": speed_score,
                "total_duration": total_duration,
                "score": combined_score,
                "success": True,
            }

        except Exception as e:
            # Stop profiling on error
            speed_profiler.stop()

            # Log error
            logger.exception(f"Error in experiment: {e!s}")

            # Return error information
            return {"error": str(e), "score": 0.0, "success": False}
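    # Worked example (illustrative, not part of the original module): with the
    # default metric weights {"quality": 0.6, "speed": 0.4}, a run that scores
    # 0.8 on the benchmark and takes 150 seconds gives
    #     speed_score    = 1.0 - (150 - 60) / 180 = 0.5
    #     combined_score = 0.6 * 0.8 + 0.4 * 0.5  = 0.68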

    def _optimization_callback(
        self, study: optuna.Study, trial: optuna.trial.FrozenTrial
    ):
        """
        Callback for the Optuna optimization process.

        Args:
            study: Optuna study object
            trial: The trial that just finished
        """
        # Save intermediate results periodically
        if trial.number % 10 == 0 and trial.number > 0:
            self._save_results()
            self._create_quick_visualizations()

    def _save_results(self):
        """Save the optimization results to disk."""
        # Create a timestamp for filenames
        timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")

        # Save trial history
        from ...security.file_write_verifier import write_json_verified

        history_file = str(
            Path(self.output_dir) / f"{self.study_name}_history.json"
        )

        # Convert numpy values to native Python types for JSON serialization
        clean_history = []
        for trial in self.trials_history:
            clean_trial = {}
            for k, v in trial.items():
                if isinstance(v, dict):
                    clean_trial[k] = {
                        dk: (float(dv) if isinstance(dv, np.number) else dv)
                        for dk, dv in v.items()
                    }
                elif isinstance(v, np.number):
                    clean_trial[k] = float(v)
                else:
                    clean_trial[k] = v
            clean_history.append(clean_trial)

        # Sanitize sensitive data before writing to disk
        sanitized_history = sanitize_data(clean_history)

        write_json_verified(
            history_file,
            sanitized_history,
            "benchmark.allow_file_output",
            context="optimization history",
        )

        # Save current best parameters
        if (
            self.study
            and hasattr(self.study, "best_params")
            and self.study.best_params
        ):
            best_params_file = str(
                Path(self.output_dir) / f"{self.study_name}_best_params.json"
            )

            best_params_data = {
                "best_params": self.study.best_params,
                "best_value": float(self.study.best_value),
                "n_trials": len(self.study.trials),
                "timestamp": timestamp,
                "base_query": self.base_query,
                "model_name": self.model_name,
                "provider": self.provider,
                "search_tool": self.search_tool,
                "metric_weights": self.metric_weights,
                "benchmark_weights": self.benchmark_weights,
            }

            # Sanitize sensitive data before writing to disk
            sanitized_best_params = sanitize_data(best_params_data)

            write_json_verified(
                best_params_file,
                sanitized_best_params,
                "benchmark.allow_file_output",
                context="optimization best params",
            )

        # Save the Optuna study
        if self.study:
            study_file = str(
                Path(self.output_dir) / f"{self.study_name}_study.pkl"
            )
            joblib.dump(self.study, study_file)

        logger.info(f"Results saved to {self.output_dir}")
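    # Output layout produced by _save_results and the visualization helpers
    # (paths follow directly from the code above):
    #
    #     {output_dir}/{study_name}.db                 # Optuna SQLite storage
    #     {output_dir}/{study_name}_history.json       # per-trial history
    #     {output_dir}/{study_name}_best_params.json   # best parameters so far
    #     {output_dir}/{study_name}_study.pkl          # pickled Optuna study
    #     {output_dir}/visualizations/                 # PNG plots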

    def _create_visualizations(self):
        """Create and save comprehensive visualizations of the optimization results."""
        if not PLOTTING_AVAILABLE:
            logger.warning(
                "Matplotlib not available, skipping visualization creation"
            )
            return

        if not self.study or len(self.study.trials) < 2:
            logger.warning("Not enough trials to create visualizations")
            return

        # Create directory for visualizations
        viz_dir = Path(self.output_dir) / "visualizations"
        viz_dir.mkdir(parents=True, exist_ok=True)
        viz_dir = str(viz_dir)

        # Create Optuna visualizations
        self._create_optuna_visualizations(viz_dir)

        # Create custom visualizations
        self._create_custom_visualizations(viz_dir)

        logger.info(f"Visualizations saved to {viz_dir}")

    def _create_quick_visualizations(self):
        """Create a smaller set of visualizations for intermediate progress."""
        if (
            not PLOTTING_AVAILABLE
            or not self.study
            or len(self.study.trials) < 2
        ):
            return

        # Create directory for visualizations
        viz_dir = Path(self.output_dir) / "visualizations"
        viz_dir.mkdir(parents=True, exist_ok=True)
        viz_dir = str(viz_dir)

        # Create optimization history only (faster than full visualization)
        try:
            fig = plot_optimization_history(self.study)
            fig.write_image(
                str(
                    Path(viz_dir)
                    / f"{self.study_name}_optimization_history_current.png"
                )
            )
        except Exception as e:
            logger.exception(f"Error creating optimization history plot: {e!s}")

    def _create_optuna_visualizations(self, viz_dir: str):
        """
        Create and save Optuna's built-in visualizations.

        Args:
            viz_dir: Directory to save visualizations
        """
        timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")

        # 1. Optimization history
        try:
            fig = plot_optimization_history(self.study)
            fig.write_image(
                str(
                    Path(viz_dir)
                    / f"{self.study_name}_optimization_history_{timestamp}.png"
                )
            )
        except Exception as e:
            logger.exception(f"Error creating optimization history plot: {e!s}")

        # 2. Parameter importances
        try:
            fig = plot_param_importances(self.study)
            fig.write_image(
                str(
                    Path(viz_dir)
                    / f"{self.study_name}_param_importances_{timestamp}.png"
                )
            )
        except Exception as e:
            logger.exception(
                f"Error creating parameter importances plot: {e!s}"
            )

        # 3. Slice plot for each parameter
        try:
            for param_name in self.study.best_params.keys():
                fig = plot_slice(self.study, [param_name])
                fig.write_image(
                    str(
                        Path(viz_dir)
                        / f"{self.study_name}_slice_{param_name}_{timestamp}.png"
                    )
                )
        except Exception as e:
            logger.exception(f"Error creating slice plots: {e!s}")

        # 4. Contour plots for important parameter pairs
        try:
            # Get all parameter names
            param_names = list(self.study.best_params.keys())

            # Create contour plots for each pair
            for i in range(len(param_names)):
                for j in range(i + 1, len(param_names)):
                    try:
                        fig = plot_contour(
                            self.study, params=[param_names[i], param_names[j]]
                        )
                        fig.write_image(
                            str(
                                Path(viz_dir)
                                / f"{self.study_name}_contour_{param_names[i]}_{param_names[j]}_{timestamp}.png"
                            )
                        )
                    except Exception as e:
                        logger.warning(
                            f"Error creating contour plot for {param_names[i]} vs {param_names[j]}: {e!s}"
                        )
        except Exception as e:
            logger.exception(f"Error creating contour plots: {e!s}")

    def _create_custom_visualizations(self, viz_dir: str):
        """
        Create custom visualizations based on trial history.

        Args:
            viz_dir: Directory to save visualizations
        """
        if not self.trials_history:
            return

        timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")

        # Create quality vs speed plot
        self._create_quality_vs_speed_plot(viz_dir, timestamp)

        # Create parameter evolution plots
        self._create_parameter_evolution_plots(viz_dir, timestamp)

        # Create trial duration vs score plot
        self._create_duration_vs_score_plot(viz_dir, timestamp)

    def _create_quality_vs_speed_plot(self, viz_dir: str, timestamp: str):
        """Create a plot showing quality vs. speed trade-off."""
        if not self.trials_history:
            return

        # Extract data from successful trials
        successful_trials = [
            t
            for t in self.trials_history
            if t.get("result", {}).get("success", False)
        ]

        if not successful_trials:
            logger.warning("No successful trials for visualization")
            return

        try:
            plt.figure(figsize=(10, 8))

            # Extract metrics
            quality_scores = []
            speed_scores = []
            labels = []
            iterations_values = []
            questions_values = []

            for trial in successful_trials:
                result = trial["result"]
                quality = result.get("quality_score", 0)
                speed = result.get("speed_score", 0)
                iterations = trial["params"].get("iterations", 0)
                questions = trial["params"].get("questions_per_iteration", 0)

                quality_scores.append(quality)
                speed_scores.append(speed)
                labels.append(f"Trial {trial['trial_number']}")
                iterations_values.append(iterations)
                questions_values.append(questions)

            # Create scatter plot with size based on iterations*questions
            sizes = [
                i * q * 5
                for i, q in zip(
                    iterations_values, questions_values, strict=False
                )
            ]
            scatter = plt.scatter(
                quality_scores,
                speed_scores,
                s=sizes,
                alpha=0.7,
                c=range(len(quality_scores)),
                cmap="viridis",
            )

            # Highlight best trial
            best_trial = max(
                successful_trials,
                key=lambda x: x.get("result", {}).get("score", 0),
            )
            best_quality = best_trial["result"].get("quality_score", 0)
            best_speed = best_trial["result"].get("speed_score", 0)
            best_iter = best_trial["params"].get("iterations", 0)
            best_questions = best_trial["params"].get(
                "questions_per_iteration", 0
            )

            plt.scatter(
                [best_quality],
                [best_speed],
                s=200,
                facecolors="none",
                edgecolors="red",
                linewidth=2,
                label=f"Best: {best_iter}×{best_questions}",
            )

            # Add annotations for key points
            for i, (q, s, label) in enumerate(
                zip(quality_scores, speed_scores, labels, strict=False)
            ):
                if i % max(1, len(quality_scores) // 5) == 0:  # Label ~5 points
                    plt.annotate(
                        f"{iterations_values[i]}×{questions_values[i]}",
                        (q, s),
                        xytext=(5, 5),
                        textcoords="offset points",
                    )

            # Add colorbar and labels
            cbar = plt.colorbar(scatter)
            cbar.set_label("Trial Progression")

            # Add benchmark weight information
            weights_str = ", ".join(
                [f"{k}:{v:.1f}" for k, v in self.benchmark_weights.items()]
            )
            plt.title(
                f"Quality vs. Speed Trade-off\nBenchmark Weights: {weights_str}"
            )
            plt.xlabel("Quality Score (Benchmark Accuracy)")
            plt.ylabel("Speed Score")
            plt.grid(True, linestyle="--", alpha=0.7)

            # Add legend explaining marker size
            legend_elements = [
                Line2D(
                    [0],
                    [0],
                    marker="o",
                    color="w",
                    markerfacecolor="gray",
                    markersize=np.sqrt(n * 5 / np.pi),
                    label=f"{n} Total Questions",
                )
                for n in [5, 10, 15, 20, 25]
            ]
            plt.legend(handles=legend_elements, title="Workload")

            # Save the figure
            plt.tight_layout()
            plt.savefig(
                str(
                    Path(viz_dir)
                    / f"{self.study_name}_quality_vs_speed_{timestamp}.png"
                )
            )
            plt.close()
        except Exception as e:
            logger.exception(f"Error creating quality vs speed plot: {e!s}")

    def _create_parameter_evolution_plots(self, viz_dir: str, timestamp: str):
        """Create plots showing how parameter values evolve over trials."""
        try:
            successful_trials = [
                t
                for t in self.trials_history
                if t.get("result", {}).get("success", False)
            ]

            if not successful_trials or len(successful_trials) < 5:
                return

            # Get key parameters
            main_params = list(successful_trials[0]["params"].keys())

            # For each parameter, plot its values over trials
            for param_name in main_params:
                plt.figure(figsize=(12, 6))

                trial_numbers = []
                param_values = []
                scores = []

                for trial in self.trials_history:
                    if "params" in trial and param_name in trial["params"]:
                        trial_numbers.append(trial["trial_number"])
                        param_values.append(trial["params"][param_name])
                        scores.append(trial.get("score", 0))

                # Create evolution plot
                scatter = plt.scatter(
                    trial_numbers,
                    param_values,
                    c=scores,
                    cmap="plasma",
                    alpha=0.8,
                    s=80,
                )

                # Add best trial marker
                best_trial_idx = scores.index(max(scores))
                plt.scatter(
                    [trial_numbers[best_trial_idx]],
                    [param_values[best_trial_idx]],
                    s=150,
                    facecolors="none",
                    edgecolors="red",
                    linewidth=2,
                    label=f"Best Value: {param_values[best_trial_idx]}",
                )

                # Add colorbar
                cbar = plt.colorbar(scatter)
                cbar.set_label("Score")

                # Set chart properties
                plt.title(f"Evolution of {param_name} Values")
                plt.xlabel("Trial Number")
                plt.ylabel(param_name)
                plt.grid(True, linestyle="--", alpha=0.7)
                plt.legend()

                # For categorical parameters, adjust y-axis
                if isinstance(param_values[0], str):
                    unique_values = sorted(set(param_values))
                    plt.yticks(range(len(unique_values)), unique_values)

                # Save the figure
                plt.tight_layout()
                plt.savefig(
                    str(
                        Path(viz_dir)
                        / f"{self.study_name}_param_evolution_{param_name}_{timestamp}.png"
                    )
                )
                plt.close()
        except Exception as e:
            logger.exception(f"Error creating parameter evolution plots: {e!s}")

    def _create_duration_vs_score_plot(self, viz_dir: str, timestamp: str):
        """Create a plot showing trial duration vs score."""
        try:
            plt.figure(figsize=(10, 6))

            successful_trials = [
                t
                for t in self.trials_history
                if t.get("result", {}).get("success", False)
            ]

            if not successful_trials:
                return

            trial_durations = []
            trial_scores = []
            trial_iterations = []
            trial_questions = []

            for trial in successful_trials:
                duration = trial.get("duration", 0)
                score = trial.get("score", 0)
                iterations = trial.get("params", {}).get("iterations", 1)
                questions = trial.get("params", {}).get(
                    "questions_per_iteration", 1
                )

                trial_durations.append(duration)
                trial_scores.append(score)
                trial_iterations.append(iterations)
                trial_questions.append(questions)

            # Total questions per trial
            total_questions = [
                i * q
                for i, q in zip(trial_iterations, trial_questions, strict=False)
            ]

            # Create scatter plot with size based on total questions
            plt.scatter(
                trial_durations,
                trial_scores,
                s=[
                    q * 5 for q in total_questions
                ],  # Size based on total questions
                alpha=0.7,
                c=range(len(trial_durations)),
                cmap="viridis",
            )

            # Add labels
            plt.xlabel("Trial Duration (seconds)")
            plt.ylabel("Score")
            plt.title("Trial Duration vs. Score")
            plt.grid(True, linestyle="--", alpha=0.7)

            # Add trial number annotations for selected points
            for i, (d, s) in enumerate(
                zip(trial_durations, trial_scores, strict=False)
            ):
                if (
                    i % max(1, len(trial_durations) // 5) == 0
                ):  # Annotate ~5 points
                    plt.annotate(
                        f"{trial_iterations[i]}×{trial_questions[i]}",
                        (d, s),
                        xytext=(5, 5),
                        textcoords="offset points",
                    )

            # Save the figure
            plt.tight_layout()
            plt.savefig(
                str(
                    Path(viz_dir)
                    / f"{self.study_name}_duration_vs_score_{timestamp}.png"
                )
            )
            plt.close()
        except Exception as e:
            logger.exception(f"Error creating duration vs score plot: {e!s}")

def optimize_parameters(
    query: str,
    param_space: Optional[Dict[str, Any]] = None,
    output_dir: str = str(Path("data") / "optimization_results"),
    model_name: Optional[str] = None,
    provider: Optional[str] = None,
    search_tool: Optional[str] = None,
    temperature: float = 0.7,
    n_trials: int = 30,
    timeout: Optional[int] = None,
    n_jobs: int = 1,
    study_name: Optional[str] = None,
    optimization_metrics: Optional[List[str]] = None,
    metric_weights: Optional[Dict[str, float]] = None,
    progress_callback: Optional[Callable[[int, int, Dict], None]] = None,
    benchmark_weights: Optional[Dict[str, float]] = None,
) -> Tuple[Dict[str, Any], float]:
    """
    Optimize parameters for Local Deep Research.

    Args:
        query: The research query to use for all experiments
        param_space: Dictionary defining parameter search spaces (optional)
        output_dir: Directory to save optimization results
        model_name: Name of the LLM model to use
        provider: LLM provider
        search_tool: Search engine to use
        temperature: LLM temperature
        n_trials: Number of parameter combinations to try
        timeout: Maximum seconds to run optimization (None for no limit)
        n_jobs: Number of parallel jobs for optimization
        study_name: Name of the Optuna study
        optimization_metrics: List of metrics to optimize (default: ["quality", "speed"])
        metric_weights: Dictionary of weights for each metric (e.g., {"quality": 0.6, "speed": 0.4})
        progress_callback: Optional callback for progress updates
        benchmark_weights: Dictionary mapping benchmark types to weights
            (e.g., {"simpleqa": 0.6, "browsecomp": 0.4})
            If None, only SimpleQA is used with weight 1.0

    Returns:
        Tuple of (best_parameters, best_score)
    """
    # Create optimizer
    optimizer = OptunaOptimizer(
        base_query=query,
        output_dir=output_dir,
        model_name=model_name,
        provider=provider,
        search_tool=search_tool,
        temperature=temperature,
        n_trials=n_trials,
        timeout=timeout,
        n_jobs=n_jobs,
        study_name=study_name,
        optimization_metrics=optimization_metrics,
        metric_weights=metric_weights,
        progress_callback=progress_callback,
        benchmark_weights=benchmark_weights,
    )

    # Run optimization
    return optimizer.optimize(param_space)

def optimize_for_speed(
    query: str,
    n_trials: int = 20,
    output_dir: str = str(Path("data") / "optimization_results"),
    model_name: Optional[str] = None,
    provider: Optional[str] = None,
    search_tool: Optional[str] = None,
    progress_callback: Optional[Callable[[int, int, Dict], None]] = None,
    benchmark_weights: Optional[Dict[str, float]] = None,
) -> Tuple[Dict[str, Any], float]:
    """
    Optimize parameters with a focus on speed performance.

    Args:
        query: The research query to use for all experiments
        n_trials: Number of parameter combinations to try
        output_dir: Directory to save optimization results
        model_name: Name of the LLM model to use
        provider: LLM provider
        search_tool: Search engine to use
        progress_callback: Optional callback for progress updates
        benchmark_weights: Dictionary mapping benchmark types to weights
            (e.g., {"simpleqa": 0.6, "browsecomp": 0.4})
            If None, only SimpleQA is used with weight 1.0

    Returns:
        Tuple of (best_parameters, best_score)
    """
    # Focus on speed with reduced parameter space
    param_space = {
        "iterations": {
            "type": "int",
            "low": 1,
            "high": 3,
            "step": 1,
        },
        "questions_per_iteration": {
            "type": "int",
            "low": 1,
            "high": 3,
            "step": 1,
        },
        "search_strategy": {
            "type": "categorical",
            "choices": ["rapid", "parallel", "source_based"],
        },
    }

    # Speed-focused weights
    metric_weights = {"speed": 0.8, "quality": 0.2}

    return optimize_parameters(
        query=query,
        param_space=param_space,
        output_dir=output_dir,
        model_name=model_name,
        provider=provider,
        search_tool=search_tool,
        n_trials=n_trials,
        metric_weights=metric_weights,
        optimization_metrics=["speed", "quality"],
        progress_callback=progress_callback,
        benchmark_weights=benchmark_weights,
    )


def optimize_for_quality(
    query: str,
    n_trials: int = 30,
    output_dir: str = str(Path("data") / "optimization_results"),
    model_name: Optional[str] = None,
    provider: Optional[str] = None,
    search_tool: Optional[str] = None,
    progress_callback: Optional[Callable[[int, int, Dict], None]] = None,
    benchmark_weights: Optional[Dict[str, float]] = None,
) -> Tuple[Dict[str, Any], float]:
    """
    Optimize parameters with a focus on result quality.

    Args:
        query: The research query to use for all experiments
        n_trials: Number of parameter combinations to try
        output_dir: Directory to save optimization results
        model_name: Name of the LLM model to use
        provider: LLM provider
        search_tool: Search engine to use
        progress_callback: Optional callback for progress updates
        benchmark_weights: Dictionary mapping benchmark types to weights
            (e.g., {"simpleqa": 0.6, "browsecomp": 0.4})
            If None, only SimpleQA is used with weight 1.0

    Returns:
        Tuple of (best_parameters, best_score)
    """
    # Quality-focused weights
    metric_weights = {"quality": 0.9, "speed": 0.1}

    return optimize_parameters(
        query=query,
        output_dir=output_dir,
        model_name=model_name,
        provider=provider,
        search_tool=search_tool,
        n_trials=n_trials,
        metric_weights=metric_weights,
        optimization_metrics=["quality", "speed"],
        progress_callback=progress_callback,
        benchmark_weights=benchmark_weights,
    )


def optimize_for_efficiency(
    query: str,
    n_trials: int = 25,
    output_dir: str = str(Path("data") / "optimization_results"),
    model_name: Optional[str] = None,
    provider: Optional[str] = None,
    search_tool: Optional[str] = None,
    progress_callback: Optional[Callable[[int, int, Dict], None]] = None,
    benchmark_weights: Optional[Dict[str, float]] = None,
) -> Tuple[Dict[str, Any], float]:
    """
    Optimize parameters with a focus on resource efficiency.

    Args:
        query: The research query to use for all experiments
        n_trials: Number of parameter combinations to try
        output_dir: Directory to save optimization results
        model_name: Name of the LLM model to use
        provider: LLM provider
        search_tool: Search engine to use
        progress_callback: Optional callback for progress updates
        benchmark_weights: Dictionary mapping benchmark types to weights
            (e.g., {"simpleqa": 0.6, "browsecomp": 0.4})
            If None, only SimpleQA is used with weight 1.0

    Returns:
        Tuple of (best_parameters, best_score)
    """
    # Balance of quality, speed and resource usage
    metric_weights = {"quality": 0.4, "speed": 0.3, "resource": 0.3}

    return optimize_parameters(
        query=query,
        output_dir=output_dir,
        model_name=model_name,
        provider=provider,
        search_tool=search_tool,
        n_trials=n_trials,
        metric_weights=metric_weights,
        optimization_metrics=["quality", "speed", "resource"],
        progress_callback=progress_callback,
        benchmark_weights=benchmark_weights,
    )
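

# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal example of how the convenience API above might be invoked, assuming
# Local Deep Research and its benchmark dependencies are installed and
# configured. The callback follows the documented Callable[[int, int, Dict],
# None] contract; the query string and trial count are arbitrary examples.
#
# def report_progress(done: int, total: int, info: dict) -> None:
#     print(f"[{done}/{total}] {info.get('status')} - {info.get('stage')}")
#
# best_params, best_score = optimize_parameters(
#     query="What are the health effects of intermittent fasting?",
#     n_trials=10,
#     metric_weights={"quality": 0.7, "speed": 0.3},
#     progress_callback=report_progress,
# )
# print(best_params, best_score)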