Coverage for src / local_deep_research / benchmarks / optimization / optuna_optimizer.py: 65%

346 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-02-25 01:07 +0000

1""" 

2Optuna-based parameter optimizer for Local Deep Research. 

3 

4This module provides the core optimization functionality using Optuna 

5to find optimal parameters for the research system, balancing quality 

6and performance metrics. 

7""" 

8 

9import os 

10from pathlib import Path 

11import time 

12from datetime import datetime, UTC 

13from functools import partial 

14from typing import Any, Callable, Dict, List, Optional, Tuple 

15 

16import joblib 

17import numpy as np 

18import optuna 

19from optuna.visualization import ( 

20 plot_contour, 

21 plot_optimization_history, 

22 plot_param_importances, 

23 plot_slice, 

24) 

25 

26from local_deep_research.benchmarks.efficiency.speed_profiler import ( 

27 SpeedProfiler, 

28) 

29from local_deep_research.security import sanitize_data 

30from loguru import logger 

31 

32from local_deep_research.benchmarks.evaluators import ( 

33 CompositeBenchmarkEvaluator, 

34) 

35 

36# Import benchmark evaluator components 

37 

38# Try to import visualization libraries, but don't fail if not available 

# Try to import visualization libraries, but don't fail if not available
try:
    import matplotlib.pyplot as plt
    from matplotlib.lines import Line2D

    # We'll use matplotlib for plotting visualization results
    PLOTTING_AVAILABLE = True
except ImportError:
    # Visualization methods check this flag and degrade gracefully
    # (plots are skipped with a warning instead of crashing).
    PLOTTING_AVAILABLE = False
    logger.warning("Matplotlib not available, visualization will be limited")

49 

50 

class OptunaOptimizer:
    """
    Optimize parameters for Local Deep Research using Optuna.

    This class provides functionality to:
    1. Define search spaces for parameter optimization
    2. Evaluate parameter combinations using objective functions
    3. Find optimal parameters via Optuna
    4. Visualize and analyze optimization results

    Results (trial history, best parameters, pickled study, plots) are
    written under ``output_dir``; see ``_save_results`` and the
    ``_create_*`` visualization helpers.
    """

61 

62 def __init__( 

63 self, 

64 base_query: str, 

65 output_dir: str = "optimization_results", 

66 model_name: Optional[str] = None, 

67 provider: Optional[str] = None, 

68 search_tool: Optional[str] = None, 

69 temperature: float = 0.7, 

70 n_trials: int = 30, 

71 timeout: Optional[int] = None, 

72 n_jobs: int = 1, 

73 study_name: Optional[str] = None, 

74 optimization_metrics: Optional[List[str]] = None, 

75 metric_weights: Optional[Dict[str, float]] = None, 

76 progress_callback: Optional[Callable[[int, int, Dict], None]] = None, 

77 benchmark_weights: Optional[Dict[str, float]] = None, 

78 ): 

79 """ 

80 Initialize the optimizer. 

81 

82 Args: 

83 base_query: The research query to use for all experiments 

84 output_dir: Directory to save optimization results 

85 model_name: Name of the LLM model to use 

86 provider: LLM provider 

87 search_tool: Search engine to use 

88 temperature: LLM temperature 

89 n_trials: Number of parameter combinations to try 

90 timeout: Maximum seconds to run optimization (None for no limit) 

91 n_jobs: Number of parallel jobs for optimization 

92 study_name: Name of the Optuna study 

93 optimization_metrics: List of metrics to optimize (default: ["quality", "speed"]) 

94 metric_weights: Dictionary of weights for each metric (e.g., {"quality": 0.6, "speed": 0.4}) 

95 progress_callback: Optional callback for progress updates 

96 benchmark_weights: Dictionary mapping benchmark types to weights 

97 (e.g., {"simpleqa": 0.6, "browsecomp": 0.4}) 

98 If None, only SimpleQA is used with weight 1.0 

99 """ 

100 self.base_query = base_query 

101 self.output_dir = output_dir 

102 self.model_name = model_name 

103 self.provider = provider 

104 self.search_tool = search_tool 

105 self.temperature = temperature 

106 self.n_trials = n_trials 

107 self.timeout = timeout 

108 self.n_jobs = n_jobs 

109 self.optimization_metrics = optimization_metrics or ["quality", "speed"] 

110 self.metric_weights = metric_weights or {"quality": 0.6, "speed": 0.4} 

111 self.progress_callback = progress_callback 

112 

113 # Initialize benchmark evaluator with weights 

114 self.benchmark_weights = benchmark_weights or {"simpleqa": 1.0} 

115 self.benchmark_evaluator = CompositeBenchmarkEvaluator( 

116 self.benchmark_weights 

117 ) 

118 

119 # Normalize weights to sum to 1.0 

120 total_weight = sum(self.metric_weights.values()) 

121 if total_weight > 0: 121 ↛ 127line 121 didn't jump to line 127 because the condition on line 121 was always true

122 self.metric_weights = { 

123 k: v / total_weight for k, v in self.metric_weights.items() 

124 } 

125 

126 # Generate a unique study name if not provided 

127 timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S") 

128 self.study_name = study_name or f"ldr_opt_{timestamp}" 

129 

130 # Create output directory 

131 os.makedirs(output_dir, exist_ok=True) 

132 

133 # Store the trial history for analysis 

134 self.trials_history = [] 

135 

136 # Storage for the best parameters and study 

137 self.best_params = None 

138 self.study = None 

139 

140 def optimize( 

141 self, param_space: Optional[Dict[str, Any]] = None 

142 ) -> Tuple[Dict[str, Any], float]: 

143 """ 

144 Run the optimization process using Optuna. 

145 

146 Args: 

147 param_space: Dictionary defining parameter search spaces 

148 (if None, use default spaces) 

149 

150 Returns: 

151 Tuple containing (best_parameters, best_score) 

152 """ 

153 param_space = param_space or self._get_default_param_space() 

154 

155 # Create a study object 

156 storage_name = f"sqlite:///{self.output_dir}/{self.study_name}.db" 

157 self.study = optuna.create_study( 

158 study_name=self.study_name, 

159 storage=storage_name, 

160 load_if_exists=True, 

161 direction="maximize", 

162 sampler=optuna.samplers.TPESampler(seed=42), 

163 ) 

164 

165 # Create partial function with param_space 

166 objective = partial(self._objective, param_space=param_space) 

167 

168 # Log optimization start 

169 logger.info( 

170 f"Starting optimization with {self.n_trials} trials, {self.n_jobs} parallel jobs" 

171 ) 

172 logger.info(f"Parameter space: {param_space}") 

173 logger.info(f"Metric weights: {self.metric_weights}") 

174 logger.info(f"Benchmark weights: {self.benchmark_weights}") 

175 

176 # Initialize progress tracking 

177 if self.progress_callback: 177 ↛ 178line 177 didn't jump to line 178 because the condition on line 177 was never true

178 self.progress_callback( 

179 0, 

180 self.n_trials, 

181 { 

182 "status": "starting", 

183 "stage": "initialization", 

184 "trials_completed": 0, 

185 "total_trials": self.n_trials, 

186 }, 

187 ) 

188 

189 try: 

190 # Run optimization 

191 self.study.optimize( 

192 objective, 

193 n_trials=self.n_trials, 

194 timeout=self.timeout, 

195 n_jobs=self.n_jobs, 

196 callbacks=[self._optimization_callback], 

197 show_progress_bar=True, 

198 ) 

199 

200 # Store best parameters 

201 self.best_params = self.study.best_params 

202 

203 # Save the results 

204 self._save_results() 

205 

206 # Create visualizations 

207 self._create_visualizations() 

208 

209 logger.info( 

210 f"Optimization complete. Best parameters: {self.best_params}" 

211 ) 

212 logger.info(f"Best value: {self.study.best_value}") 

213 

214 # Report completion 

215 if self.progress_callback: 215 ↛ 216line 215 didn't jump to line 216 because the condition on line 215 was never true

216 self.progress_callback( 

217 self.n_trials, 

218 self.n_trials, 

219 { 

220 "status": "completed", 

221 "stage": "finished", 

222 "trials_completed": len(self.study.trials), 

223 "total_trials": self.n_trials, 

224 "best_params": self.best_params, 

225 "best_value": self.study.best_value, 

226 }, 

227 ) 

228 

229 return self.best_params, self.study.best_value 

230 

231 except KeyboardInterrupt: 

232 logger.info("Optimization interrupted by user") 

233 # Still save what we have 

234 self._save_results() 

235 self._create_visualizations() 

236 

237 # Report interruption 

238 if self.progress_callback: 

239 self.progress_callback( 

240 len(self.study.trials), 

241 self.n_trials, 

242 { 

243 "status": "interrupted", 

244 "stage": "interrupted", 

245 "trials_completed": len(self.study.trials), 

246 "total_trials": self.n_trials, 

247 "best_params": self.study.best_params, 

248 "best_value": self.study.best_value, 

249 }, 

250 ) 

251 

252 return self.study.best_params, self.study.best_value 

253 

254 def _get_default_param_space(self) -> Dict[str, Any]: 

255 """ 

256 Get default parameter search space. 

257 

258 Returns: 

259 Dictionary defining the default parameter search spaces 

260 """ 

261 return { 

262 "iterations": { 

263 "type": "int", 

264 "low": 1, 

265 "high": 5, 

266 "step": 1, 

267 }, 

268 "questions_per_iteration": { 

269 "type": "int", 

270 "low": 1, 

271 "high": 5, 

272 "step": 1, 

273 }, 

274 "search_strategy": { 

275 "type": "categorical", 

276 "choices": [ 

277 "iterdrag", 

278 "standard", 

279 "rapid", 

280 "parallel", 

281 "source_based", 

282 ], 

283 }, 

284 "max_results": { 

285 "type": "int", 

286 "low": 10, 

287 "high": 100, 

288 "step": 10, 

289 }, 

290 } 

291 

292 def _objective( 

293 self, trial: optuna.Trial, param_space: Dict[str, Any] 

294 ) -> float: 

295 """ 

296 Objective function for Optuna optimization. 

297 

298 Args: 

299 trial: Optuna trial object 

300 param_space: Dictionary defining parameter search spaces 

301 

302 Returns: 

303 Score to maximize 

304 """ 

305 # Generate parameters for this trial 

306 params = {} 

307 for param_name, param_config in param_space.items(): 

308 param_type = param_config["type"] 

309 

310 if param_type == "int": 

311 params[param_name] = trial.suggest_int( 

312 param_name, 

313 param_config["low"], 

314 param_config["high"], 

315 step=param_config.get("step", 1), 

316 ) 

317 elif param_type == "float": 317 ↛ 318line 317 didn't jump to line 318 because the condition on line 317 was never true

318 params[param_name] = trial.suggest_float( 

319 param_name, 

320 param_config["low"], 

321 param_config["high"], 

322 step=param_config.get("step"), 

323 log=param_config.get("log", False), 

324 ) 

325 elif param_type == "categorical": 325 ↛ 307line 325 didn't jump to line 307 because the condition on line 325 was always true

326 params[param_name] = trial.suggest_categorical( 

327 param_name, param_config["choices"] 

328 ) 

329 

330 # Log the trial parameters 

331 logger.info(f"Trial {trial.number}: {params}") 

332 

333 # Update progress callback if available 

334 if self.progress_callback: 334 ↛ 335line 334 didn't jump to line 335 because the condition on line 334 was never true

335 self.progress_callback( 

336 trial.number, 

337 self.n_trials, 

338 { 

339 "status": "running", 

340 "stage": "trial_started", 

341 "trial_number": trial.number, 

342 "params": params, 

343 "trials_completed": trial.number, 

344 "total_trials": self.n_trials, 

345 }, 

346 ) 

347 

348 # Run an experiment with these parameters 

349 try: 

350 start_time = time.time() 

351 result = self._run_experiment(params) 

352 duration = time.time() - start_time 

353 

354 # Store details about the trial 

355 trial_info = { 

356 "trial_number": trial.number, 

357 "params": params, 

358 "result": result, 

359 "score": result.get("score", 0), 

360 "duration": duration, 

361 "timestamp": datetime.now(UTC).isoformat(), 

362 } 

363 self.trials_history.append(trial_info) 

364 

365 # Update callback with results 

366 if self.progress_callback: 366 ↛ 367line 366 didn't jump to line 367 because the condition on line 366 was never true

367 self.progress_callback( 

368 trial.number, 

369 self.n_trials, 

370 { 

371 "status": "completed", 

372 "stage": "trial_completed", 

373 "trial_number": trial.number, 

374 "params": params, 

375 "score": result.get("score", 0), 

376 "trials_completed": trial.number + 1, 

377 "total_trials": self.n_trials, 

378 }, 

379 ) 

380 

381 logger.info( 

382 f"Trial {trial.number} completed: {params}, score: {result['score']:.4f}" 

383 ) 

384 

385 return result["score"] 

386 except Exception as e: 

387 logger.exception(f"Error in trial {trial.number}") 

388 

389 # Update callback with error 

390 if self.progress_callback: 390 ↛ 391line 390 didn't jump to line 391 because the condition on line 390 was never true

391 self.progress_callback( 

392 trial.number, 

393 self.n_trials, 

394 { 

395 "status": "error", 

396 "stage": "trial_error", 

397 "trial_number": trial.number, 

398 "params": params, 

399 "error": str(e), 

400 "trials_completed": trial.number, 

401 "total_trials": self.n_trials, 

402 }, 

403 ) 

404 

405 return float("-inf") # Return a very low score for failed trials 

406 

407 def _run_experiment(self, params: Dict[str, Any]) -> Dict[str, Any]: 

408 """ 

409 Run a single experiment with the given parameters. 

410 

411 Args: 

412 params: Dictionary of parameters to test 

413 

414 Returns: 

415 Results dictionary with metrics and score 

416 """ 

417 # Extract parameters 

418 iterations = params.get("iterations", 2) 

419 questions_per_iteration = params.get("questions_per_iteration", 2) 

420 search_strategy = params.get("search_strategy", "iterdrag") 

421 max_results = params.get("max_results", 50) 

422 

423 # Initialize profiling tools 

424 speed_profiler = SpeedProfiler() 

425 

426 # Start profiling 

427 speed_profiler.start() 

428 

429 try: 

430 # Create system configuration 

431 system_config = { 

432 "iterations": iterations, 

433 "questions_per_iteration": questions_per_iteration, 

434 "search_strategy": search_strategy, 

435 "search_tool": self.search_tool, 

436 "max_results": max_results, 

437 "model_name": self.model_name, 

438 "provider": self.provider, 

439 } 

440 

441 # Evaluate quality using composite benchmark evaluator 

442 # Use a small number of examples for efficiency 

443 benchmark_dir = str(Path(self.output_dir) / "benchmark_temp") 

444 quality_results = self.benchmark_evaluator.evaluate( 

445 system_config=system_config, 

446 num_examples=5, # Small number for optimization efficiency 

447 output_dir=benchmark_dir, 

448 ) 

449 

450 # Stop timing 

451 speed_profiler.stop() 

452 timing_results = speed_profiler.get_summary() 

453 

454 # Extract key metrics 

455 quality_score = quality_results.get("quality_score", 0.0) 

456 benchmark_results = quality_results.get("benchmark_results", {}) 

457 

458 # Speed score: convert duration to a 0-1 score where faster is better 

459 # Using a reasonable threshold (e.g., 180 seconds for 5 examples) 

460 # Below this threshold: high score, above it: declining score 

461 total_duration = timing_results.get("total_duration", 180) 

462 speed_score = max(0.0, min(1.0, 1.0 - (total_duration - 60) / 180)) 

463 

464 # Calculate combined score based on weights 

465 combined_score = ( 

466 self.metric_weights.get("quality", 0.6) * quality_score 

467 + self.metric_weights.get("speed", 0.4) * speed_score 

468 ) 

469 

470 # Return streamlined results 

471 return { 

472 "quality_score": quality_score, 

473 "benchmark_results": benchmark_results, 

474 "speed_score": speed_score, 

475 "total_duration": total_duration, 

476 "score": combined_score, 

477 "success": True, 

478 } 

479 

480 except Exception as e: 

481 # Stop profiling on error 

482 speed_profiler.stop() 

483 

484 # Log error 

485 logger.exception("Error in experiment") 

486 

487 # Return error information 

488 return {"error": str(e), "score": 0.0, "success": False} 

489 

490 def _optimization_callback(self, study: optuna.Study, trial: optuna.Trial): 

491 """ 

492 Callback for the Optuna optimization process. 

493 

494 Args: 

495 study: Optuna study object 

496 trial: Current trial 

497 """ 

498 # Save intermediate results periodically 

499 if trial.number % 10 == 0 and trial.number > 0: 

500 self._save_results() 

501 self._create_quick_visualizations() 

502 

503 def _save_results(self): 

504 """Save the optimization results to disk.""" 

505 # Create a timestamp for filenames 

506 timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S") 

507 

508 # Save trial history 

509 from ...security.file_write_verifier import write_json_verified 

510 

511 history_file = str( 

512 Path(self.output_dir) / f"{self.study_name}_history.json" 

513 ) 

514 

515 # Convert numpy values to native Python types for JSON serialization 

516 clean_history = [] 

517 for trial in self.trials_history: 

518 clean_trial = {} 

519 for k, v in trial.items(): 

520 if isinstance(v, dict): 

521 clean_trial[k] = { 

522 dk: (float(dv) if isinstance(dv, np.number) else dv) 

523 for dk, dv in v.items() 

524 } 

525 elif isinstance(v, np.number): 525 ↛ 526line 525 didn't jump to line 526 because the condition on line 525 was never true

526 clean_trial[k] = float(v) 

527 else: 

528 clean_trial[k] = v 

529 clean_history.append(clean_trial) 

530 

531 # Sanitize sensitive data before writing to disk 

532 sanitized_history = sanitize_data(clean_history) 

533 

534 write_json_verified( 

535 history_file, 

536 sanitized_history, 

537 "benchmark.allow_file_output", 

538 context="optimization history", 

539 ) 

540 

541 # Save current best parameters 

542 if ( 542 ↛ 575line 542 didn't jump to line 575 because the condition on line 542 was always true

543 self.study 

544 and hasattr(self.study, "best_params") 

545 and self.study.best_params 

546 ): 

547 best_params_file = str( 

548 Path(self.output_dir) / f"{self.study_name}_best_params.json" 

549 ) 

550 

551 best_params_data = { 

552 "best_params": self.study.best_params, 

553 "best_value": float(self.study.best_value), 

554 "n_trials": len(self.study.trials), 

555 "timestamp": timestamp, 

556 "base_query": self.base_query, 

557 "model_name": self.model_name, 

558 "provider": self.provider, 

559 "search_tool": self.search_tool, 

560 "metric_weights": self.metric_weights, 

561 "benchmark_weights": self.benchmark_weights, 

562 } 

563 

564 # Sanitize sensitive data before writing to disk 

565 sanitized_best_params = sanitize_data(best_params_data) 

566 

567 write_json_verified( 

568 best_params_file, 

569 sanitized_best_params, 

570 "benchmark.allow_file_output", 

571 context="optimization best params", 

572 ) 

573 

574 # Save the Optuna study 

575 if self.study: 575 ↛ 581line 575 didn't jump to line 581 because the condition on line 575 was always true

576 study_file = str( 

577 Path(self.output_dir) / f"{self.study_name}_study.pkl" 

578 ) 

579 joblib.dump(self.study, study_file) 

580 

581 logger.info(f"Results saved to {self.output_dir}") 

582 

583 def _create_visualizations(self): 

584 """Create and save comprehensive visualizations of the optimization results.""" 

585 if not PLOTTING_AVAILABLE: 585 ↛ 586line 585 didn't jump to line 586 because the condition on line 585 was never true

586 logger.warning( 

587 "Matplotlib not available, skipping visualization creation" 

588 ) 

589 return 

590 

591 if not self.study or len(self.study.trials) < 2: 

592 logger.warning("Not enough trials to create visualizations") 

593 return 

594 

595 # Create directory for visualizations 

596 viz_dir = Path(self.output_dir) / "visualizations" 

597 viz_dir.mkdir(parents=True, exist_ok=True) 

598 viz_dir = str(viz_dir) 

599 

600 # Create Optuna visualizations 

601 self._create_optuna_visualizations(viz_dir) 

602 

603 # Create custom visualizations 

604 self._create_custom_visualizations(viz_dir) 

605 

606 logger.info(f"Visualizations saved to {viz_dir}") 

607 

608 def _create_quick_visualizations(self): 

609 """Create a smaller set of visualizations for intermediate progress.""" 

610 if ( 

611 not PLOTTING_AVAILABLE 

612 or not self.study 

613 or len(self.study.trials) < 2 

614 ): 

615 return 

616 

617 # Create directory for visualizations 

618 viz_dir = Path(self.output_dir) / "visualizations" 

619 viz_dir.mkdir(parents=True, exist_ok=True) 

620 viz_dir = str(viz_dir) 

621 

622 # Create optimization history only (faster than full visualization) 

623 try: 

624 fig = plot_optimization_history(self.study) 

625 fig.write_image( 

626 str( 

627 Path(viz_dir) 

628 / f"{self.study_name}_optimization_history_current.png" 

629 ) 

630 ) 

631 except Exception: 

632 logger.exception("Error creating optimization history plot") 

633 

634 def _create_optuna_visualizations(self, viz_dir: str): 

635 """ 

636 Create and save Optuna's built-in visualizations. 

637 

638 Args: 

639 viz_dir: Directory to save visualizations 

640 """ 

641 timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S") 

642 

643 # 1. Optimization history 

644 try: 

645 fig = plot_optimization_history(self.study) 

646 fig.write_image( 

647 str( 

648 Path(viz_dir) 

649 / f"{self.study_name}_optimization_history_{timestamp}.png" 

650 ) 

651 ) 

652 except Exception: 

653 logger.exception("Error creating optimization history plot") 

654 

655 # 2. Parameter importances 

656 try: 

657 fig = plot_param_importances(self.study) 

658 fig.write_image( 

659 str( 

660 Path(viz_dir) 

661 / f"{self.study_name}_param_importances_{timestamp}.png" 

662 ) 

663 ) 

664 except Exception: 

665 logger.exception("Error creating parameter importances plot") 

666 

667 # 3. Slice plot for each parameter 

668 try: 

669 for param_name in self.study.best_params.keys(): 

670 fig = plot_slice(self.study, [param_name]) 

671 fig.write_image( 

672 str( 

673 Path(viz_dir) 

674 / f"{self.study_name}_slice_{param_name}_{timestamp}.png" 

675 ) 

676 ) 

677 except Exception: 

678 logger.exception("Error creating slice plots") 

679 

680 # 4. Contour plots for important parameter pairs 

681 try: 

682 # Get all parameter names 

683 param_names = list(self.study.best_params.keys()) 

684 

685 # Create contour plots for each pair 

686 for i in range(len(param_names)): 

687 for j in range(i + 1, len(param_names)): 687 ↛ 688line 687 didn't jump to line 688 because the loop on line 687 never started

688 try: 

689 fig = plot_contour( 

690 self.study, params=[param_names[i], param_names[j]] 

691 ) 

692 fig.write_image( 

693 str( 

694 Path(viz_dir) 

695 / f"{self.study_name}_contour_{param_names[i]}_{param_names[j]}_{timestamp}.png" 

696 ) 

697 ) 

698 except Exception as e: 

699 logger.warning( 

700 f"Error creating contour plot for {param_names[i]} vs {param_names[j]}: {e!s}" 

701 ) 

702 except Exception: 

703 logger.exception("Error creating contour plots") 

704 

705 def _create_custom_visualizations(self, viz_dir: str): 

706 """ 

707 Create custom visualizations based on trial history. 

708 

709 Args: 

710 viz_dir: Directory to save visualizations 

711 """ 

712 if not self.trials_history: 712 ↛ 713line 712 didn't jump to line 713 because the condition on line 712 was never true

713 return 

714 

715 timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S") 

716 

717 # Create quality vs speed plot 

718 self._create_quality_vs_speed_plot(viz_dir, timestamp) 

719 

720 # Create parameter evolution plots 

721 self._create_parameter_evolution_plots(viz_dir, timestamp) 

722 

723 # Create trial duration vs score plot 

724 self._create_duration_vs_score_plot(viz_dir, timestamp) 

725 

726 def _create_quality_vs_speed_plot(self, viz_dir: str, timestamp: str): 

727 """Create a plot showing quality vs. speed trade-off.""" 

728 if not self.trials_history: 728 ↛ 729line 728 didn't jump to line 729 because the condition on line 728 was never true

729 return 

730 

731 # Extract data from successful trials 

732 successful_trials = [ 

733 t 

734 for t in self.trials_history 

735 if t.get("result", {}).get("success", False) 

736 ] 

737 

738 if not successful_trials: 738 ↛ 739line 738 didn't jump to line 739 because the condition on line 738 was never true

739 logger.warning("No successful trials for visualization") 

740 return 

741 

742 try: 

743 plt.figure(figsize=(10, 8)) 

744 

745 # Extract metrics 

746 quality_scores = [] 

747 speed_scores = [] 

748 labels = [] 

749 iterations_values = [] 

750 questions_values = [] 

751 

752 for trial in successful_trials: 752 ↛ 766line 752 didn't jump to line 766 because the loop on line 752 didn't complete

753 result = trial["result"] 

754 quality = result.get("quality_score", 0) 

755 speed = result.get("speed_score", 0) 

756 iterations = trial["params"].get("iterations", 0) 

757 questions = trial["params"].get("questions_per_iteration", 0) 

758 

759 quality_scores.append(quality) 

760 speed_scores.append(speed) 

761 labels.append(f"Trial {trial['trial_number']}") 

762 iterations_values.append(iterations) 

763 questions_values.append(questions) 

764 

765 # Create scatter plot with size based on iterations*questions 

766 sizes = [ 

767 i * q * 5 

768 for i, q in zip( 

769 iterations_values, questions_values, strict=False 

770 ) 

771 ] 

772 scatter = plt.scatter( 

773 quality_scores, 

774 speed_scores, 

775 s=sizes, 

776 alpha=0.7, 

777 c=range(len(quality_scores)), 

778 cmap="viridis", 

779 ) 

780 

781 # Highlight best trial 

782 best_trial = max( 

783 successful_trials, 

784 key=lambda x: x.get("result", {}).get("score", 0), 

785 ) 

786 best_quality = best_trial["result"].get("quality_score", 0) 

787 best_speed = best_trial["result"].get("speed_score", 0) 

788 best_iter = best_trial["params"].get("iterations", 0) 

789 best_questions = best_trial["params"].get( 

790 "questions_per_iteration", 0 

791 ) 

792 

793 plt.scatter( 

794 [best_quality], 

795 [best_speed], 

796 s=200, 

797 facecolors="none", 

798 edgecolors="red", 

799 linewidth=2, 

800 label=f"Best: {best_iter}×{best_questions}", 

801 ) 

802 

803 # Add annotations for key points 

804 for i, (q, s, label) in enumerate( 

805 zip(quality_scores, speed_scores, labels, strict=False) 

806 ): 

807 if i % max(1, len(quality_scores) // 5) == 0: # Label ~5 points 

808 plt.annotate( 

809 f"{iterations_values[i]}×{questions_values[i]}", 

810 (q, s), 

811 xytext=(5, 5), 

812 textcoords="offset points", 

813 ) 

814 

815 # Add colorbar and labels 

816 cbar = plt.colorbar(scatter) 

817 cbar.set_label("Trial Progression") 

818 

819 # Add benchmark weight information 

820 weights_str = ", ".join( 

821 [f"{k}:{v:.1f}" for k, v in self.benchmark_weights.items()] 

822 ) 

823 plt.title( 

824 f"Quality vs. Speed Trade-off\nBenchmark Weights: {weights_str}" 

825 ) 

826 plt.xlabel("Quality Score (Benchmark Accuracy)") 

827 plt.ylabel("Speed Score") 

828 plt.grid(True, linestyle="--", alpha=0.7) 

829 

830 # Add legend explaining size 

831 legend_elements = [ 

832 Line2D( 

833 [0], 

834 [0], 

835 marker="o", 

836 color="w", 

837 markerfacecolor="gray", 

838 markersize=np.sqrt(n * 5 / np.pi), 

839 label=f"{n} Total Questions", 

840 ) 

841 for n in [5, 10, 15, 20, 25] 

842 ] 

843 plt.legend(handles=legend_elements, title="Workload") 

844 

845 # Save the figure 

846 plt.tight_layout() 

847 plt.savefig( 

848 str( 

849 Path(viz_dir) 

850 / f"{self.study_name}_quality_vs_speed_{timestamp}.png" 

851 ) 

852 ) 

853 plt.close() 

854 except Exception: 

855 logger.exception("Error creating quality vs speed plot") 

856 

857 def _create_parameter_evolution_plots(self, viz_dir: str, timestamp: str): 

858 """Create plots showing how parameter values evolve over trials.""" 

859 try: 

860 successful_trials = [ 

861 t 

862 for t in self.trials_history 

863 if t.get("result", {}).get("success", False) 

864 ] 

865 

866 if not successful_trials or len(successful_trials) < 5: 866 ↛ 870line 866 didn't jump to line 870 because the condition on line 866 was always true

867 return 

868 

869 # Get key parameters 

870 main_params = list(successful_trials[0]["params"].keys()) 

871 

872 # For each parameter, plot its values over trials 

873 for param_name in main_params: 

874 plt.figure(figsize=(12, 6)) 

875 

876 trial_numbers = [] 

877 param_values = [] 

878 scores = [] 

879 

880 for trial in self.trials_history: 

881 if "params" in trial and param_name in trial["params"]: 

882 trial_numbers.append(trial["trial_number"]) 

883 param_values.append(trial["params"][param_name]) 

884 scores.append(trial.get("score", 0)) 

885 

886 # Create evolution plot 

887 scatter = plt.scatter( 

888 trial_numbers, 

889 param_values, 

890 c=scores, 

891 cmap="plasma", 

892 alpha=0.8, 

893 s=80, 

894 ) 

895 

896 # Add best trial marker 

897 best_trial_idx = scores.index(max(scores)) 

898 plt.scatter( 

899 [trial_numbers[best_trial_idx]], 

900 [param_values[best_trial_idx]], 

901 s=150, 

902 facecolors="none", 

903 edgecolors="red", 

904 linewidth=2, 

905 label=f"Best Value: {param_values[best_trial_idx]}", 

906 ) 

907 

908 # Add colorbar 

909 cbar = plt.colorbar(scatter) 

910 cbar.set_label("Score") 

911 

912 # Set chart properties 

913 plt.title(f"Evolution of {param_name} Values") 

914 plt.xlabel("Trial Number") 

915 plt.ylabel(param_name) 

916 plt.grid(True, linestyle="--", alpha=0.7) 

917 plt.legend() 

918 

919 # For categorical parameters, adjust y-axis 

920 if isinstance(param_values[0], str): 

921 unique_values = sorted(set(param_values)) 

922 plt.yticks(range(len(unique_values)), unique_values) 

923 

924 # Save the figure 

925 plt.tight_layout() 

926 plt.savefig( 

927 str( 

928 Path(viz_dir) 

929 / f"{self.study_name}_param_evolution_{param_name}_{timestamp}.png" 

930 ) 

931 ) 

932 plt.close() 

933 except Exception: 

934 logger.exception("Error creating parameter evolution plots") 

935 

936 def _create_duration_vs_score_plot(self, viz_dir: str, timestamp: str): 

937 """Create a plot showing trial duration vs score.""" 

938 try: 

939 plt.figure(figsize=(10, 6)) 

940 

941 successful_trials = [ 

942 t 

943 for t in self.trials_history 

944 if t.get("result", {}).get("success", False) 

945 ] 

946 

947 if not successful_trials: 947 ↛ 948line 947 didn't jump to line 948 because the condition on line 947 was never true

948 return 

949 

950 trial_durations = [] 

951 trial_scores = [] 

952 trial_iterations = [] 

953 trial_questions = [] 

954 

955 for trial in successful_trials: 

956 duration = trial.get("duration", 0) 

957 score = trial.get("score", 0) 

958 iterations = trial.get("params", {}).get("iterations", 1) 

959 questions = trial.get("params", {}).get( 

960 "questions_per_iteration", 1 

961 ) 

962 

963 trial_durations.append(duration) 

964 trial_scores.append(score) 

965 trial_iterations.append(iterations) 

966 trial_questions.append(questions) 

967 

968 # Total questions per trial 

969 total_questions = [ 

970 i * q 

971 for i, q in zip(trial_iterations, trial_questions, strict=False) 

972 ] 

973 

974 # Create scatter plot with size based on total questions 

975 plt.scatter( 

976 trial_durations, 

977 trial_scores, 

978 s=[ 

979 q * 5 for q in total_questions 

980 ], # Size based on total questions 

981 alpha=0.7, 

982 c=range(len(trial_durations)), 

983 cmap="viridis", 

984 ) 

985 

986 # Add labels 

987 plt.xlabel("Trial Duration (seconds)") 

988 plt.ylabel("Score") 

989 plt.title("Trial Duration vs. Score") 

990 plt.grid(True, linestyle="--", alpha=0.7) 

991 

992 # Add trial number annotations for selected points 

993 for i, (d, s) in enumerate( 

994 zip(trial_durations, trial_scores, strict=False) 

995 ): 

996 if ( 996 ↛ 993line 996 didn't jump to line 993 because the condition on line 996 was always true

997 i % max(1, len(trial_durations) // 5) == 0 

998 ): # Annotate ~5 points 

999 plt.annotate( 

1000 f"{trial_iterations[i]}×{trial_questions[i]}", 

1001 (d, s), 

1002 xytext=(5, 5), 

1003 textcoords="offset points", 

1004 ) 

1005 

1006 # Save the figure 

1007 plt.tight_layout() 

1008 plt.savefig( 

1009 str( 

1010 Path(viz_dir) 

1011 / f"{self.study_name}_duration_vs_score_{timestamp}.png" 

1012 ) 

1013 ) 

1014 plt.close() 

1015 except Exception: 

1016 logger.exception("Error creating duration vs score plot") 

1017 

1018 

def optimize_parameters(
    query: str,
    param_space: Optional[Dict[str, Any]] = None,
    output_dir: str = str(Path("data") / "optimization_results"),
    model_name: Optional[str] = None,
    provider: Optional[str] = None,
    search_tool: Optional[str] = None,
    temperature: float = 0.7,
    n_trials: int = 30,
    timeout: Optional[int] = None,
    n_jobs: int = 1,
    study_name: Optional[str] = None,
    optimization_metrics: Optional[List[str]] = None,
    metric_weights: Optional[Dict[str, float]] = None,
    progress_callback: Optional[Callable[[int, int, Dict], None]] = None,
    benchmark_weights: Optional[Dict[str, float]] = None,
) -> Tuple[Dict[str, Any], float]:
    """
    Run an Optuna parameter search for Local Deep Research.

    Convenience wrapper: builds an :class:`OptunaOptimizer` from the given
    settings and runs its ``optimize`` method over ``param_space``.

    Args:
        query: The research query to use for all experiments
        param_space: Dictionary defining parameter search spaces (optional)
        output_dir: Directory to save optimization results
        model_name: Name of the LLM model to use
        provider: LLM provider
        search_tool: Search engine to use
        temperature: LLM temperature
        n_trials: Number of parameter combinations to try
        timeout: Maximum seconds to run optimization (None for no limit)
        n_jobs: Number of parallel jobs for optimization
        study_name: Name of the Optuna study
        optimization_metrics: List of metrics to optimize (default: ["quality", "speed"])
        metric_weights: Dictionary of weights for each metric (e.g., {"quality": 0.6, "speed": 0.4})
        progress_callback: Optional callback for progress updates
        benchmark_weights: Dictionary mapping benchmark types to weights
                         (e.g., {"simpleqa": 0.6, "browsecomp": 0.4})
                         If None, only SimpleQA is used with weight 1.0

    Returns:
        Tuple of (best_parameters, best_score)
    """
    # Collect optimizer settings in one place, then construct and run.
    optimizer_settings = {
        "base_query": query,
        "output_dir": output_dir,
        "model_name": model_name,
        "provider": provider,
        "search_tool": search_tool,
        "temperature": temperature,
        "n_trials": n_trials,
        "timeout": timeout,
        "n_jobs": n_jobs,
        "study_name": study_name,
        "optimization_metrics": optimization_metrics,
        "metric_weights": metric_weights,
        "progress_callback": progress_callback,
        "benchmark_weights": benchmark_weights,
    }
    optimizer = OptunaOptimizer(**optimizer_settings)
    return optimizer.optimize(param_space)

1081 

1082 

def optimize_for_speed(
    query: str,
    n_trials: int = 20,
    output_dir: str = str(Path("data") / "optimization_results"),
    model_name: Optional[str] = None,
    provider: Optional[str] = None,
    search_tool: Optional[str] = None,
    progress_callback: Optional[Callable[[int, int, Dict], None]] = None,
    benchmark_weights: Optional[Dict[str, float]] = None,
) -> Tuple[Dict[str, Any], float]:
    """
    Run a speed-biased parameter optimization.

    Uses a deliberately small search space (few iterations/questions, fast
    search strategies) and weights speed at 0.8 vs. quality at 0.2 before
    delegating to :func:`optimize_parameters`.

    Args:
        query: The research query to use for all experiments
        n_trials: Number of parameter combinations to try
        output_dir: Directory to save optimization results
        model_name: Name of the LLM model to use
        provider: LLM provider
        search_tool: Search engine to use
        progress_callback: Optional callback for progress updates
        benchmark_weights: Dictionary mapping benchmark types to weights
                         (e.g., {"simpleqa": 0.6, "browsecomp": 0.4})
                         If None, only SimpleQA is used with weight 1.0

    Returns:
        Tuple of (best_parameters, best_score)
    """

    def _small_int_range(low: int, high: int) -> Dict[str, Any]:
        # Compact integer search-space spec (step of 1).
        return {"type": "int", "low": low, "high": high, "step": 1}

    # Reduced parameter space focused on fast configurations.
    param_space = {
        "iterations": _small_int_range(1, 3),
        "questions_per_iteration": _small_int_range(1, 3),
        "search_strategy": {
            "type": "categorical",
            "choices": ["rapid", "parallel", "source_based"],
        },
    }

    return optimize_parameters(
        query=query,
        param_space=param_space,
        output_dir=output_dir,
        model_name=model_name,
        provider=provider,
        search_tool=search_tool,
        n_trials=n_trials,
        # Speed-focused weighting: speed dominates, quality is a tiebreaker.
        metric_weights={"speed": 0.8, "quality": 0.2},
        optimization_metrics=["speed", "quality"],
        progress_callback=progress_callback,
        benchmark_weights=benchmark_weights,
    )

1147 

1148 

def optimize_for_quality(
    query: str,
    n_trials: int = 30,
    output_dir: str = str(Path("data") / "optimization_results"),
    model_name: Optional[str] = None,
    provider: Optional[str] = None,
    search_tool: Optional[str] = None,
    progress_callback: Optional[Callable[[int, int, Dict], None]] = None,
    benchmark_weights: Optional[Dict[str, float]] = None,
) -> Tuple[Dict[str, Any], float]:
    """
    Run a quality-biased parameter optimization.

    Weights quality at 0.9 vs. speed at 0.1 and delegates to
    :func:`optimize_parameters` with the default search space.

    Args:
        query: The research query to use for all experiments
        n_trials: Number of parameter combinations to try
        output_dir: Directory to save optimization results
        model_name: Name of the LLM model to use
        provider: LLM provider
        search_tool: Search engine to use
        progress_callback: Optional callback for progress updates
        benchmark_weights: Dictionary mapping benchmark types to weights
                         (e.g., {"simpleqa": 0.6, "browsecomp": 0.4})
                         If None, only SimpleQA is used with weight 1.0

    Returns:
        Tuple of (best_parameters, best_score)
    """
    # Quality dominates the objective; speed acts only as a tiebreaker.
    return optimize_parameters(
        query=query,
        output_dir=output_dir,
        model_name=model_name,
        provider=provider,
        search_tool=search_tool,
        n_trials=n_trials,
        metric_weights={"quality": 0.9, "speed": 0.1},
        optimization_metrics=["quality", "speed"],
        progress_callback=progress_callback,
        benchmark_weights=benchmark_weights,
    )

1192 

1193 

def optimize_for_efficiency(
    query: str,
    n_trials: int = 25,
    output_dir: str = str(Path("data") / "optimization_results"),
    model_name: Optional[str] = None,
    provider: Optional[str] = None,
    search_tool: Optional[str] = None,
    progress_callback: Optional[Callable[[int, int, Dict], None]] = None,
    benchmark_weights: Optional[Dict[str, float]] = None,
) -> Tuple[Dict[str, Any], float]:
    """
    Run a resource-efficiency-biased parameter optimization.

    Balances quality (0.4), speed (0.3), and resource usage (0.3), then
    delegates to :func:`optimize_parameters` with the default search space.

    Args:
        query: The research query to use for all experiments
        n_trials: Number of parameter combinations to try
        output_dir: Directory to save optimization results
        model_name: Name of the LLM model to use
        provider: LLM provider
        search_tool: Search engine to use
        progress_callback: Optional callback for progress updates
        benchmark_weights: Dictionary mapping benchmark types to weights
                         (e.g., {"simpleqa": 0.6, "browsecomp": 0.4})
                         If None, only SimpleQA is used with weight 1.0

    Returns:
        Tuple of (best_parameters, best_score)
    """
    # Three-way balance: quality is weighted slightly above speed/resource.
    efficiency_weights = {"quality": 0.4, "speed": 0.3, "resource": 0.3}
    return optimize_parameters(
        query=query,
        output_dir=output_dir,
        model_name=model_name,
        provider=provider,
        search_tool=search_tool,
        n_trials=n_trials,
        metric_weights=efficiency_weights,
        optimization_metrics=["quality", "speed", "resource"],
        progress_callback=progress_callback,
        benchmark_weights=benchmark_weights,
    )