Coverage for src/local_deep_research/benchmarks/optimization/optuna_optimizer.py: 9% (346 statements)
coverage.py v7.12.0, created at 2026-01-11 00:51 +0000
1"""
2Optuna-based parameter optimizer for Local Deep Research.
4This module provides the core optimization functionality using Optuna
5to find optimal parameters for the research system, balancing quality
6and performance metrics.
7"""
9import os
10from pathlib import Path
11import time
12from datetime import datetime, UTC
13from functools import partial
14from typing import Any, Callable, Dict, List, Optional, Tuple
16import joblib
17import numpy as np
18import optuna
19from optuna.visualization import (
20 plot_contour,
21 plot_optimization_history,
22 plot_param_importances,
23 plot_slice,
24)
26from local_deep_research.benchmarks.efficiency.speed_profiler import (
27 SpeedProfiler,
28)
29from local_deep_research.security import sanitize_data
30from loguru import logger
32from local_deep_research.benchmarks.evaluators import (
33 CompositeBenchmarkEvaluator,
34)
36# Import benchmark evaluator components
38# Try to import visualization libraries, but don't fail if not available
39try:
40 import matplotlib.pyplot as plt
41 from matplotlib.lines import Line2D
43 # We'll use matplotlib for plotting visualization results
45 PLOTTING_AVAILABLE = True
46except ImportError:
47 PLOTTING_AVAILABLE = False
48 logger.warning("Matplotlib not available, visualization will be limited")
51class OptunaOptimizer:
52 """
53 Optimize parameters for Local Deep Research using Optuna.
55 This class provides functionality to:
56 1. Define search spaces for parameter optimization
57 2. Evaluate parameter combinations using objective functions
58 3. Find optimal parameters via Optuna
59 4. Visualize and analyze optimization results
60 """
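
    # A minimal usage sketch (illustrative only, not executed here): construct the
    # optimizer with a query and the settings below, then call optimize(). The model,
    # provider, and search tool names are placeholders; substitute whatever your
    # deployment actually uses.
    #
    #     optimizer = OptunaOptimizer(
    #         base_query="Impact of solid-state batteries on EV range",  # example query
    #         model_name="gpt-4o-mini",          # placeholder model name
    #         provider="openai",                 # placeholder provider
    #         search_tool="searxng",             # placeholder search engine
    #         n_trials=10,
    #     )
    #     best_params, best_score = optimizer.optimize()
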
    def __init__(
        self,
        base_query: str,
        output_dir: str = "optimization_results",
        model_name: Optional[str] = None,
        provider: Optional[str] = None,
        search_tool: Optional[str] = None,
        temperature: float = 0.7,
        n_trials: int = 30,
        timeout: Optional[int] = None,
        n_jobs: int = 1,
        study_name: Optional[str] = None,
        optimization_metrics: Optional[List[str]] = None,
        metric_weights: Optional[Dict[str, float]] = None,
        progress_callback: Optional[Callable[[int, int, Dict], None]] = None,
        benchmark_weights: Optional[Dict[str, float]] = None,
    ):
        """
        Initialize the optimizer.

        Args:
            base_query: The research query to use for all experiments
            output_dir: Directory to save optimization results
            model_name: Name of the LLM model to use
            provider: LLM provider
            search_tool: Search engine to use
            temperature: LLM temperature
            n_trials: Number of parameter combinations to try
            timeout: Maximum seconds to run optimization (None for no limit)
            n_jobs: Number of parallel jobs for optimization
            study_name: Name of the Optuna study
            optimization_metrics: List of metrics to optimize (default: ["quality", "speed"])
            metric_weights: Dictionary of weights for each metric (e.g., {"quality": 0.6, "speed": 0.4})
            progress_callback: Optional callback for progress updates
            benchmark_weights: Dictionary mapping benchmark types to weights
                (e.g., {"simpleqa": 0.6, "browsecomp": 0.4}).
                If None, only SimpleQA is used with weight 1.0.
        """
        self.base_query = base_query
        self.output_dir = output_dir
        self.model_name = model_name
        self.provider = provider
        self.search_tool = search_tool
        self.temperature = temperature
        self.n_trials = n_trials
        self.timeout = timeout
        self.n_jobs = n_jobs
        self.optimization_metrics = optimization_metrics or ["quality", "speed"]
        self.metric_weights = metric_weights or {"quality": 0.6, "speed": 0.4}
        self.progress_callback = progress_callback

        # Initialize benchmark evaluator with weights
        self.benchmark_weights = benchmark_weights or {"simpleqa": 1.0}
        self.benchmark_evaluator = CompositeBenchmarkEvaluator(
            self.benchmark_weights
        )

        # Normalize weights to sum to 1.0
        total_weight = sum(self.metric_weights.values())
        if total_weight > 0:
            self.metric_weights = {
                k: v / total_weight for k, v in self.metric_weights.items()
            }
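        # For example, metric_weights={"quality": 3, "speed": 1} is rescaled here to
        # {"quality": 0.75, "speed": 0.25}; weights that already sum to 1.0 pass
        # through unchanged.
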
        # Generate a unique study name if not provided
        timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
        self.study_name = study_name or f"ldr_opt_{timestamp}"

        # Create output directory
        os.makedirs(output_dir, exist_ok=True)

        # Store the trial history for analysis
        self.trials_history = []

        # Storage for the best parameters and study
        self.best_params = None
        self.study = None

    def optimize(
        self, param_space: Optional[Dict[str, Any]] = None
    ) -> Tuple[Dict[str, Any], float]:
        """
        Run the optimization process using Optuna.

        Args:
            param_space: Dictionary defining parameter search spaces
                (if None, use default spaces)

        Returns:
            Tuple containing (best_parameters, best_score)
        """
        param_space = param_space or self._get_default_param_space()

        # Create a study object
        storage_name = f"sqlite:///{self.output_dir}/{self.study_name}.db"
        self.study = optuna.create_study(
            study_name=self.study_name,
            storage=storage_name,
            load_if_exists=True,
            direction="maximize",
            sampler=optuna.samplers.TPESampler(seed=42),
        )
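        # Because the study is stored in SQLite and created with load_if_exists=True,
        # re-running with the same study_name and output_dir should resume the
        # existing study rather than start over (standard Optuna behaviour; pass an
        # explicit study_name if you want to rely on this).
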
        # Create partial function with param_space
        objective = partial(self._objective, param_space=param_space)

        # Log optimization start
        logger.info(
            f"Starting optimization with {self.n_trials} trials, {self.n_jobs} parallel jobs"
        )
        logger.info(f"Parameter space: {param_space}")
        logger.info(f"Metric weights: {self.metric_weights}")
        logger.info(f"Benchmark weights: {self.benchmark_weights}")

        # Initialize progress tracking
        if self.progress_callback:
            self.progress_callback(
                0,
                self.n_trials,
                {
                    "status": "starting",
                    "stage": "initialization",
                    "trials_completed": 0,
                    "total_trials": self.n_trials,
                },
            )

        try:
            # Run optimization
            self.study.optimize(
                objective,
                n_trials=self.n_trials,
                timeout=self.timeout,
                n_jobs=self.n_jobs,
                callbacks=[self._optimization_callback],
                show_progress_bar=True,
            )

            # Store best parameters
            self.best_params = self.study.best_params

            # Save the results
            self._save_results()

            # Create visualizations
            self._create_visualizations()

            logger.info(
                f"Optimization complete. Best parameters: {self.best_params}"
            )
            logger.info(f"Best value: {self.study.best_value}")

            # Report completion
            if self.progress_callback:
                self.progress_callback(
                    self.n_trials,
                    self.n_trials,
                    {
                        "status": "completed",
                        "stage": "finished",
                        "trials_completed": len(self.study.trials),
                        "total_trials": self.n_trials,
                        "best_params": self.best_params,
                        "best_value": self.study.best_value,
                    },
                )

            return self.best_params, self.study.best_value

        except KeyboardInterrupt:
            logger.info("Optimization interrupted by user")
            # Still save what we have
            self._save_results()
            self._create_visualizations()

            # Report interruption
            if self.progress_callback:
                self.progress_callback(
                    len(self.study.trials),
                    self.n_trials,
                    {
                        "status": "interrupted",
                        "stage": "interrupted",
                        "trials_completed": len(self.study.trials),
                        "total_trials": self.n_trials,
                        "best_params": self.study.best_params,
                        "best_value": self.study.best_value,
                    },
                )

            return self.study.best_params, self.study.best_value

    def _get_default_param_space(self) -> Dict[str, Any]:
        """
        Get default parameter search space.

        Returns:
            Dictionary defining the default parameter search spaces
        """
        return {
            "iterations": {
                "type": "int",
                "low": 1,
                "high": 5,
                "step": 1,
            },
            "questions_per_iteration": {
                "type": "int",
                "low": 1,
                "high": 5,
                "step": 1,
            },
            "search_strategy": {
                "type": "categorical",
                "choices": [
                    "iterdrag",
                    "standard",
                    "rapid",
                    "parallel",
                    "source_based",
                ],
            },
            "max_results": {
                "type": "int",
                "low": 10,
                "high": 100,
                "step": 10,
            },
        }
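
    # A custom search space passed to optimize() follows the same schema; _objective
    # also understands a "float" type (mapping to trial.suggest_float, with optional
    # "step" and "log" keys), although _run_experiment currently only reads the four
    # keys defined above. A narrower, illustration-only example:
    #
    #     narrow_space = {
    #         "iterations": {"type": "int", "low": 1, "high": 3, "step": 1},
    #         "questions_per_iteration": {"type": "int", "low": 1, "high": 3, "step": 1},
    #         "search_strategy": {"type": "categorical", "choices": ["rapid", "standard"]},
    #     }
    #     best_params, best_score = optimizer.optimize(param_space=narrow_space)
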
    def _objective(
        self, trial: optuna.Trial, param_space: Dict[str, Any]
    ) -> float:
        """
        Objective function for Optuna optimization.

        Args:
            trial: Optuna trial object
            param_space: Dictionary defining parameter search spaces

        Returns:
            Score to maximize
        """
        # Generate parameters for this trial
        params = {}
        for param_name, param_config in param_space.items():
            param_type = param_config["type"]

            if param_type == "int":
                params[param_name] = trial.suggest_int(
                    param_name,
                    param_config["low"],
                    param_config["high"],
                    step=param_config.get("step", 1),
                )
            elif param_type == "float":
                params[param_name] = trial.suggest_float(
                    param_name,
                    param_config["low"],
                    param_config["high"],
                    step=param_config.get("step"),
                    log=param_config.get("log", False),
                )
            elif param_type == "categorical":
                params[param_name] = trial.suggest_categorical(
                    param_name, param_config["choices"]
                )

        # Log the trial parameters
        logger.info(f"Trial {trial.number}: {params}")

        # Update progress callback if available
        if self.progress_callback:
            self.progress_callback(
                trial.number,
                self.n_trials,
                {
                    "status": "running",
                    "stage": "trial_started",
                    "trial_number": trial.number,
                    "params": params,
                    "trials_completed": trial.number,
                    "total_trials": self.n_trials,
                },
            )

        # Run an experiment with these parameters
        try:
            start_time = time.time()
            result = self._run_experiment(params)
            duration = time.time() - start_time

            # Store details about the trial
            trial_info = {
                "trial_number": trial.number,
                "params": params,
                "result": result,
                "score": result.get("score", 0),
                "duration": duration,
                "timestamp": datetime.now(UTC).isoformat(),
            }
            self.trials_history.append(trial_info)

            # Update callback with results
            if self.progress_callback:
                self.progress_callback(
                    trial.number,
                    self.n_trials,
                    {
                        "status": "completed",
                        "stage": "trial_completed",
                        "trial_number": trial.number,
                        "params": params,
                        "score": result.get("score", 0),
                        "trials_completed": trial.number + 1,
                        "total_trials": self.n_trials,
                    },
                )

            logger.info(
                f"Trial {trial.number} completed: {params}, score: {result['score']:.4f}"
            )

            return result["score"]
        except Exception as e:
            logger.exception(f"Error in trial {trial.number}: {e!s}")

            # Update callback with error
            if self.progress_callback:
                self.progress_callback(
                    trial.number,
                    self.n_trials,
                    {
                        "status": "error",
                        "stage": "trial_error",
                        "trial_number": trial.number,
                        "params": params,
                        "error": str(e),
                        "trials_completed": trial.number,
                        "total_trials": self.n_trials,
                    },
                )

            return float("-inf")  # Return a very low score for failed trials

    def _run_experiment(self, params: Dict[str, Any]) -> Dict[str, Any]:
        """
        Run a single experiment with the given parameters.

        Args:
            params: Dictionary of parameters to test

        Returns:
            Results dictionary with metrics and score
        """
        # Extract parameters
        iterations = params.get("iterations", 2)
        questions_per_iteration = params.get("questions_per_iteration", 2)
        search_strategy = params.get("search_strategy", "iterdrag")
        max_results = params.get("max_results", 50)

        # Initialize profiling tools
        speed_profiler = SpeedProfiler()

        # Start profiling
        speed_profiler.start()

        try:
            # Create system configuration
            system_config = {
                "iterations": iterations,
                "questions_per_iteration": questions_per_iteration,
                "search_strategy": search_strategy,
                "search_tool": self.search_tool,
                "max_results": max_results,
                "model_name": self.model_name,
                "provider": self.provider,
            }

            # Evaluate quality using composite benchmark evaluator
            # Use a small number of examples for efficiency
            benchmark_dir = str(Path(self.output_dir) / "benchmark_temp")
            quality_results = self.benchmark_evaluator.evaluate(
                system_config=system_config,
                num_examples=5,  # Small number for optimization efficiency
                output_dir=benchmark_dir,
            )

            # Stop timing
            speed_profiler.stop()
            timing_results = speed_profiler.get_summary()

            # Extract key metrics
            quality_score = quality_results.get("quality_score", 0.0)
            benchmark_results = quality_results.get("benchmark_results", {})
            # Speed score: convert duration to a 0-1 score where faster is better.
            # Durations up to 60 seconds score 1.0; the score then declines linearly
            # and reaches 0.0 at 240 seconds (tuned for runs of ~5 examples).
            total_duration = timing_results.get("total_duration", 180)
            speed_score = max(0.0, min(1.0, 1.0 - (total_duration - 60) / 180))

            # Calculate combined score based on weights
            combined_score = (
                self.metric_weights.get("quality", 0.6) * quality_score
                + self.metric_weights.get("speed", 0.4) * speed_score
            )
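            # Worked example (illustrative numbers): with the default weights
            # {"quality": 0.6, "speed": 0.4}, a quality_score of 0.80 and a
            # total_duration of 120 s give speed_score = 1 - (120 - 60) / 180 ≈ 0.667,
            # so combined_score ≈ 0.6 * 0.80 + 0.4 * 0.667 ≈ 0.747.
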
            # Return streamlined results
            return {
                "quality_score": quality_score,
                "benchmark_results": benchmark_results,
                "speed_score": speed_score,
                "total_duration": total_duration,
                "score": combined_score,
                "success": True,
            }

        except Exception as e:
            # Stop profiling on error
            speed_profiler.stop()

            # Log error
            logger.exception(f"Error in experiment: {e!s}")

            # Return error information
            return {"error": str(e), "score": 0.0, "success": False}

    def _optimization_callback(self, study: optuna.Study, trial: optuna.Trial):
        """
        Callback for the Optuna optimization process.

        Args:
            study: Optuna study object
            trial: Current trial
        """
        # Save intermediate results periodically
        if trial.number % 10 == 0 and trial.number > 0:
            self._save_results()
            self._create_quick_visualizations()

    def _save_results(self):
        """Save the optimization results to disk."""
        # Create a timestamp for filenames
        timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")

        # Save trial history
        from ...security.file_write_verifier import write_json_verified

        history_file = str(
            Path(self.output_dir) / f"{self.study_name}_history.json"
        )

        # Convert numpy values to native Python types for JSON serialization
        clean_history = []
        for trial in self.trials_history:
            clean_trial = {}
            for k, v in trial.items():
                if isinstance(v, dict):
                    clean_trial[k] = {
                        dk: (float(dv) if isinstance(dv, np.number) else dv)
                        for dk, dv in v.items()
                    }
                elif isinstance(v, np.number):
                    clean_trial[k] = float(v)
                else:
                    clean_trial[k] = v
            clean_history.append(clean_trial)

        # Sanitize sensitive data before writing to disk
        sanitized_history = sanitize_data(clean_history)

        write_json_verified(
            history_file,
            sanitized_history,
            "benchmark.allow_file_output",
            context="optimization history",
        )

        # Save current best parameters
        if (
            self.study
            and hasattr(self.study, "best_params")
            and self.study.best_params
        ):
            best_params_file = str(
                Path(self.output_dir) / f"{self.study_name}_best_params.json"
            )

            best_params_data = {
                "best_params": self.study.best_params,
                "best_value": float(self.study.best_value),
                "n_trials": len(self.study.trials),
                "timestamp": timestamp,
                "base_query": self.base_query,
                "model_name": self.model_name,
                "provider": self.provider,
                "search_tool": self.search_tool,
                "metric_weights": self.metric_weights,
                "benchmark_weights": self.benchmark_weights,
            }

            # Sanitize sensitive data before writing to disk
            sanitized_best_params = sanitize_data(best_params_data)

            write_json_verified(
                best_params_file,
                sanitized_best_params,
                "benchmark.allow_file_output",
                context="optimization best params",
            )

        # Save the Optuna study
        if self.study:
            study_file = str(
                Path(self.output_dir) / f"{self.study_name}_study.pkl"
            )
            joblib.dump(self.study, study_file)
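            # The pickled study can later be reloaded for offline inspection with
            # joblib.load(study_file); this is a convenience note, and unpickling
            # should only be done on files you trust.
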
        logger.info(f"Results saved to {self.output_dir}")

    def _create_visualizations(self):
        """Create and save comprehensive visualizations of the optimization results."""
        if not PLOTTING_AVAILABLE:
            logger.warning(
                "Matplotlib not available, skipping visualization creation"
            )
            return

        if not self.study or len(self.study.trials) < 2:
            logger.warning("Not enough trials to create visualizations")
            return

        # Create directory for visualizations
        viz_dir = Path(self.output_dir) / "visualizations"
        viz_dir.mkdir(parents=True, exist_ok=True)
        viz_dir = str(viz_dir)

        # Create Optuna visualizations
        self._create_optuna_visualizations(viz_dir)

        # Create custom visualizations
        self._create_custom_visualizations(viz_dir)

        logger.info(f"Visualizations saved to {viz_dir}")

    def _create_quick_visualizations(self):
        """Create a smaller set of visualizations for intermediate progress."""
        if (
            not PLOTTING_AVAILABLE
            or not self.study
            or len(self.study.trials) < 2
        ):
            return

        # Create directory for visualizations
        viz_dir = Path(self.output_dir) / "visualizations"
        viz_dir.mkdir(parents=True, exist_ok=True)
        viz_dir = str(viz_dir)

        # Create optimization history only (faster than full visualization)
        try:
            fig = plot_optimization_history(self.study)
            fig.write_image(
                str(
                    Path(viz_dir)
                    / f"{self.study_name}_optimization_history_current.png"
                )
            )
        except Exception as e:
            logger.exception(f"Error creating optimization history plot: {e!s}")

    def _create_optuna_visualizations(self, viz_dir: str):
        """
        Create and save Optuna's built-in visualizations.

        Args:
            viz_dir: Directory to save visualizations
        """
        timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")

        # 1. Optimization history
        try:
            fig = plot_optimization_history(self.study)
            fig.write_image(
                str(
                    Path(viz_dir)
                    / f"{self.study_name}_optimization_history_{timestamp}.png"
                )
            )
        except Exception as e:
            logger.exception(f"Error creating optimization history plot: {e!s}")

        # 2. Parameter importances
        try:
            fig = plot_param_importances(self.study)
            fig.write_image(
                str(
                    Path(viz_dir)
                    / f"{self.study_name}_param_importances_{timestamp}.png"
                )
            )
        except Exception as e:
            logger.exception(
                f"Error creating parameter importances plot: {e!s}"
            )

        # 3. Slice plot for each parameter
        try:
            for param_name in self.study.best_params.keys():
                fig = plot_slice(self.study, [param_name])
                fig.write_image(
                    str(
                        Path(viz_dir)
                        / f"{self.study_name}_slice_{param_name}_{timestamp}.png"
                    )
                )
        except Exception as e:
            logger.exception(f"Error creating slice plots: {e!s}")

        # 4. Contour plots for important parameter pairs
        try:
            # Get all parameter names
            param_names = list(self.study.best_params.keys())

            # Create contour plots for each pair
            for i in range(len(param_names)):
                for j in range(i + 1, len(param_names)):
                    try:
                        fig = plot_contour(
                            self.study, params=[param_names[i], param_names[j]]
                        )
                        fig.write_image(
                            str(
                                Path(viz_dir)
                                / f"{self.study_name}_contour_{param_names[i]}_{param_names[j]}_{timestamp}.png"
                            )
                        )
                    except Exception as e:
                        logger.warning(
                            f"Error creating contour plot for {param_names[i]} vs {param_names[j]}: {e!s}"
                        )
        except Exception as e:
            logger.exception(f"Error creating contour plots: {e!s}")

    def _create_custom_visualizations(self, viz_dir: str):
        """
        Create custom visualizations based on trial history.

        Args:
            viz_dir: Directory to save visualizations
        """
        if not self.trials_history:
            return

        timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")

        # Create quality vs speed plot
        self._create_quality_vs_speed_plot(viz_dir, timestamp)

        # Create parameter evolution plots
        self._create_parameter_evolution_plots(viz_dir, timestamp)

        # Create trial duration vs score plot
        self._create_duration_vs_score_plot(viz_dir, timestamp)

    def _create_quality_vs_speed_plot(self, viz_dir: str, timestamp: str):
        """Create a plot showing quality vs. speed trade-off."""
        if not self.trials_history:
            return

        # Extract data from successful trials
        successful_trials = [
            t
            for t in self.trials_history
            if t.get("result", {}).get("success", False)
        ]

        if not successful_trials:
            logger.warning("No successful trials for visualization")
            return

        try:
            plt.figure(figsize=(10, 8))

            # Extract metrics
            quality_scores = []
            speed_scores = []
            labels = []
            iterations_values = []
            questions_values = []

            for trial in successful_trials:
                result = trial["result"]
                quality = result.get("quality_score", 0)
                speed = result.get("speed_score", 0)
                iterations = trial["params"].get("iterations", 0)
                questions = trial["params"].get("questions_per_iteration", 0)

                quality_scores.append(quality)
                speed_scores.append(speed)
                labels.append(f"Trial {trial['trial_number']}")
                iterations_values.append(iterations)
                questions_values.append(questions)

            # Create scatter plot with size based on iterations*questions
            sizes = [
                i * q * 5
                for i, q in zip(
                    iterations_values, questions_values, strict=False
                )
            ]
            scatter = plt.scatter(
                quality_scores,
                speed_scores,
                s=sizes,
                alpha=0.7,
                c=range(len(quality_scores)),
                cmap="viridis",
            )

            # Highlight best trial
            best_trial = max(
                successful_trials,
                key=lambda x: x.get("result", {}).get("score", 0),
            )
            best_quality = best_trial["result"].get("quality_score", 0)
            best_speed = best_trial["result"].get("speed_score", 0)
            best_iter = best_trial["params"].get("iterations", 0)
            best_questions = best_trial["params"].get(
                "questions_per_iteration", 0
            )

            plt.scatter(
                [best_quality],
                [best_speed],
                s=200,
                facecolors="none",
                edgecolors="red",
                linewidth=2,
                label=f"Best: {best_iter}×{best_questions}",
            )

            # Add annotations for key points
            for i, (q, s, label) in enumerate(
                zip(quality_scores, speed_scores, labels, strict=False)
            ):
                if i % max(1, len(quality_scores) // 5) == 0:  # Label ~5 points
                    plt.annotate(
                        f"{iterations_values[i]}×{questions_values[i]}",
                        (q, s),
                        xytext=(5, 5),
                        textcoords="offset points",
                    )

            # Add colorbar and labels
            cbar = plt.colorbar(scatter)
            cbar.set_label("Trial Progression")

            # Add benchmark weight information
            weights_str = ", ".join(
                [f"{k}:{v:.1f}" for k, v in self.benchmark_weights.items()]
            )
            plt.title(
                f"Quality vs. Speed Trade-off\nBenchmark Weights: {weights_str}"
            )
            plt.xlabel("Quality Score (Benchmark Accuracy)")
            plt.ylabel("Speed Score")
            plt.grid(True, linestyle="--", alpha=0.7)

            # Add legend explaining size
            legend_elements = [
                Line2D(
                    [0],
                    [0],
                    marker="o",
                    color="w",
                    markerfacecolor="gray",
                    markersize=np.sqrt(n * 5 / np.pi),
                    label=f"{n} Total Questions",
                )
                for n in [5, 10, 15, 20, 25]
            ]
            plt.legend(handles=legend_elements, title="Workload")

            # Save the figure
            plt.tight_layout()
            plt.savefig(
                str(
                    Path(viz_dir)
                    / f"{self.study_name}_quality_vs_speed_{timestamp}.png"
                )
            )
            plt.close()
        except Exception as e:
            logger.exception(f"Error creating quality vs speed plot: {e!s}")

    def _create_parameter_evolution_plots(self, viz_dir: str, timestamp: str):
        """Create plots showing how parameter values evolve over trials."""
        try:
            successful_trials = [
                t
                for t in self.trials_history
                if t.get("result", {}).get("success", False)
            ]

            if not successful_trials or len(successful_trials) < 5:
                return

            # Get key parameters
            main_params = list(successful_trials[0]["params"].keys())

            # For each parameter, plot its values over trials
            for param_name in main_params:
                plt.figure(figsize=(12, 6))

                trial_numbers = []
                param_values = []
                scores = []

                for trial in self.trials_history:
                    if "params" in trial and param_name in trial["params"]:
                        trial_numbers.append(trial["trial_number"])
                        param_values.append(trial["params"][param_name])
                        scores.append(trial.get("score", 0))

                # Create evolution plot
                scatter = plt.scatter(
                    trial_numbers,
                    param_values,
                    c=scores,
                    cmap="plasma",
                    alpha=0.8,
                    s=80,
                )

                # Add best trial marker
                best_trial_idx = scores.index(max(scores))
                plt.scatter(
                    [trial_numbers[best_trial_idx]],
                    [param_values[best_trial_idx]],
                    s=150,
                    facecolors="none",
                    edgecolors="red",
                    linewidth=2,
                    label=f"Best Value: {param_values[best_trial_idx]}",
                )

                # Add colorbar
                cbar = plt.colorbar(scatter)
                cbar.set_label("Score")

                # Set chart properties
                plt.title(f"Evolution of {param_name} Values")
                plt.xlabel("Trial Number")
                plt.ylabel(param_name)
                plt.grid(True, linestyle="--", alpha=0.7)
                plt.legend()

                # For categorical parameters, adjust y-axis
                if isinstance(param_values[0], str):
                    unique_values = sorted(set(param_values))
                    plt.yticks(range(len(unique_values)), unique_values)

                # Save the figure
                plt.tight_layout()
                plt.savefig(
                    str(
                        Path(viz_dir)
                        / f"{self.study_name}_param_evolution_{param_name}_{timestamp}.png"
                    )
                )
                plt.close()
        except Exception as e:
            logger.exception(f"Error creating parameter evolution plots: {e!s}")

    def _create_duration_vs_score_plot(self, viz_dir: str, timestamp: str):
        """Create a plot showing trial duration vs score."""
        try:
            plt.figure(figsize=(10, 6))

            successful_trials = [
                t
                for t in self.trials_history
                if t.get("result", {}).get("success", False)
            ]

            if not successful_trials:
                return

            trial_durations = []
            trial_scores = []
            trial_iterations = []
            trial_questions = []

            for trial in successful_trials:
                duration = trial.get("duration", 0)
                score = trial.get("score", 0)
                iterations = trial.get("params", {}).get("iterations", 1)
                questions = trial.get("params", {}).get(
                    "questions_per_iteration", 1
                )

                trial_durations.append(duration)
                trial_scores.append(score)
                trial_iterations.append(iterations)
                trial_questions.append(questions)

            # Total questions per trial
            total_questions = [
                i * q
                for i, q in zip(trial_iterations, trial_questions, strict=False)
            ]

            # Create scatter plot with size based on total questions
            plt.scatter(
                trial_durations,
                trial_scores,
                s=[q * 5 for q in total_questions],  # Size based on total questions
                alpha=0.7,
                c=range(len(trial_durations)),
                cmap="viridis",
            )

            # Add labels
            plt.xlabel("Trial Duration (seconds)")
            plt.ylabel("Score")
            plt.title("Trial Duration vs. Score")
            plt.grid(True, linestyle="--", alpha=0.7)

            # Add trial number annotations for selected points
            for i, (d, s) in enumerate(
                zip(trial_durations, trial_scores, strict=False)
            ):
                if i % max(1, len(trial_durations) // 5) == 0:  # Annotate ~5 points
                    plt.annotate(
                        f"{trial_iterations[i]}×{trial_questions[i]}",
                        (d, s),
                        xytext=(5, 5),
                        textcoords="offset points",
                    )

            # Save the figure
            plt.tight_layout()
            plt.savefig(
                str(
                    Path(viz_dir)
                    / f"{self.study_name}_duration_vs_score_{timestamp}.png"
                )
            )
            plt.close()
        except Exception as e:
            logger.exception(f"Error creating duration vs score plot: {e!s}")


def optimize_parameters(
    query: str,
    param_space: Optional[Dict[str, Any]] = None,
    output_dir: str = str(Path("data") / "optimization_results"),
    model_name: Optional[str] = None,
    provider: Optional[str] = None,
    search_tool: Optional[str] = None,
    temperature: float = 0.7,
    n_trials: int = 30,
    timeout: Optional[int] = None,
    n_jobs: int = 1,
    study_name: Optional[str] = None,
    optimization_metrics: Optional[List[str]] = None,
    metric_weights: Optional[Dict[str, float]] = None,
    progress_callback: Optional[Callable[[int, int, Dict], None]] = None,
    benchmark_weights: Optional[Dict[str, float]] = None,
) -> Tuple[Dict[str, Any], float]:
    """
    Optimize parameters for Local Deep Research.

    Args:
        query: The research query to use for all experiments
        param_space: Dictionary defining parameter search spaces (optional)
        output_dir: Directory to save optimization results
        model_name: Name of the LLM model to use
        provider: LLM provider
        search_tool: Search engine to use
        temperature: LLM temperature
        n_trials: Number of parameter combinations to try
        timeout: Maximum seconds to run optimization (None for no limit)
        n_jobs: Number of parallel jobs for optimization
        study_name: Name of the Optuna study
        optimization_metrics: List of metrics to optimize (default: ["quality", "speed"])
        metric_weights: Dictionary of weights for each metric (e.g., {"quality": 0.6, "speed": 0.4})
        progress_callback: Optional callback for progress updates
        benchmark_weights: Dictionary mapping benchmark types to weights
            (e.g., {"simpleqa": 0.6, "browsecomp": 0.4}).
            If None, only SimpleQA is used with weight 1.0.

    Returns:
        Tuple of (best_parameters, best_score)
    """
    # Create optimizer
    optimizer = OptunaOptimizer(
        base_query=query,
        output_dir=output_dir,
        model_name=model_name,
        provider=provider,
        search_tool=search_tool,
        temperature=temperature,
        n_trials=n_trials,
        timeout=timeout,
        n_jobs=n_jobs,
        study_name=study_name,
        optimization_metrics=optimization_metrics,
        metric_weights=metric_weights,
        progress_callback=progress_callback,
        benchmark_weights=benchmark_weights,
    )

    # Run optimization
    return optimizer.optimize(param_space)
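
# A minimal sketch of calling the convenience wrapper (illustrative only; the model,
# provider, and search tool names are placeholders, and the callback simply prints
# progress):
#
#     def print_progress(done, total, info):
#         print(f"{done}/{total}: {info.get('status')}")
#
#     best_params, best_score = optimize_parameters(
#         query="Impact of solid-state batteries on EV range",
#         model_name="gpt-4o-mini",      # placeholder
#         provider="openai",             # placeholder
#         search_tool="searxng",         # placeholder
#         n_trials=10,
#         progress_callback=print_progress,
#     )
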

def optimize_for_speed(
    query: str,
    n_trials: int = 20,
    output_dir: str = str(Path("data") / "optimization_results"),
    model_name: Optional[str] = None,
    provider: Optional[str] = None,
    search_tool: Optional[str] = None,
    progress_callback: Optional[Callable[[int, int, Dict], None]] = None,
    benchmark_weights: Optional[Dict[str, float]] = None,
) -> Tuple[Dict[str, Any], float]:
    """
    Optimize parameters with a focus on speed performance.

    Args:
        query: The research query to use for all experiments
        n_trials: Number of parameter combinations to try
        output_dir: Directory to save optimization results
        model_name: Name of the LLM model to use
        provider: LLM provider
        search_tool: Search engine to use
        progress_callback: Optional callback for progress updates
        benchmark_weights: Dictionary mapping benchmark types to weights
            (e.g., {"simpleqa": 0.6, "browsecomp": 0.4}).
            If None, only SimpleQA is used with weight 1.0.

    Returns:
        Tuple of (best_parameters, best_score)
    """
    # Focus on speed with reduced parameter space
    param_space = {
        "iterations": {
            "type": "int",
            "low": 1,
            "high": 3,
            "step": 1,
        },
        "questions_per_iteration": {
            "type": "int",
            "low": 1,
            "high": 3,
            "step": 1,
        },
        "search_strategy": {
            "type": "categorical",
            "choices": ["rapid", "parallel", "source_based"],
        },
    }

    # Speed-focused weights
    metric_weights = {"speed": 0.8, "quality": 0.2}

    return optimize_parameters(
        query=query,
        param_space=param_space,
        output_dir=output_dir,
        model_name=model_name,
        provider=provider,
        search_tool=search_tool,
        n_trials=n_trials,
        metric_weights=metric_weights,
        optimization_metrics=["speed", "quality"],
        progress_callback=progress_callback,
        benchmark_weights=benchmark_weights,
    )


def optimize_for_quality(
    query: str,
    n_trials: int = 30,
    output_dir: str = str(Path("data") / "optimization_results"),
    model_name: Optional[str] = None,
    provider: Optional[str] = None,
    search_tool: Optional[str] = None,
    progress_callback: Optional[Callable[[int, int, Dict], None]] = None,
    benchmark_weights: Optional[Dict[str, float]] = None,
) -> Tuple[Dict[str, Any], float]:
    """
    Optimize parameters with a focus on result quality.

    Args:
        query: The research query to use for all experiments
        n_trials: Number of parameter combinations to try
        output_dir: Directory to save optimization results
        model_name: Name of the LLM model to use
        provider: LLM provider
        search_tool: Search engine to use
        progress_callback: Optional callback for progress updates
        benchmark_weights: Dictionary mapping benchmark types to weights
            (e.g., {"simpleqa": 0.6, "browsecomp": 0.4}).
            If None, only SimpleQA is used with weight 1.0.

    Returns:
        Tuple of (best_parameters, best_score)
    """
    # Quality-focused weights
    metric_weights = {"quality": 0.9, "speed": 0.1}

    return optimize_parameters(
        query=query,
        output_dir=output_dir,
        model_name=model_name,
        provider=provider,
        search_tool=search_tool,
        n_trials=n_trials,
        metric_weights=metric_weights,
        optimization_metrics=["quality", "speed"],
        progress_callback=progress_callback,
        benchmark_weights=benchmark_weights,
    )


def optimize_for_efficiency(
    query: str,
    n_trials: int = 25,
    output_dir: str = str(Path("data") / "optimization_results"),
    model_name: Optional[str] = None,
    provider: Optional[str] = None,
    search_tool: Optional[str] = None,
    progress_callback: Optional[Callable[[int, int, Dict], None]] = None,
    benchmark_weights: Optional[Dict[str, float]] = None,
) -> Tuple[Dict[str, Any], float]:
    """
    Optimize parameters with a focus on resource efficiency.

    Args:
        query: The research query to use for all experiments
        n_trials: Number of parameter combinations to try
        output_dir: Directory to save optimization results
        model_name: Name of the LLM model to use
        provider: LLM provider
        search_tool: Search engine to use
        progress_callback: Optional callback for progress updates
        benchmark_weights: Dictionary mapping benchmark types to weights
            (e.g., {"simpleqa": 0.6, "browsecomp": 0.4}).
            If None, only SimpleQA is used with weight 1.0.

    Returns:
        Tuple of (best_parameters, best_score)
    """
    # Balance of quality, speed and resource usage
    metric_weights = {"quality": 0.4, "speed": 0.3, "resource": 0.3}

    return optimize_parameters(
        query=query,
        output_dir=output_dir,
        model_name=model_name,
        provider=provider,
        search_tool=search_tool,
        n_trials=n_trials,
        metric_weights=metric_weights,
        optimization_metrics=["quality", "speed", "resource"],
        progress_callback=progress_callback,
        benchmark_weights=benchmark_weights,
    )