Coverage for src/local_deep_research/benchmarks/optimization/optuna_optimizer.py: 65% (346 statements)
coverage.py v7.13.4, created at 2026-02-25 01:07 +0000

1"""
2Optuna-based parameter optimizer for Local Deep Research.
4This module provides the core optimization functionality using Optuna
5to find optimal parameters for the research system, balancing quality
6and performance metrics.
7"""
9import os
10from pathlib import Path
11import time
12from datetime import datetime, UTC
13from functools import partial
14from typing import Any, Callable, Dict, List, Optional, Tuple
16import joblib
17import numpy as np
18import optuna
19from optuna.visualization import (
20 plot_contour,
21 plot_optimization_history,
22 plot_param_importances,
23 plot_slice,
24)
26from local_deep_research.benchmarks.efficiency.speed_profiler import (
27 SpeedProfiler,
28)
29from local_deep_research.security import sanitize_data
30from loguru import logger
32from local_deep_research.benchmarks.evaluators import (
33 CompositeBenchmarkEvaluator,
34)
36# Import benchmark evaluator components
38# Try to import visualization libraries, but don't fail if not available
39try:
40 import matplotlib.pyplot as plt
41 from matplotlib.lines import Line2D
43 # We'll use matplotlib for plotting visualization results
45 PLOTTING_AVAILABLE = True
46except ImportError:
47 PLOTTING_AVAILABLE = False
48 logger.warning("Matplotlib not available, visualization will be limited")


class OptunaOptimizer:
    """
    Optimize parameters for Local Deep Research using Optuna.

    This class provides functionality to:
    1. Define search spaces for parameter optimization
    2. Evaluate parameter combinations using objective functions
    3. Find optimal parameters via Optuna
    4. Visualize and analyze optimization results
    """

    def __init__(
        self,
        base_query: str,
        output_dir: str = "optimization_results",
        model_name: Optional[str] = None,
        provider: Optional[str] = None,
        search_tool: Optional[str] = None,
        temperature: float = 0.7,
        n_trials: int = 30,
        timeout: Optional[int] = None,
        n_jobs: int = 1,
        study_name: Optional[str] = None,
        optimization_metrics: Optional[List[str]] = None,
        metric_weights: Optional[Dict[str, float]] = None,
        progress_callback: Optional[Callable[[int, int, Dict], None]] = None,
        benchmark_weights: Optional[Dict[str, float]] = None,
    ):
        """
        Initialize the optimizer.

        Args:
            base_query: The research query to use for all experiments
            output_dir: Directory to save optimization results
            model_name: Name of the LLM model to use
            provider: LLM provider
            search_tool: Search engine to use
            temperature: LLM temperature
            n_trials: Number of parameter combinations to try
            timeout: Maximum seconds to run optimization (None for no limit)
            n_jobs: Number of parallel jobs for optimization
            study_name: Name of the Optuna study
            optimization_metrics: List of metrics to optimize (default: ["quality", "speed"])
            metric_weights: Dictionary of weights for each metric (e.g., {"quality": 0.6, "speed": 0.4})
            progress_callback: Optional callback for progress updates
            benchmark_weights: Dictionary mapping benchmark types to weights
                (e.g., {"simpleqa": 0.6, "browsecomp": 0.4})
                If None, only SimpleQA is used with weight 1.0
        """
        self.base_query = base_query
        self.output_dir = output_dir
        self.model_name = model_name
        self.provider = provider
        self.search_tool = search_tool
        self.temperature = temperature
        self.n_trials = n_trials
        self.timeout = timeout
        self.n_jobs = n_jobs
        self.optimization_metrics = optimization_metrics or ["quality", "speed"]
        self.metric_weights = metric_weights or {"quality": 0.6, "speed": 0.4}
        self.progress_callback = progress_callback

        # Initialize benchmark evaluator with weights
        self.benchmark_weights = benchmark_weights or {"simpleqa": 1.0}
        self.benchmark_evaluator = CompositeBenchmarkEvaluator(
            self.benchmark_weights
        )

        # Normalize weights to sum to 1.0
        total_weight = sum(self.metric_weights.values())
        if total_weight > 0:
            self.metric_weights = {
                k: v / total_weight for k, v in self.metric_weights.items()
            }

        # Generate a unique study name if not provided
        timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
        self.study_name = study_name or f"ldr_opt_{timestamp}"

        # Create output directory
        os.makedirs(output_dir, exist_ok=True)

        # Store the trial history for analysis
        self.trials_history = []

        # Storage for the best parameters and study
        self.best_params = None
        self.study = None

    def optimize(
        self, param_space: Optional[Dict[str, Any]] = None
    ) -> Tuple[Dict[str, Any], float]:
        """
        Run the optimization process using Optuna.

        Args:
            param_space: Dictionary defining parameter search spaces
                (if None, use default spaces)

        Returns:
            Tuple containing (best_parameters, best_score)
        """
        param_space = param_space or self._get_default_param_space()

        # Create a study object
        storage_name = f"sqlite:///{self.output_dir}/{self.study_name}.db"
        self.study = optuna.create_study(
            study_name=self.study_name,
            storage=storage_name,
            load_if_exists=True,
            direction="maximize",
            sampler=optuna.samplers.TPESampler(seed=42),
        )

        # Create partial function with param_space
        objective = partial(self._objective, param_space=param_space)

        # Log optimization start
        logger.info(
            f"Starting optimization with {self.n_trials} trials, {self.n_jobs} parallel jobs"
        )
        logger.info(f"Parameter space: {param_space}")
        logger.info(f"Metric weights: {self.metric_weights}")
        logger.info(f"Benchmark weights: {self.benchmark_weights}")

        # Initialize progress tracking
        if self.progress_callback:
            self.progress_callback(
                0,
                self.n_trials,
                {
                    "status": "starting",
                    "stage": "initialization",
                    "trials_completed": 0,
                    "total_trials": self.n_trials,
                },
            )

        try:
            # Run optimization
            self.study.optimize(
                objective,
                n_trials=self.n_trials,
                timeout=self.timeout,
                n_jobs=self.n_jobs,
                callbacks=[self._optimization_callback],
                show_progress_bar=True,
            )

            # Store best parameters
            self.best_params = self.study.best_params

            # Save the results
            self._save_results()

            # Create visualizations
            self._create_visualizations()

            logger.info(
                f"Optimization complete. Best parameters: {self.best_params}"
            )
            logger.info(f"Best value: {self.study.best_value}")

            # Report completion
            if self.progress_callback:
                self.progress_callback(
                    self.n_trials,
                    self.n_trials,
                    {
                        "status": "completed",
                        "stage": "finished",
                        "trials_completed": len(self.study.trials),
                        "total_trials": self.n_trials,
                        "best_params": self.best_params,
                        "best_value": self.study.best_value,
                    },
                )

            return self.best_params, self.study.best_value

        except KeyboardInterrupt:
            logger.info("Optimization interrupted by user")
            # Still save what we have
            self._save_results()
            self._create_visualizations()

            # Report interruption
            if self.progress_callback:
                self.progress_callback(
                    len(self.study.trials),
                    self.n_trials,
                    {
                        "status": "interrupted",
                        "stage": "interrupted",
                        "trials_completed": len(self.study.trials),
                        "total_trials": self.n_trials,
                        "best_params": self.study.best_params,
                        "best_value": self.study.best_value,
                    },
                )

            return self.study.best_params, self.study.best_value

    def _get_default_param_space(self) -> Dict[str, Any]:
        """
        Get default parameter search space.

        Returns:
            Dictionary defining the default parameter search spaces
        """
        return {
            "iterations": {
                "type": "int",
                "low": 1,
                "high": 5,
                "step": 1,
            },
            "questions_per_iteration": {
                "type": "int",
                "low": 1,
                "high": 5,
                "step": 1,
            },
            "search_strategy": {
                "type": "categorical",
                "choices": [
                    "iterdrag",
                    "standard",
                    "rapid",
                    "parallel",
                    "source_based",
                ],
            },
            "max_results": {
                "type": "int",
                "low": 10,
                "high": 100,
                "step": 10,
            },
        }
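
    # Editor's illustrative note: besides the "int" and "categorical" entries
    # used above, _objective() below also understands "float" entries. A custom
    # space passed to optimize() could therefore look like the sketch below;
    # the float parameter is hypothetical and is not consumed by
    # _run_experiment, it only illustrates the spec format:
    #
    #     {
    #         "iterations": {"type": "int", "low": 1, "high": 3, "step": 1},
    #         "example_float_param": {"type": "float", "low": 0.1, "high": 1.0, "log": False},
    #         "search_strategy": {"type": "categorical", "choices": ["rapid", "standard"]},
    #     }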

    def _objective(
        self, trial: optuna.Trial, param_space: Dict[str, Any]
    ) -> float:
        """
        Objective function for Optuna optimization.

        Args:
            trial: Optuna trial object
            param_space: Dictionary defining parameter search spaces

        Returns:
            Score to maximize
        """
        # Generate parameters for this trial
        params = {}
        for param_name, param_config in param_space.items():
            param_type = param_config["type"]

            if param_type == "int":
                params[param_name] = trial.suggest_int(
                    param_name,
                    param_config["low"],
                    param_config["high"],
                    step=param_config.get("step", 1),
                )
            elif param_type == "float":
                params[param_name] = trial.suggest_float(
                    param_name,
                    param_config["low"],
                    param_config["high"],
                    step=param_config.get("step"),
                    log=param_config.get("log", False),
                )
            elif param_type == "categorical":
                params[param_name] = trial.suggest_categorical(
                    param_name, param_config["choices"]
                )

        # Log the trial parameters
        logger.info(f"Trial {trial.number}: {params}")

        # Update progress callback if available
        if self.progress_callback:
            self.progress_callback(
                trial.number,
                self.n_trials,
                {
                    "status": "running",
                    "stage": "trial_started",
                    "trial_number": trial.number,
                    "params": params,
                    "trials_completed": trial.number,
                    "total_trials": self.n_trials,
                },
            )

        # Run an experiment with these parameters
        try:
            start_time = time.time()
            result = self._run_experiment(params)
            duration = time.time() - start_time

            # Store details about the trial
            trial_info = {
                "trial_number": trial.number,
                "params": params,
                "result": result,
                "score": result.get("score", 0),
                "duration": duration,
                "timestamp": datetime.now(UTC).isoformat(),
            }
            self.trials_history.append(trial_info)

            # Update callback with results
            if self.progress_callback:
                self.progress_callback(
                    trial.number,
                    self.n_trials,
                    {
                        "status": "completed",
                        "stage": "trial_completed",
                        "trial_number": trial.number,
                        "params": params,
                        "score": result.get("score", 0),
                        "trials_completed": trial.number + 1,
                        "total_trials": self.n_trials,
                    },
                )

            logger.info(
                f"Trial {trial.number} completed: {params}, score: {result['score']:.4f}"
            )

            return result["score"]
        except Exception as e:
            logger.exception(f"Error in trial {trial.number}")

            # Update callback with error
            if self.progress_callback:
                self.progress_callback(
                    trial.number,
                    self.n_trials,
                    {
                        "status": "error",
                        "stage": "trial_error",
                        "trial_number": trial.number,
                        "params": params,
                        "error": str(e),
                        "trials_completed": trial.number,
                        "total_trials": self.n_trials,
                    },
                )

            return float("-inf")  # Return a very low score for failed trials

    def _run_experiment(self, params: Dict[str, Any]) -> Dict[str, Any]:
        """
        Run a single experiment with the given parameters.

        Args:
            params: Dictionary of parameters to test

        Returns:
            Results dictionary with metrics and score
        """
        # Extract parameters
        iterations = params.get("iterations", 2)
        questions_per_iteration = params.get("questions_per_iteration", 2)
        search_strategy = params.get("search_strategy", "iterdrag")
        max_results = params.get("max_results", 50)

        # Initialize profiling tools
        speed_profiler = SpeedProfiler()

        # Start profiling
        speed_profiler.start()

        try:
            # Create system configuration
            system_config = {
                "iterations": iterations,
                "questions_per_iteration": questions_per_iteration,
                "search_strategy": search_strategy,
                "search_tool": self.search_tool,
                "max_results": max_results,
                "model_name": self.model_name,
                "provider": self.provider,
            }

            # Evaluate quality using composite benchmark evaluator
            # Use a small number of examples for efficiency
            benchmark_dir = str(Path(self.output_dir) / "benchmark_temp")
            quality_results = self.benchmark_evaluator.evaluate(
                system_config=system_config,
                num_examples=5,  # Small number for optimization efficiency
                output_dir=benchmark_dir,
            )

            # Stop timing
            speed_profiler.stop()
            timing_results = speed_profiler.get_summary()

            # Extract key metrics
            quality_score = quality_results.get("quality_score", 0.0)
            benchmark_results = quality_results.get("benchmark_results", {})

            # Speed score: convert duration to a 0-1 score where faster is better.
            # Runs of up to 60 seconds score 1.0; the score then declines
            # linearly over the next 180 seconds and reaches 0.0 at 240 seconds
            # or more (durations are for the 5-example benchmark above).
            total_duration = timing_results.get("total_duration", 180)
            speed_score = max(0.0, min(1.0, 1.0 - (total_duration - 60) / 180))

            # Calculate combined score based on weights
            combined_score = (
                self.metric_weights.get("quality", 0.6) * quality_score
                + self.metric_weights.get("speed", 0.4) * speed_score
            )

            # Return streamlined results
            return {
                "quality_score": quality_score,
                "benchmark_results": benchmark_results,
                "speed_score": speed_score,
                "total_duration": total_duration,
                "score": combined_score,
                "success": True,
            }

        except Exception as e:
            # Stop profiling on error
            speed_profiler.stop()

            # Log error
            logger.exception("Error in experiment")

            # Return error information
            return {"error": str(e), "score": 0.0, "success": False}

    def _optimization_callback(self, study: optuna.Study, trial: optuna.Trial):
        """
        Callback for the Optuna optimization process.

        Args:
            study: Optuna study object
            trial: Current trial
        """
        # Save intermediate results periodically
        if trial.number % 10 == 0 and trial.number > 0:
            self._save_results()
            self._create_quick_visualizations()

    def _save_results(self):
        """Save the optimization results to disk."""
        # Create a timestamp for filenames
        timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")

        # Save trial history
        from ...security.file_write_verifier import write_json_verified

        history_file = str(
            Path(self.output_dir) / f"{self.study_name}_history.json"
        )

        # Convert numpy values to native Python types for JSON serialization
        clean_history = []
        for trial in self.trials_history:
            clean_trial = {}
            for k, v in trial.items():
                if isinstance(v, dict):
                    clean_trial[k] = {
                        dk: (float(dv) if isinstance(dv, np.number) else dv)
                        for dk, dv in v.items()
                    }
                elif isinstance(v, np.number):
                    clean_trial[k] = float(v)
                else:
                    clean_trial[k] = v
            clean_history.append(clean_trial)

        # Sanitize sensitive data before writing to disk
        sanitized_history = sanitize_data(clean_history)

        write_json_verified(
            history_file,
            sanitized_history,
            "benchmark.allow_file_output",
            context="optimization history",
        )

        # Save current best parameters
        if (
            self.study
            and hasattr(self.study, "best_params")
            and self.study.best_params
        ):
            best_params_file = str(
                Path(self.output_dir) / f"{self.study_name}_best_params.json"
            )

            best_params_data = {
                "best_params": self.study.best_params,
                "best_value": float(self.study.best_value),
                "n_trials": len(self.study.trials),
                "timestamp": timestamp,
                "base_query": self.base_query,
                "model_name": self.model_name,
                "provider": self.provider,
                "search_tool": self.search_tool,
                "metric_weights": self.metric_weights,
                "benchmark_weights": self.benchmark_weights,
            }

            # Sanitize sensitive data before writing to disk
            sanitized_best_params = sanitize_data(best_params_data)

            write_json_verified(
                best_params_file,
                sanitized_best_params,
                "benchmark.allow_file_output",
                context="optimization best params",
            )

        # Save the Optuna study
        if self.study:
            study_file = str(
                Path(self.output_dir) / f"{self.study_name}_study.pkl"
            )
            joblib.dump(self.study, study_file)

        logger.info(f"Results saved to {self.output_dir}")

    def _create_visualizations(self):
        """Create and save comprehensive visualizations of the optimization results."""
        if not PLOTTING_AVAILABLE:
            logger.warning(
                "Matplotlib not available, skipping visualization creation"
            )
            return

        if not self.study or len(self.study.trials) < 2:
            logger.warning("Not enough trials to create visualizations")
            return

        # Create directory for visualizations
        viz_dir = Path(self.output_dir) / "visualizations"
        viz_dir.mkdir(parents=True, exist_ok=True)
        viz_dir = str(viz_dir)

        # Create Optuna visualizations
        self._create_optuna_visualizations(viz_dir)

        # Create custom visualizations
        self._create_custom_visualizations(viz_dir)

        logger.info(f"Visualizations saved to {viz_dir}")

    def _create_quick_visualizations(self):
        """Create a smaller set of visualizations for intermediate progress."""
        if (
            not PLOTTING_AVAILABLE
            or not self.study
            or len(self.study.trials) < 2
        ):
            return

        # Create directory for visualizations
        viz_dir = Path(self.output_dir) / "visualizations"
        viz_dir.mkdir(parents=True, exist_ok=True)
        viz_dir = str(viz_dir)

        # Create optimization history only (faster than full visualization)
        try:
            fig = plot_optimization_history(self.study)
            fig.write_image(
                str(
                    Path(viz_dir)
                    / f"{self.study_name}_optimization_history_current.png"
                )
            )
        except Exception:
            logger.exception("Error creating optimization history plot")

    def _create_optuna_visualizations(self, viz_dir: str):
        """
        Create and save Optuna's built-in visualizations.

        Args:
            viz_dir: Directory to save visualizations
        """
        timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")

        # 1. Optimization history
        try:
            fig = plot_optimization_history(self.study)
            fig.write_image(
                str(
                    Path(viz_dir)
                    / f"{self.study_name}_optimization_history_{timestamp}.png"
                )
            )
        except Exception:
            logger.exception("Error creating optimization history plot")

        # 2. Parameter importances
        try:
            fig = plot_param_importances(self.study)
            fig.write_image(
                str(
                    Path(viz_dir)
                    / f"{self.study_name}_param_importances_{timestamp}.png"
                )
            )
        except Exception:
            logger.exception("Error creating parameter importances plot")

        # 3. Slice plot for each parameter
        try:
            for param_name in self.study.best_params.keys():
                fig = plot_slice(self.study, [param_name])
                fig.write_image(
                    str(
                        Path(viz_dir)
                        / f"{self.study_name}_slice_{param_name}_{timestamp}.png"
                    )
                )
        except Exception:
            logger.exception("Error creating slice plots")

        # 4. Contour plots for important parameter pairs
        try:
            # Get all parameter names
            param_names = list(self.study.best_params.keys())

            # Create contour plots for each pair
            for i in range(len(param_names)):
                for j in range(i + 1, len(param_names)):
                    try:
                        fig = plot_contour(
                            self.study, params=[param_names[i], param_names[j]]
                        )
                        fig.write_image(
                            str(
                                Path(viz_dir)
                                / f"{self.study_name}_contour_{param_names[i]}_{param_names[j]}_{timestamp}.png"
                            )
                        )
                    except Exception as e:
                        logger.warning(
                            f"Error creating contour plot for {param_names[i]} vs {param_names[j]}: {e!s}"
                        )
        except Exception:
            logger.exception("Error creating contour plots")

    def _create_custom_visualizations(self, viz_dir: str):
        """
        Create custom visualizations based on trial history.

        Args:
            viz_dir: Directory to save visualizations
        """
        if not self.trials_history:
            return

        timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")

        # Create quality vs speed plot
        self._create_quality_vs_speed_plot(viz_dir, timestamp)

        # Create parameter evolution plots
        self._create_parameter_evolution_plots(viz_dir, timestamp)

        # Create trial duration vs score plot
        self._create_duration_vs_score_plot(viz_dir, timestamp)

    def _create_quality_vs_speed_plot(self, viz_dir: str, timestamp: str):
        """Create a plot showing quality vs. speed trade-off."""
        if not self.trials_history:
            return

        # Extract data from successful trials
        successful_trials = [
            t
            for t in self.trials_history
            if t.get("result", {}).get("success", False)
        ]

        if not successful_trials:
            logger.warning("No successful trials for visualization")
            return

        try:
            plt.figure(figsize=(10, 8))

            # Extract metrics
            quality_scores = []
            speed_scores = []
            labels = []
            iterations_values = []
            questions_values = []

            for trial in successful_trials:
                result = trial["result"]
                quality = result.get("quality_score", 0)
                speed = result.get("speed_score", 0)
                iterations = trial["params"].get("iterations", 0)
                questions = trial["params"].get("questions_per_iteration", 0)

                quality_scores.append(quality)
                speed_scores.append(speed)
                labels.append(f"Trial {trial['trial_number']}")
                iterations_values.append(iterations)
                questions_values.append(questions)

            # Create scatter plot with size based on iterations*questions
            sizes = [
                i * q * 5
                for i, q in zip(
                    iterations_values, questions_values, strict=False
                )
            ]
            scatter = plt.scatter(
                quality_scores,
                speed_scores,
                s=sizes,
                alpha=0.7,
                c=range(len(quality_scores)),
                cmap="viridis",
            )

            # Highlight best trial
            best_trial = max(
                successful_trials,
                key=lambda x: x.get("result", {}).get("score", 0),
            )
            best_quality = best_trial["result"].get("quality_score", 0)
            best_speed = best_trial["result"].get("speed_score", 0)
            best_iter = best_trial["params"].get("iterations", 0)
            best_questions = best_trial["params"].get(
                "questions_per_iteration", 0
            )

            plt.scatter(
                [best_quality],
                [best_speed],
                s=200,
                facecolors="none",
                edgecolors="red",
                linewidth=2,
                label=f"Best: {best_iter}×{best_questions}",
            )

            # Add annotations for key points
            for i, (q, s, label) in enumerate(
                zip(quality_scores, speed_scores, labels, strict=False)
            ):
                if i % max(1, len(quality_scores) // 5) == 0:  # Label ~5 points
                    plt.annotate(
                        f"{iterations_values[i]}×{questions_values[i]}",
                        (q, s),
                        xytext=(5, 5),
                        textcoords="offset points",
                    )

            # Add colorbar and labels
            cbar = plt.colorbar(scatter)
            cbar.set_label("Trial Progression")

            # Add benchmark weight information
            weights_str = ", ".join(
                [f"{k}:{v:.1f}" for k, v in self.benchmark_weights.items()]
            )
            plt.title(
                f"Quality vs. Speed Trade-off\nBenchmark Weights: {weights_str}"
            )
            plt.xlabel("Quality Score (Benchmark Accuracy)")
            plt.ylabel("Speed Score")
            plt.grid(True, linestyle="--", alpha=0.7)

            # Add legend explaining size
            legend_elements = [
                Line2D(
                    [0],
                    [0],
                    marker="o",
                    color="w",
                    markerfacecolor="gray",
                    markersize=np.sqrt(n * 5 / np.pi),
                    label=f"{n} Total Questions",
                )
                for n in [5, 10, 15, 20, 25]
            ]
            plt.legend(handles=legend_elements, title="Workload")

            # Save the figure
            plt.tight_layout()
            plt.savefig(
                str(
                    Path(viz_dir)
                    / f"{self.study_name}_quality_vs_speed_{timestamp}.png"
                )
            )
            plt.close()
        except Exception:
            logger.exception("Error creating quality vs speed plot")

    def _create_parameter_evolution_plots(self, viz_dir: str, timestamp: str):
        """Create plots showing how parameter values evolve over trials."""
        try:
            successful_trials = [
                t
                for t in self.trials_history
                if t.get("result", {}).get("success", False)
            ]

            if not successful_trials or len(successful_trials) < 5:
                return

            # Get key parameters
            main_params = list(successful_trials[0]["params"].keys())

            # For each parameter, plot its values over trials
            for param_name in main_params:
                plt.figure(figsize=(12, 6))

                trial_numbers = []
                param_values = []
                scores = []

                for trial in self.trials_history:
                    if "params" in trial and param_name in trial["params"]:
                        trial_numbers.append(trial["trial_number"])
                        param_values.append(trial["params"][param_name])
                        scores.append(trial.get("score", 0))

                # Create evolution plot
                scatter = plt.scatter(
                    trial_numbers,
                    param_values,
                    c=scores,
                    cmap="plasma",
                    alpha=0.8,
                    s=80,
                )

                # Add best trial marker
                best_trial_idx = scores.index(max(scores))
                plt.scatter(
                    [trial_numbers[best_trial_idx]],
                    [param_values[best_trial_idx]],
                    s=150,
                    facecolors="none",
                    edgecolors="red",
                    linewidth=2,
                    label=f"Best Value: {param_values[best_trial_idx]}",
                )

                # Add colorbar
                cbar = plt.colorbar(scatter)
                cbar.set_label("Score")

                # Set chart properties
                plt.title(f"Evolution of {param_name} Values")
                plt.xlabel("Trial Number")
                plt.ylabel(param_name)
                plt.grid(True, linestyle="--", alpha=0.7)
                plt.legend()

                # For categorical parameters, adjust y-axis
                if isinstance(param_values[0], str):
                    unique_values = sorted(set(param_values))
                    plt.yticks(range(len(unique_values)), unique_values)

                # Save the figure
                plt.tight_layout()
                plt.savefig(
                    str(
                        Path(viz_dir)
                        / f"{self.study_name}_param_evolution_{param_name}_{timestamp}.png"
                    )
                )
                plt.close()
        except Exception:
            logger.exception("Error creating parameter evolution plots")

    def _create_duration_vs_score_plot(self, viz_dir: str, timestamp: str):
        """Create a plot showing trial duration vs score."""
        try:
            plt.figure(figsize=(10, 6))

            successful_trials = [
                t
                for t in self.trials_history
                if t.get("result", {}).get("success", False)
            ]

            if not successful_trials:
                return

            trial_durations = []
            trial_scores = []
            trial_iterations = []
            trial_questions = []

            for trial in successful_trials:
                duration = trial.get("duration", 0)
                score = trial.get("score", 0)
                iterations = trial.get("params", {}).get("iterations", 1)
                questions = trial.get("params", {}).get(
                    "questions_per_iteration", 1
                )

                trial_durations.append(duration)
                trial_scores.append(score)
                trial_iterations.append(iterations)
                trial_questions.append(questions)

            # Total questions per trial
            total_questions = [
                i * q
                for i, q in zip(trial_iterations, trial_questions, strict=False)
            ]

            # Create scatter plot with size based on total questions
            plt.scatter(
                trial_durations,
                trial_scores,
                s=[
                    q * 5 for q in total_questions
                ],  # Size based on total questions
                alpha=0.7,
                c=range(len(trial_durations)),
                cmap="viridis",
            )

            # Add labels
            plt.xlabel("Trial Duration (seconds)")
            plt.ylabel("Score")
            plt.title("Trial Duration vs. Score")
            plt.grid(True, linestyle="--", alpha=0.7)

            # Add trial number annotations for selected points
            for i, (d, s) in enumerate(
                zip(trial_durations, trial_scores, strict=False)
            ):
                if (
                    i % max(1, len(trial_durations) // 5) == 0
                ):  # Annotate ~5 points
                    plt.annotate(
                        f"{trial_iterations[i]}×{trial_questions[i]}",
                        (d, s),
                        xytext=(5, 5),
                        textcoords="offset points",
                    )

            # Save the figure
            plt.tight_layout()
            plt.savefig(
                str(
                    Path(viz_dir)
                    / f"{self.study_name}_duration_vs_score_{timestamp}.png"
                )
            )
            plt.close()
        except Exception:
            logger.exception("Error creating duration vs score plot")
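

# Illustrative sketch (editor's addition, not part of the library API): a
# minimal progress_callback matching the Callable[[int, int, Dict], None]
# signature documented above. The keys read from `info` ("status", "stage",
# "score") are among those emitted by OptunaOptimizer; the log format itself
# is an arbitrary choice.
def _example_progress_callback(completed: int, total: int, info: Dict) -> None:
    """Log a one-line summary of optimizer progress."""
    status = info.get("status", "unknown")
    stage = info.get("stage", "")
    message = f"[{completed}/{total}] {status}/{stage}"
    score = info.get("score")
    if score is not None:
        message += f" score={score:.4f}"
    logger.info(message)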


def optimize_parameters(
    query: str,
    param_space: Optional[Dict[str, Any]] = None,
    output_dir: str = str(Path("data") / "optimization_results"),
    model_name: Optional[str] = None,
    provider: Optional[str] = None,
    search_tool: Optional[str] = None,
    temperature: float = 0.7,
    n_trials: int = 30,
    timeout: Optional[int] = None,
    n_jobs: int = 1,
    study_name: Optional[str] = None,
    optimization_metrics: Optional[List[str]] = None,
    metric_weights: Optional[Dict[str, float]] = None,
    progress_callback: Optional[Callable[[int, int, Dict], None]] = None,
    benchmark_weights: Optional[Dict[str, float]] = None,
) -> Tuple[Dict[str, Any], float]:
    """
    Optimize parameters for Local Deep Research.

    Args:
        query: The research query to use for all experiments
        param_space: Dictionary defining parameter search spaces (optional)
        output_dir: Directory to save optimization results
        model_name: Name of the LLM model to use
        provider: LLM provider
        search_tool: Search engine to use
        temperature: LLM temperature
        n_trials: Number of parameter combinations to try
        timeout: Maximum seconds to run optimization (None for no limit)
        n_jobs: Number of parallel jobs for optimization
        study_name: Name of the Optuna study
        optimization_metrics: List of metrics to optimize (default: ["quality", "speed"])
        metric_weights: Dictionary of weights for each metric (e.g., {"quality": 0.6, "speed": 0.4})
        progress_callback: Optional callback for progress updates
        benchmark_weights: Dictionary mapping benchmark types to weights
            (e.g., {"simpleqa": 0.6, "browsecomp": 0.4})
            If None, only SimpleQA is used with weight 1.0

    Returns:
        Tuple of (best_parameters, best_score)
    """
    # Create optimizer
    optimizer = OptunaOptimizer(
        base_query=query,
        output_dir=output_dir,
        model_name=model_name,
        provider=provider,
        search_tool=search_tool,
        temperature=temperature,
        n_trials=n_trials,
        timeout=timeout,
        n_jobs=n_jobs,
        study_name=study_name,
        optimization_metrics=optimization_metrics,
        metric_weights=metric_weights,
        progress_callback=progress_callback,
        benchmark_weights=benchmark_weights,
    )

    # Run optimization
    return optimizer.optimize(param_space)
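

# Usage sketch (editor's addition): how a caller might run a short optimization
# with a custom search space and the example callback above. The query text and
# trial count are arbitrary; the parameter names and strategy choices come from
# the default space defined in OptunaOptimizer._get_default_param_space().
def _example_optimize_parameters_usage() -> Tuple[Dict[str, Any], float]:
    custom_space = {
        "iterations": {"type": "int", "low": 1, "high": 3, "step": 1},
        "search_strategy": {
            "type": "categorical",
            "choices": ["rapid", "source_based"],
        },
    }
    return optimize_parameters(
        query="What are the main drivers of coral reef bleaching?",
        param_space=custom_space,
        n_trials=5,
        progress_callback=_example_progress_callback,
    )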


def optimize_for_speed(
    query: str,
    n_trials: int = 20,
    output_dir: str = str(Path("data") / "optimization_results"),
    model_name: Optional[str] = None,
    provider: Optional[str] = None,
    search_tool: Optional[str] = None,
    progress_callback: Optional[Callable[[int, int, Dict], None]] = None,
    benchmark_weights: Optional[Dict[str, float]] = None,
) -> Tuple[Dict[str, Any], float]:
    """
    Optimize parameters with a focus on speed performance.

    Args:
        query: The research query to use for all experiments
        n_trials: Number of parameter combinations to try
        output_dir: Directory to save optimization results
        model_name: Name of the LLM model to use
        provider: LLM provider
        search_tool: Search engine to use
        progress_callback: Optional callback for progress updates
        benchmark_weights: Dictionary mapping benchmark types to weights
            (e.g., {"simpleqa": 0.6, "browsecomp": 0.4})
            If None, only SimpleQA is used with weight 1.0

    Returns:
        Tuple of (best_parameters, best_score)
    """
    # Focus on speed with reduced parameter space
    param_space = {
        "iterations": {
            "type": "int",
            "low": 1,
            "high": 3,
            "step": 1,
        },
        "questions_per_iteration": {
            "type": "int",
            "low": 1,
            "high": 3,
            "step": 1,
        },
        "search_strategy": {
            "type": "categorical",
            "choices": ["rapid", "parallel", "source_based"],
        },
    }

    # Speed-focused weights
    metric_weights = {"speed": 0.8, "quality": 0.2}

    return optimize_parameters(
        query=query,
        param_space=param_space,
        output_dir=output_dir,
        model_name=model_name,
        provider=provider,
        search_tool=search_tool,
        n_trials=n_trials,
        metric_weights=metric_weights,
        optimization_metrics=["speed", "quality"],
        progress_callback=progress_callback,
        benchmark_weights=benchmark_weights,
    )
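

# Worked example (editor's addition): the speed-focused weights above already
# sum to 1.0, so __init__'s normalization leaves them unchanged and
# _run_experiment scores each trial as 0.2 * quality_score + 0.8 * speed_score.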


def optimize_for_quality(
    query: str,
    n_trials: int = 30,
    output_dir: str = str(Path("data") / "optimization_results"),
    model_name: Optional[str] = None,
    provider: Optional[str] = None,
    search_tool: Optional[str] = None,
    progress_callback: Optional[Callable[[int, int, Dict], None]] = None,
    benchmark_weights: Optional[Dict[str, float]] = None,
) -> Tuple[Dict[str, Any], float]:
    """
    Optimize parameters with a focus on result quality.

    Args:
        query: The research query to use for all experiments
        n_trials: Number of parameter combinations to try
        output_dir: Directory to save optimization results
        model_name: Name of the LLM model to use
        provider: LLM provider
        search_tool: Search engine to use
        progress_callback: Optional callback for progress updates
        benchmark_weights: Dictionary mapping benchmark types to weights
            (e.g., {"simpleqa": 0.6, "browsecomp": 0.4})
            If None, only SimpleQA is used with weight 1.0

    Returns:
        Tuple of (best_parameters, best_score)
    """
    # Quality-focused weights
    metric_weights = {"quality": 0.9, "speed": 0.1}

    return optimize_parameters(
        query=query,
        output_dir=output_dir,
        model_name=model_name,
        provider=provider,
        search_tool=search_tool,
        n_trials=n_trials,
        metric_weights=metric_weights,
        optimization_metrics=["quality", "speed"],
        progress_callback=progress_callback,
        benchmark_weights=benchmark_weights,
    )


def optimize_for_efficiency(
    query: str,
    n_trials: int = 25,
    output_dir: str = str(Path("data") / "optimization_results"),
    model_name: Optional[str] = None,
    provider: Optional[str] = None,
    search_tool: Optional[str] = None,
    progress_callback: Optional[Callable[[int, int, Dict], None]] = None,
    benchmark_weights: Optional[Dict[str, float]] = None,
) -> Tuple[Dict[str, Any], float]:
    """
    Optimize parameters with a focus on resource efficiency.

    Args:
        query: The research query to use for all experiments
        n_trials: Number of parameter combinations to try
        output_dir: Directory to save optimization results
        model_name: Name of the LLM model to use
        provider: LLM provider
        search_tool: Search engine to use
        progress_callback: Optional callback for progress updates
        benchmark_weights: Dictionary mapping benchmark types to weights
            (e.g., {"simpleqa": 0.6, "browsecomp": 0.4})
            If None, only SimpleQA is used with weight 1.0

    Returns:
        Tuple of (best_parameters, best_score)
    """
    # Balance of quality, speed and resource usage. Note that _run_experiment
    # currently combines only the quality and speed scores, so after
    # normalization the "resource" weight effectively scales the other two
    # down rather than adding a third term.
    metric_weights = {"quality": 0.4, "speed": 0.3, "resource": 0.3}

    return optimize_parameters(
        query=query,
        output_dir=output_dir,
        model_name=model_name,
        provider=provider,
        search_tool=search_tool,
        n_trials=n_trials,
        metric_weights=metric_weights,
        optimization_metrics=["quality", "speed", "resource"],
        progress_callback=progress_callback,
        benchmark_weights=benchmark_weights,
    )
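

# Manual smoke test (editor's addition): a minimal sketch of running the
# quality-focused preset directly. It relies on the caller's configured
# defaults for model, provider and search tool; the query text and trial
# count are arbitrary.
if __name__ == "__main__":
    best_params, best_score = optimize_for_quality(
        query="Explain the evidence that regular exercise improves sleep quality",
        n_trials=3,
    )
    logger.info(f"Best parameters: {best_params}")
    logger.info(f"Best score: {best_score:.4f}")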