Coverage for src / local_deep_research / web_search_engines / search_engines_config.py: 99%
113 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
1"""
2Configuration file for search engines.
3Loads search engine definitions from the user's configuration.
4"""
6from typing import Any, Dict, Optional
7from sqlalchemy.orm import Session
9from loguru import logger
11from ..config.thread_settings import get_setting_from_snapshot
12from ..utilities.db_utils import get_settings_manager
def _get_setting(
    key: str,
    default_value: Any = None,
    db_session: Optional[Session] = None,
    settings_snapshot: Optional[Dict[str, Any]] = None,
    username: Optional[str] = None,
) -> Any:
    """
    Look up a single setting, preferring the snapshot over the database.

    The sources are consulted in order and the first one that answers
    without raising wins. A source that is missing or falsy (for example an
    empty snapshot) is skipped entirely.

    Args:
        key: The setting key
        default_value: Value handed to each source as its default, and
            returned when no source produced an answer
        db_session: Database session for direct access
        settings_snapshot: Settings snapshot for thread context
        username: Username for backward compatibility; forwarded to
            ``get_settings_manager`` together with ``db_session``

    Returns:
        The value reported by the first working source, otherwise
        ``default_value``.
    """
    # 1) Thread context: the snapshot, when one was supplied.
    if settings_snapshot:
        try:
            from_snapshot = get_setting_from_snapshot(
                key, default_value, settings_snapshot=settings_snapshot
            )
        except Exception as exc:
            # Best effort: note the failure and move on to the next source.
            logger.debug(f"Could not get setting {key} from snapshot: {exc}")
        else:
            return from_snapshot

    # 2) Direct database access, when a session was supplied.
    if db_session:
        try:
            from_db = get_settings_manager(db_session, username).get_setting(
                key, default_value
            )
        except Exception as exc:
            logger.debug(f"Could not get setting {key} from db_session: {exc}")
        else:
            return from_db

    # 3) Nothing answered: make the fallback visible in the logs.
    logger.warning(
        f"Could not retrieve setting '{key}', returning default: {default_value}"
    )
    return default_value
59def _extract_per_engine_config(
60 raw_config: Dict[str, Any],
61) -> Dict[str, Dict[str, Any]]:
62 """
63 Converts the "flat" configuration loaded from the settings database into
64 individual settings dictionaries for each engine.
66 Args:
67 raw_config: The raw "flat" configuration.
69 Returns:
70 Configuration dictionaries indexed by engine name.
72 """
73 nested_config: dict[str, Any] = {}
74 for key, value in raw_config.items():
75 if "." in key:
76 # This is a higher-level key.
77 top_level_key = key.split(".")[0]
78 lower_keys = ".".join(key.split(".")[1:])
79 nested_config.setdefault(top_level_key, {})[lower_keys] = value
80 else:
81 # This is a low-level key.
82 nested_config[key] = value
84 # Expand all the lower-level keys.
85 for key, value in nested_config.items():
86 if isinstance(value, dict):
87 # Expand the child keys.
88 nested_config[key] = _extract_per_engine_config(value)
90 return nested_config
def _apply_engine_registry(search_engines: Dict[str, Any]) -> None:
    """Overwrite module/class fields of configured engines from ENGINE_REGISTRY."""
    # The registry is the single source of truth for which Python module
    # implements each engine — these values are never read from the
    # settings DB.
    from .engine_registry import ENGINE_REGISTRY

    for name, entry in ENGINE_REGISTRY.items():
        if name not in search_engines:
            continue
        engine = search_engines[name]
        engine["module_path"] = entry.module_path
        engine["class_name"] = entry.class_name
        if entry.full_search_module:
            engine["full_search_module"] = entry.full_search_module
            engine["full_search_class"] = entry.full_search_class


def _register_retrievers(search_engines: Dict[str, Any]) -> None:
    """Add every retriever listed by the retriever registry as an engine."""
    from .retriever_registry import retriever_registry

    for name in retriever_registry.list_registered():
        search_engines[name] = {
            "module_path": ".engines.search_engine_retriever",
            "class_name": "RetrieverSearchEngine",
            "requires_api_key": False,
            "requires_llm": False,
            "description": f"LangChain retriever: {name}",
            "strengths": [
                "Domain-specific knowledge",
                "No rate limits",
                "Fast retrieval",
            ],
            "weaknesses": ["Limited to indexed content"],
            "supports_full_search": True,
            "is_retriever": True,  # Mark as retriever for identification
        }


def _register_collections(
    search_engines: Dict[str, Any],
    username: Optional[str],
    settings_snapshot: Optional[Dict[str, Any]],
) -> None:
    """Best-effort: add one ``collection_<id>`` engine per Collection row."""
    try:
        from ..database.models.library import Collection
        from ..database.session_context import get_user_db_session

        # Prefer the username carried by the snapshot, but fall back to the
        # explicit argument when the snapshot has no "_username" entry.
        # Previously a snapshot without that key discarded ``username``.
        snapshot_username = (
            settings_snapshot.get("_username") if settings_snapshot else None
        )
        collection_username = snapshot_username or username

        if not collection_username:
            logger.debug("No username available for collection registration")
            return

        with get_user_db_session(collection_username) as session:
            collections = session.query(Collection).all()

            # Read the ORM attributes while the session is still open.
            for collection in collections:
                engine_id = f"collection_{collection.id}"
                # Add suffix to distinguish from the all-collections search
                display_name = f"{collection.name} (Collection)"
                search_engines[engine_id] = {
                    "module_path": ".engines.search_engine_collection",
                    "class_name": "CollectionSearchEngine",
                    "requires_llm": True,
                    "is_local": True,
                    "display_name": display_name,
                    "default_params": {
                        "collection_id": collection.id,
                        "collection_name": collection.name,
                    },
                    "description": (
                        collection.description
                        or f"Search documents in {collection.name} collection only"
                    ),
                    "strengths": [
                        f"Searches only documents in {collection.name}",
                        "Focused semantic search within specific topic area",
                        "Returns documents from a curated collection",
                    ],
                    "weaknesses": [
                        "Limited to documents in this collection",
                        "Smaller result pool than full library search",
                    ],
                    "reliability": "High - searches a specific collection",
                }

            logger.info(
                f"Registered {len(collections)} document collections as search engines"
            )
    except Exception as e:
        # Registration is optional, so never propagate; keep the reason at
        # debug level (same style as _get_setting) so failures can be traced.
        logger.debug(f"Collection registration failed: {e}")
        logger.warning("Could not register document collections")


def search_config(
    username: Optional[str] = None,
    db_session: Optional[Session] = None,
    settings_snapshot: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """
    Returns the search engine configuration loaded from the database or settings snapshot.

    The result is assembled in this order:

    1. engines defined under the ``search.engine.web`` setting, expanded
       from flat dotted keys into one dict per engine;
    2. the ``auto`` entry from ``search.engine.auto`` (aliased as ``meta``
       when no ``meta`` engine exists; both names then share one dict);
    3. module/class names injected from the hardcoded engine registry;
    4. registered retrievers;
    5. when ``search.engine.library.enabled`` is truthy (default ``True``),
       the ``library`` engine plus one ``collection_<id>`` engine per
       document collection of the resolved user.

    Args:
        username: Username for backward compatibility (deprecated). Also used
            to look up collections when the snapshot carries no ``_username``.
        db_session: Database session for direct access (preferred for web routes)
        settings_snapshot: Settings snapshot for thread context (preferred for background threads)

    Returns:
        The search engine configuration loaded from the database or snapshot.
    """
    # Extract search engine definitions
    config_data = _get_setting(
        "search.engine.web",
        {},
        db_session=db_session,
        settings_snapshot=settings_snapshot,
        username=username,
    )
    search_engines = _extract_per_engine_config(config_data)

    search_engines["auto"] = _get_setting(
        "search.engine.auto",
        {},
        db_session=db_session,
        settings_snapshot=settings_snapshot,
        username=username,
    )

    _apply_engine_registry(search_engines)

    # Add registered retrievers as available search engines
    _register_retrievers(search_engines)

    # Note: logged before the alias, library and collection engines are added.
    logger.info(
        f"Loaded {len(search_engines)} search engines from configuration file"
    )
    logger.info(f"\n {', '.join(sorted(search_engines.keys()))} \n")

    # Add alias for 'auto' if it exists
    if "auto" in search_engines and "meta" not in search_engines:
        search_engines["meta"] = search_engines["auto"]

    # Register Library RAG as a search engine
    library_enabled = _get_setting(
        "search.engine.library.enabled",
        True,
        db_session=db_session,
        settings_snapshot=settings_snapshot,
        username=username,
    )

    if library_enabled:
        search_engines["library"] = {
            "module_path": ".engines.search_engine_library",
            "class_name": "LibraryRAGSearchEngine",
            "requires_llm": True,
            "display_name": "Search All Collections",
            "default_params": {},
            "description": "Search across all your document collections using semantic search",
            "strengths": [
                "Searches all your curated collections of research papers and documents",
                "Uses semantic search for better relevance",
                "Returns documents you've already saved and reviewed",
            ],
            "weaknesses": [
                "Limited to documents already in your collections",
                "Requires documents to be indexed first",
            ],
            "reliability": "High - searches all your collections",
        }
        logger.info("Registered Library RAG as search engine")

        # Register document collections as individual search engines
        _register_collections(search_engines, username, settings_snapshot)

    return search_engines
def get_available_engines(
    settings_snapshot: Optional[Dict[str, Any]] = None,
    use_api_key_services: bool = True,
    exclude_engines: Optional[set] = None,
) -> Dict[str, Any]:
    """
    Return search engines that are actually usable: enabled for auto-search
    and with valid API keys when required.

    This is the single shared filter used by MetaSearchEngine,
    ParallelSearchEngine, and MCPSearchStrategy so they all agree on which
    engines are available.

    Args:
        settings_snapshot: Thread-safe settings snapshot. When missing or
            empty, a warning is logged and an empty dict is returned.
        use_api_key_services: If False, engines that require an API key are
            excluded even when the key is present.
        exclude_engines: Additional engine names to skip (e.g. the caller's
            own name). A set is expected, but any iterable of names is
            accepted, and a single string is treated as one name.

    Returns:
        Dict of engine_name → config for engines that passed all checks.
    """
    if not settings_snapshot:
        logger.warning(
            "get_available_engines called without settings_snapshot, "
            "returning empty dict"
        )
        return {}

    all_engines = search_config(settings_snapshot=settings_snapshot)

    # These names are always skipped, whatever the caller passes.
    excluded = {"meta", "auto", "parallel"}
    if exclude_engines:
        if isinstance(exclude_engines, str):
            # A lone name must not be split into characters.
            excluded.add(exclude_engines)
        else:
            # update() takes any iterable; ``|=`` raises TypeError for
            # lists and tuples.
            excluded.update(exclude_engines)

    available: Dict[str, Any] = {}

    for name, config in all_engines.items():
        if name in excluded:
            continue

        # Check use_in_auto_search setting (default False)
        auto_search_key = f"search.engine.web.{name}.use_in_auto_search"
        use_in_auto = get_setting_from_snapshot(
            auto_search_key, False, settings_snapshot=settings_snapshot
        )
        if not use_in_auto:
            continue

        requires_key = config.get("requires_api_key", False)

        # Honour the use_api_key_services flag
        if requires_key and not use_api_key_services:
            continue

        # Validate the API key is actually present
        if requires_key:
            api_key = _resolve_api_key(name, config, settings_snapshot)
            if not api_key:
                logger.debug(
                    f"Skipping {name} — requires API key but none configured"
                )
                continue

        available[name] = config

    return available
334def _resolve_api_key(
335 engine_name: str,
336 engine_config: Dict[str, Any],
337 settings_snapshot: Dict[str, Any],
338) -> Optional[str]:
339 """
340 Try to find a valid API key for *engine_name*.
342 Resolution order (mirrors ``create_search_engine``):
343 1. ``search.engine.web.<name>.api_key`` in the snapshot
344 2. ``api_key`` inside the engine config dict
346 Returns the key string or None.
347 """
348 api_key = None
349 api_key_path = f"search.engine.web.{engine_name}.api_key"
351 api_key_setting = settings_snapshot.get(api_key_path)
352 if api_key_setting:
353 api_key = (
354 api_key_setting.get("value")
355 if isinstance(api_key_setting, dict)
356 else api_key_setting
357 )
359 if not api_key:
360 api_key = engine_config.get("api_key")
362 if not api_key:
363 return None
365 # Reject common placeholder values
366 api_key_str = str(api_key).strip()
367 if (
368 not api_key_str
369 or api_key_str in ("None", "PLACEHOLDER", "YOUR_API_KEY_HERE", "null")
370 or api_key_str.endswith("_API_KEY")
371 or api_key_str.startswith("YOUR_")
372 ):
373 return None
375 return api_key_str
def default_search_engine(
    username: Optional[str] = None,
    db_session: Optional[Session] = None,
    settings_snapshot: Optional[Dict[str, Any]] = None,
) -> str:
    """
    Return the name of the configured default search engine.

    Reads ``search.engine.DEFAULT_SEARCH_ENGINE`` through ``_get_setting``
    with ``"wikipedia"`` as the default, and coerces the result to ``str``.

    Args:
        username: Username for backward compatibility (deprecated)
        db_session: Database session for direct access (preferred for web routes)
        settings_snapshot: Settings snapshot for thread context (preferred for background threads)

    Returns:
        The configured default search engine.
    """
    configured = _get_setting(
        "search.engine.DEFAULT_SEARCH_ENGINE",
        "wikipedia",
        db_session=db_session,
        settings_snapshot=settings_snapshot,
        username=username,
    )
    return str(configured)