Coverage for src / local_deep_research / web_search_engines / search_engines_config.py: 98%
101 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-25 01:07 +0000
1"""
2Configuration file for search engines.
3Loads search engine definitions from the user's configuration.
4"""
6import json
7from typing import Any, Dict, List, Optional
8from sqlalchemy.orm import Session
10from loguru import logger
12from ..config.thread_settings import get_setting_from_snapshot
13from ..utilities.db_utils import get_settings_manager
def _get_setting(
    key: str,
    default_value: Any = None,
    db_session: Optional[Session] = None,
    settings_snapshot: Optional[Dict[str, Any]] = None,
    username: Optional[str] = None,
) -> Any:
    """
    Resolve a setting value, preferring the thread-local snapshot.

    Resolution order:
      1. ``settings_snapshot`` (thread context), when provided.
      2. ``db_session`` via the settings manager, when provided.
      3. ``default_value`` as a last resort (logged as a warning).

    Args:
        key: The setting key.
        default_value: Value returned when the setting cannot be resolved.
        db_session: Database session for direct access.
        settings_snapshot: Settings snapshot for thread context.
        username: Username for backward compatibility.

    Returns:
        The resolved setting value, or ``default_value`` if not found.
    """
    # 1. The thread-context snapshot takes priority.
    if settings_snapshot:
        try:
            return get_setting_from_snapshot(
                key, default_value, settings_snapshot=settings_snapshot
            )
        except Exception as e:
            logger.debug(f"Could not get setting {key} from snapshot: {e}")

    # 2. Fall back to a direct database lookup.
    if db_session:
        try:
            manager = get_settings_manager(db_session, username)
            return manager.get_setting(key, default_value)
        except Exception as e:
            logger.debug(f"Could not get setting {key} from db_session: {e}")

    # 3. Neither source yielded a value; warn so misconfiguration is visible.
    logger.warning(
        f"Could not retrieve setting '{key}', returning default: {default_value}"
    )
    return default_value
60def _extract_per_engine_config(
61 raw_config: Dict[str, Any],
62) -> Dict[str, Dict[str, Any]]:
63 """
64 Converts the "flat" configuration loaded from the settings database into
65 individual settings dictionaries for each engine.
67 Args:
68 raw_config: The raw "flat" configuration.
70 Returns:
71 Configuration dictionaries indexed by engine name.
73 """
74 nested_config = {}
75 for key, value in raw_config.items():
76 if "." in key:
77 # This is a higher-level key.
78 top_level_key = key.split(".")[0]
79 lower_keys = ".".join(key.split(".")[1:])
80 nested_config.setdefault(top_level_key, {})[lower_keys] = value
81 else:
82 # This is a low-level key.
83 nested_config[key] = value
85 # Expand all the lower-level keys.
86 for key, value in nested_config.items():
87 if isinstance(value, dict):
88 # Expand the child keys.
89 nested_config[key] = _extract_per_engine_config(value)
91 return nested_config
def search_config(
    username: Optional[str] = None,
    db_session: Optional[Session] = None,
    settings_snapshot: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """
    Returns the search engine configuration loaded from the database or settings snapshot.

    Aggregates engine definitions from several sources into a single mapping
    of engine name -> engine configuration dict:
      * web engines from the "search.engine.web" setting,
      * the "auto" meta-engine (also aliased as "meta"),
      * LangChain retrievers registered in the retriever registry,
      * enabled local document collections ("search.engine.local"),
      * the Library RAG engine plus one engine per document collection,
        when the library engine is enabled.

    Args:
        username: Username for backward compatibility (deprecated)
        db_session: Database session for direct access (preferred for web routes)
        settings_snapshot: Settings snapshot for thread context (preferred for background threads)

    Returns:
        The search engine configuration loaded from the database or snapshot.
    """
    # Extract search engine definitions. Guard against a stored None value so
    # a missing/null setting behaves like an empty config (consistent with
    # the handling of "search.engine.local" below).
    config_data = (
        _get_setting(
            "search.engine.web",
            {},
            db_session=db_session,
            settings_snapshot=settings_snapshot,
            username=username,
        )
        or {}
    )

    search_engines = _extract_per_engine_config(config_data)

    # Normalize legacy absolute module paths stored in user databases to
    # the relative form required by the security whitelist.
    _ABSOLUTE_PREFIX = "local_deep_research.web_search_engines"
    for engine_data in search_engines.values():
        if isinstance(engine_data, dict):
            mp = engine_data.get("module_path")
            if isinstance(mp, str) and mp.startswith(_ABSOLUTE_PREFIX):
                engine_data["module_path"] = mp[len(_ABSOLUTE_PREFIX) :]

    # The "auto" meta-engine; same None guard as above so the "meta" alias
    # below never points at None.
    search_engines["auto"] = (
        _get_setting(
            "search.engine.auto",
            {},
            db_session=db_session,
            settings_snapshot=settings_snapshot,
            username=username,
        )
        or {}
    )

    # Add registered retrievers as available search engines. Imported here
    # (not at module top) — presumably to avoid a circular import; confirm
    # before hoisting.
    from .retriever_registry import retriever_registry

    for name in retriever_registry.list_registered():
        search_engines[name] = {
            "module_path": ".engines.search_engine_retriever",
            "class_name": "RetrieverSearchEngine",
            "requires_api_key": False,
            "requires_llm": False,
            "description": f"LangChain retriever: {name}",
            "strengths": [
                "Domain-specific knowledge",
                "No rate limits",
                "Fast retrieval",
            ],
            "weaknesses": ["Limited to indexed content"],
            "supports_full_search": True,
            "is_retriever": True,  # Mark as retriever for identification
        }

    logger.info(
        f"Loaded {len(search_engines)} search engines from configuration file"
    )
    logger.info(f"\n {', '.join(sorted(search_engines.keys()))} \n")

    # Add alias for 'auto' if it exists
    if "auto" in search_engines and "meta" not in search_engines:
        search_engines["meta"] = search_engines["auto"]

    # Register local document collections
    local_collections_data = (
        _get_setting(
            "search.engine.local",
            {},
            db_session=db_session,
            settings_snapshot=settings_snapshot,
            username=username,
        )
        or {}
    )
    local_collections_data = _extract_per_engine_config(local_collections_data)

    for collection, config in local_collections_data.items():
        if not config.get("enabled", True):
            # Search engine is not enabled. Ignore.
            logger.info(f"Ignoring disabled local collection '{collection}'.")
            continue

        if "paths" in config and isinstance(config["paths"], str):
            # This will be saved as a json array.
            try:
                config["paths"] = json.loads(config["paths"])
            except json.decoder.JSONDecodeError:
                logger.exception(
                    f"Path for local collection '{collection}' is not a valid JSON array: "
                    f"{config['paths']}"
                )
                config["paths"] = []

        # Create a new dictionary with required search engine fields
        engine_config = {
            "default_params": config,
            "requires_llm": True,
        }
        engine_config_prefix = f"search.engine.local.{collection}"
        engine_config["module_path"] = _get_setting(
            f"{engine_config_prefix}.module_path",
            ".engines.search_engine_local",
            db_session=db_session,
            settings_snapshot=settings_snapshot,
            username=username,
        )
        engine_config["class_name"] = _get_setting(
            f"{engine_config_prefix}.class_name",
            "LocalSearchEngine",
            db_session=db_session,
            settings_snapshot=settings_snapshot,
            username=username,
        )

        # Copy these specific fields to the top level if they exist
        for field in ["strengths", "weaknesses", "reliability", "description"]:
            if field in config:
                engine_config[field] = config[field]

        search_engines[collection] = engine_config

    logger.info("Registered local document collections as search engines")

    # Register Library RAG as a search engine
    library_enabled = _get_setting(
        "search.engine.library.enabled",
        True,
        db_session=db_session,
        settings_snapshot=settings_snapshot,
        username=username,
    )

    if library_enabled:
        search_engines["library"] = {
            "module_path": ".engines.search_engine_library",
            "class_name": "LibraryRAGSearchEngine",
            "requires_llm": True,
            "display_name": "Search All Collections",
            "default_params": {},
            "description": "Search across all your document collections using semantic search",
            "strengths": [
                "Searches all your curated collections of research papers and documents",
                "Uses semantic search for better relevance",
                "Returns documents you've already saved and reviewed",
            ],
            "weaknesses": [
                "Limited to documents already in your collections",
                "Requires documents to be indexed first",
            ],
            "reliability": "High - searches all your collections",
        }
        logger.info("Registered Library RAG as search engine")

    # Register document collections as individual search engines. Any
    # database failure here is logged and swallowed so a broken library
    # does not prevent the other engines from loading.
    if library_enabled:
        try:
            from ..database.models.library import Collection
            from ..database.session_context import get_user_db_session

            # Get username from settings_snapshot if available
            collection_username = (
                settings_snapshot.get("_username")
                if settings_snapshot
                else username
            )

            if collection_username:
                with get_user_db_session(collection_username) as session:
                    collections = session.query(Collection).all()

                    for collection in collections:
                        engine_id = f"collection_{collection.id}"
                        # Add suffix to distinguish from the all-collections search
                        display_name = f"{collection.name} (Collection)"
                        search_engines[engine_id] = {
                            "module_path": ".engines.search_engine_collection",
                            "class_name": "CollectionSearchEngine",
                            "requires_llm": True,
                            "is_local": True,
                            "display_name": display_name,
                            "default_params": {
                                "collection_id": collection.id,
                                "collection_name": collection.name,
                            },
                            "description": (
                                collection.description
                                if collection.description
                                else f"Search documents in {collection.name} collection only"
                            ),
                            "strengths": [
                                f"Searches only documents in {collection.name}",
                                "Focused semantic search within specific topic area",
                                "Returns documents from a curated collection",
                            ],
                            "weaknesses": [
                                "Limited to documents in this collection",
                                "Smaller result pool than full library search",
                            ],
                            "reliability": "High - searches a specific collection",
                        }

                    logger.info(
                        f"Registered {len(collections)} document collections as search engines"
                    )
            else:
                logger.debug(
                    "No username available for collection registration"
                )
        except Exception as e:
            logger.warning(f"Could not register document collections: {e}")

    return search_engines
def default_search_engine(
    username: Optional[str] = None,
    db_session: Optional[Session] = None,
    settings_snapshot: Optional[Dict[str, Any]] = None,
) -> str:
    """
    Look up the name of the configured default search engine.

    Args:
        username: Username for backward compatibility (deprecated)
        db_session: Database session for direct access (preferred for web routes)
        settings_snapshot: Settings snapshot for thread context (preferred for background threads)

    Returns:
        The configured default search engine ("wikipedia" when unset).
    """
    engine_name = _get_setting(
        "search.engine.DEFAULT_SEARCH_ENGINE",
        "wikipedia",
        db_session=db_session,
        settings_snapshot=settings_snapshot,
        username=username,
    )
    return engine_name
def local_search_engines(
    username: Optional[str] = None,
    db_session: Optional[Session] = None,
    settings_snapshot: Optional[Dict[str, Any]] = None,
) -> List[str]:
    """
    List the names of the enabled local document-collection search engines.

    The special "local_all" aggregate is always excluded, as is any
    collection whose configuration sets ``enabled`` to a falsy value.

    Args:
        username: Username for backward compatibility (deprecated)
        db_session: Database session for direct access (preferred for web routes)
        settings_snapshot: Settings snapshot for thread context (preferred for background threads)

    Returns:
        A list of the enabled local search engines.
    """
    raw_data = _get_setting(
        "search.engine.local",
        {},
        db_session=db_session,
        settings_snapshot=settings_snapshot,
        username=username,
    )
    # A stored null setting is treated like an empty configuration.
    collections = _extract_per_engine_config(raw_data or {})

    # The `local_all` aggregate is not an individual collection.
    collections.pop("local_all", None)

    # Keep only collections that are enabled (default: enabled).
    # NOTE(review): assumes every value is a dict — a non-dict value would
    # raise AttributeError here; confirm upstream always stores dicts.
    enabled_collections = [
        name for name, cfg in collections.items() if cfg.get("enabled", True)
    ]
    logger.debug(f"Using local collections: {enabled_collections}")
    return enabled_collections