Coverage for src / local_deep_research / web_search_engines / search_engines_config.py: 79%
95 statements
« prev ^ index » next coverage.py v7.12.0, created at 2026-01-11 00:51 +0000
« prev ^ index » next coverage.py v7.12.0, created at 2026-01-11 00:51 +0000
1"""
2Configuration file for search engines.
3Loads search engine definitions from the user's configuration.
4"""
6import json
7from typing import Any, Dict, List, Optional
8from sqlalchemy.orm import Session
10from loguru import logger
12from ..config.thread_settings import get_setting_from_snapshot
13from ..utilities.db_utils import get_settings_manager
def _get_setting(
    key: str,
    default_value: Any = None,
    db_session: Optional[Session] = None,
    settings_snapshot: Optional[Dict[str, Any]] = None,
    username: Optional[str] = None,
) -> Any:
    """
    Look up a single setting, preferring the snapshot over the database.

    Sources are tried in a fixed order: the settings snapshot (when one is
    supplied and non-empty), then the database session (when one is
    supplied). The first source that answers without raising wins. A source
    that raises is logged at debug level and skipped.

    Args:
        key: The setting key
        default_value: Value handed to each source as its fallback, and
            returned as-is when no source produced an answer
        db_session: Database session for direct access
        settings_snapshot: Settings snapshot for thread context
        username: Forwarded to ``get_settings_manager`` together with
            ``db_session``

    Returns:
        The setting value, or default_value if no source could provide one
    """
    # Thread context: a supplied snapshot takes priority over the database.
    if settings_snapshot:
        try:
            snapshot_value = get_setting_from_snapshot(
                key, default_value, settings_snapshot=settings_snapshot
            )
        except Exception as snapshot_error:
            logger.debug(
                f"Could not get setting {key} from snapshot: {snapshot_error}"
            )
        else:
            return snapshot_value

    # Direct database access through a settings manager.
    if db_session:
        try:
            manager = get_settings_manager(db_session, username)
            db_value = manager.get_setting(key, default_value)
        except Exception as db_error:
            logger.debug(
                f"Could not get setting {key} from db_session: {db_error}"
            )
        else:
            return db_value

    # Neither source was usable (absent or raised): report and fall back.
    logger.warning(
        f"Could not retrieve setting '{key}', returning default: {default_value}"
    )
    return default_value
60def _extract_per_engine_config(
61 raw_config: Dict[str, Any],
62) -> Dict[str, Dict[str, Any]]:
63 """
64 Converts the "flat" configuration loaded from the settings database into
65 individual settings dictionaries for each engine.
67 Args:
68 raw_config: The raw "flat" configuration.
70 Returns:
71 Configuration dictionaries indexed by engine name.
73 """
74 nested_config = {}
75 for key, value in raw_config.items():
76 if "." in key:
77 # This is a higher-level key.
78 top_level_key = key.split(".")[0]
79 lower_keys = ".".join(key.split(".")[1:])
80 nested_config.setdefault(top_level_key, {})[lower_keys] = value
81 else:
82 # This is a low-level key.
83 nested_config[key] = value
85 # Expand all the lower-level keys.
86 for key, value in nested_config.items():
87 if isinstance(value, dict):
88 # Expand the child keys.
89 nested_config[key] = _extract_per_engine_config(value)
91 return nested_config
def search_config(
    username: Optional[str] = None,
    db_session: Optional[Session] = None,
    settings_snapshot: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """
    Returns the search engine configuration loaded from the database or settings snapshot.

    Engines are collected in this order; a later source overwrites an earlier
    entry that uses the same name:

    1. Web engines from the ``search.engine.web`` settings.
    2. The ``auto`` engine from ``search.engine.auto`` (also aliased as
       ``meta`` unless a ``meta`` engine already exists).
    3. Every retriever listed by the retriever registry.
    4. Enabled local document collections from ``search.engine.local``.
    5. The ``library`` engine plus one ``collection_<id>`` engine per stored
       collection, when ``search.engine.library.enabled`` is truthy.

    Args:
        username: Username for backward compatibility (deprecated). Still used
            to open the user's database for collection registration when the
            snapshot carries no ``_username`` entry.
        db_session: Database session for direct access (preferred for web routes)
        settings_snapshot: Settings snapshot for thread context (preferred for background threads)

    Returns:
        The search engine configuration loaded from the database or snapshot.
    """

    def _lookup(key: str, default: Any) -> Any:
        # Every lookup in this function uses the same three sources.
        return _get_setting(
            key,
            default,
            db_session=db_session,
            settings_snapshot=settings_snapshot,
            username=username,
        )

    # Extract search engine definitions
    search_engines = _extract_per_engine_config(
        _lookup("search.engine.web", {})
    )
    search_engines["auto"] = _lookup("search.engine.auto", {})

    # Add registered retrievers as available search engines.
    # Imported at call time, as in the original code.
    from .retriever_registry import retriever_registry

    for name in retriever_registry.list_registered():
        search_engines[name] = {
            "module_path": ".engines.search_engine_retriever",
            "class_name": "RetrieverSearchEngine",
            "requires_api_key": False,
            "requires_llm": False,
            "description": f"LangChain retriever: {name}",
            "strengths": [
                "Domain-specific knowledge",
                "No rate limits",
                "Fast retrieval",
            ],
            "weaknesses": ["Limited to indexed content"],
            "supports_full_search": True,
            "is_retriever": True,  # Mark as retriever for identification
        }

    logger.info(
        f"Loaded {len(search_engines)} search engines from configuration file"
    )
    logger.info(f"\n {', '.join(sorted(search_engines.keys()))} \n")

    # Add alias for 'auto' if it exists
    if "auto" in search_engines and "meta" not in search_engines:
        search_engines["meta"] = search_engines["auto"]

    # Register local document collections
    local_collections_data = _extract_per_engine_config(
        _lookup("search.engine.local", {})
    )

    for collection_name, config in local_collections_data.items():
        if not config.get("enabled", True):
            # Search engine is not enabled. Ignore.
            logger.info(
                f"Ignoring disabled local collection '{collection_name}'."
            )
            continue

        if "paths" in config and isinstance(config["paths"], str):
            # This will be saved as a json array.
            try:
                config["paths"] = json.loads(config["paths"])
            except json.decoder.JSONDecodeError:
                logger.exception(
                    f"Path for local collection '{collection_name}' is not a valid JSON array: "
                    f"{config['paths']}"
                )
                config["paths"] = []

        # Create a new dictionary with required search engine fields
        engine_config_prefix = f"search.engine.local.{collection_name}"
        engine_config = {
            "default_params": config,
            "requires_llm": True,
            "module_path": _lookup(
                f"{engine_config_prefix}.module_path",
                "local_deep_research.web_search_engines.engines.search_engine_local",
            ),
            "class_name": _lookup(
                f"{engine_config_prefix}.class_name",
                "LocalSearchEngine",
            ),
        }

        # Copy these specific fields to the top level if they exist
        for field in ["strengths", "weaknesses", "reliability", "description"]:
            if field in config:
                engine_config[field] = config[field]

        search_engines[collection_name] = engine_config

    logger.info("Registered local document collections as search engines")

    # Everything below depends on the library feature being switched on.
    if not _lookup("search.engine.library.enabled", True):
        return search_engines

    # Register Library RAG as a search engine
    search_engines["library"] = {
        "module_path": "local_deep_research.web_search_engines.engines.search_engine_library",
        "class_name": "LibraryRAGSearchEngine",
        "requires_llm": True,
        "display_name": "Search All Collections",
        "default_params": {},
        "description": "Search across all your document collections using semantic search",
        "strengths": [
            "Searches all your curated collections of research papers and documents",
            "Uses semantic search for better relevance",
            "Returns documents you've already saved and reviewed",
        ],
        "weaknesses": [
            "Limited to documents already in your collections",
            "Requires documents to be indexed first",
        ],
        "reliability": "High - searches all your collections",
    }
    logger.info("Registered Library RAG as search engine")

    # Register document collections as individual search engines.
    # Best effort: any failure here is logged and the engines gathered so far
    # are still returned.
    try:
        from ..database.models.library import Collection
        from ..database.session_context import get_user_db_session

        # Prefer the username stored in the snapshot, but fall back to the
        # explicit argument when the snapshot has no "_username" entry, so a
        # snapshot without that key no longer discards a supplied username.
        snapshot_username = (
            settings_snapshot.get("_username") if settings_snapshot else None
        )
        collection_username = snapshot_username or username

        if collection_username:
            with get_user_db_session(collection_username) as session:
                collections = session.query(Collection).all()

                for db_collection in collections:
                    engine_id = f"collection_{db_collection.id}"
                    # Add suffix to distinguish from the all-collections search
                    display_name = f"{db_collection.name} (Collection)"
                    search_engines[engine_id] = {
                        "module_path": "local_deep_research.web_search_engines.engines.search_engine_collection",
                        "class_name": "CollectionSearchEngine",
                        "requires_llm": True,
                        "is_local": True,
                        "display_name": display_name,
                        "default_params": {
                            "collection_id": db_collection.id,
                            "collection_name": db_collection.name,
                        },
                        "description": (
                            db_collection.description
                            or f"Search documents in {db_collection.name} collection only"
                        ),
                        "strengths": [
                            f"Searches only documents in {db_collection.name}",
                            "Focused semantic search within specific topic area",
                            "Returns documents from a curated collection",
                        ],
                        "weaknesses": [
                            "Limited to documents in this collection",
                            "Smaller result pool than full library search",
                        ],
                        "reliability": "High - searches a specific collection",
                    }

                logger.info(
                    f"Registered {len(collections)} document collections as search engines"
                )
        else:
            logger.debug("No username available for collection registration")
    except Exception as e:
        logger.warning(f"Could not register document collections: {e}")

    return search_engines
def default_search_engine(
    username: Optional[str] = None,
    db_session: Optional[Session] = None,
    settings_snapshot: Optional[Dict[str, Any]] = None,
) -> str:
    """
    Look up which search engine is configured as the default.

    Reads the ``search.engine.DEFAULT_SEARCH_ENGINE`` setting and falls back
    to ``"wikipedia"`` when it cannot be retrieved.

    Args:
        username: Username for backward compatibility (deprecated)
        db_session: Database session for direct access (preferred for web routes)
        settings_snapshot: Settings snapshot for thread context (preferred for background threads)

    Returns:
        The configured default search engine.
    """
    configured_engine = _get_setting(
        "search.engine.DEFAULT_SEARCH_ENGINE",
        "wikipedia",
        username=username,
        settings_snapshot=settings_snapshot,
        db_session=db_session,
    )
    return configured_engine
def local_search_engines(
    username: Optional[str] = None,
    db_session: Optional[Session] = None,
    settings_snapshot: Optional[Dict[str, Any]] = None,
) -> List[str]:
    """
    List the names of the local collections that are currently enabled.

    Collections come from the ``search.engine.local`` settings. The
    ``local_all`` entry is always left out, and a collection counts as
    enabled unless its ``enabled`` value is falsy.

    Args:
        username: Username for backward compatibility (deprecated)
        db_session: Database session for direct access (preferred for web routes)
        settings_snapshot: Settings snapshot for thread context (preferred for background threads)

    Returns:
        A list of the enabled local search engines.
    """
    flat_settings = _get_setting(
        "search.engine.local",
        {},
        db_session=db_session,
        settings_snapshot=settings_snapshot,
        username=username,
    )
    per_collection = _extract_per_engine_config(flat_settings)

    # Skip `local_all`, then keep only collections that are not disabled.
    # The name test comes first so `local_all`'s config is never inspected.
    enabled_collections = [
        name
        for name, collection_config in per_collection.items()
        if name != "local_all" and collection_config.get("enabled", True)
    ]

    logger.debug(f"Using local collections: {enabled_collections}")
    return enabled_collections