Coverage for src / local_deep_research / web_search_engines / search_engines_config.py: 99%

113 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1""" 

2Configuration file for search engines. 

3Loads search engine definitions from the user's configuration. 

4""" 

5 

6from typing import Any, Dict, Optional 

7from sqlalchemy.orm import Session 

8 

9from loguru import logger 

10 

11from ..config.thread_settings import get_setting_from_snapshot 

12from ..utilities.db_utils import get_settings_manager 

13 

14 

def _get_setting(
    key: str,
    default_value: Any = None,
    db_session: Optional[Session] = None,
    settings_snapshot: Optional[Dict[str, Any]] = None,
    username: Optional[str] = None,
) -> Any:
    """
    Look up a single setting, preferring the snapshot over the database.

    Sources are consulted in order; the first one that answers without
    raising wins. A source that raises is logged at debug level and the
    next source is tried.

    Args:
        key: The setting key
        default_value: Value handed to each source as its default, and
            returned when no source could be used
        db_session: Database session for direct access
        settings_snapshot: Settings snapshot for thread context
        username: Forwarded to ``get_settings_manager`` together with
            ``db_session`` (kept for backward compatibility)

    Returns:
        The setting value or default_value if not found
    """
    # 1) Snapshot (thread context). A falsy snapshot (None or empty) is
    #    skipped entirely.
    if settings_snapshot:
        try:
            snapshot_value = get_setting_from_snapshot(
                key, default_value, settings_snapshot=settings_snapshot
            )
        except Exception as e:
            logger.debug(f"Could not get setting {key} from snapshot: {e}")
        else:
            return snapshot_value

    # 2) Database session, when one was supplied.
    if db_session:
        try:
            manager = get_settings_manager(db_session, username)
            db_value = manager.get_setting(key, default_value)
        except Exception as e:
            logger.debug(f"Could not get setting {key} from db_session: {e}")
        else:
            return db_value

    # 3) Nothing usable: fall back to the caller's default.
    logger.warning(
        f"Could not retrieve setting '{key}', returning default: {default_value}"
    )
    return default_value

57 

58 

59def _extract_per_engine_config( 

60 raw_config: Dict[str, Any], 

61) -> Dict[str, Dict[str, Any]]: 

62 """ 

63 Converts the "flat" configuration loaded from the settings database into 

64 individual settings dictionaries for each engine. 

65 

66 Args: 

67 raw_config: The raw "flat" configuration. 

68 

69 Returns: 

70 Configuration dictionaries indexed by engine name. 

71 

72 """ 

73 nested_config: dict[str, Any] = {} 

74 for key, value in raw_config.items(): 

75 if "." in key: 

76 # This is a higher-level key. 

77 top_level_key = key.split(".")[0] 

78 lower_keys = ".".join(key.split(".")[1:]) 

79 nested_config.setdefault(top_level_key, {})[lower_keys] = value 

80 else: 

81 # This is a low-level key. 

82 nested_config[key] = value 

83 

84 # Expand all the lower-level keys. 

85 for key, value in nested_config.items(): 

86 if isinstance(value, dict): 

87 # Expand the child keys. 

88 nested_config[key] = _extract_per_engine_config(value) 

89 

90 return nested_config 

91 

92 

def search_config(
    username: Optional[str] = None,
    db_session: Optional[Session] = None,
    settings_snapshot: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """
    Returns the search engine configuration loaded from the database or settings snapshot.

    The result is assembled in this order:
      1. engines found under the ``search.engine.web`` setting,
      2. the ``auto`` engine from ``search.engine.auto``,
      3. module/class information injected from ``ENGINE_REGISTRY``,
      4. one entry per registered retriever,
      5. a ``meta`` alias for ``auto`` (only if ``meta`` is not defined),
      6. when ``search.engine.library.enabled`` is truthy: the ``library``
         engine plus one ``collection_<id>`` engine per stored collection.

    Collection registration is best-effort: any failure is logged and the
    engines gathered so far are still returned.

    Args:
        username: Username for backward compatibility (deprecated). Also
            used to look up collections when the snapshot carries no
            ``_username`` entry.
        db_session: Database session for direct access (preferred for web routes)
        settings_snapshot: Settings snapshot for thread context (preferred for background threads)

    Returns:
        The search engine configuration loaded from the database or snapshot.
    """
    # Extract search engine definitions
    config_data = _get_setting(
        "search.engine.web",
        {},
        db_session=db_session,
        settings_snapshot=settings_snapshot,
        username=username,
    )
    if not isinstance(config_data, dict):
        # A non-dict value cannot be expanded into per-engine sections;
        # continue with no configured web engines instead of crashing.
        logger.warning(
            "Setting 'search.engine.web' is not a mapping, ignoring it"
        )
        config_data = {}

    search_engines = _extract_per_engine_config(config_data)

    search_engines["auto"] = _get_setting(
        "search.engine.auto",
        {},
        db_session=db_session,
        settings_snapshot=settings_snapshot,
        username=username,
    )

    # Inject module/class from the hardcoded engine registry.
    # This is the single source of truth for which Python module implements
    # each engine — these values are never read from the settings DB.
    from .engine_registry import ENGINE_REGISTRY

    for name, entry in ENGINE_REGISTRY.items():
        engine_config = search_engines.get(name)
        if not isinstance(engine_config, dict):
            # Engine not configured, or configured as something that
            # cannot hold per-engine keys.
            continue
        engine_config["module_path"] = entry.module_path
        engine_config["class_name"] = entry.class_name
        if entry.full_search_module:
            engine_config["full_search_module"] = entry.full_search_module
            engine_config["full_search_class"] = entry.full_search_class

    # Add registered retrievers as available search engines
    from .retriever_registry import retriever_registry

    for name in retriever_registry.list_registered():
        search_engines[name] = _retriever_engine_config(name)

    logger.info(
        f"Loaded {len(search_engines)} search engines from configuration file"
    )
    logger.info(f"\n  {', '.join(sorted(search_engines.keys()))} \n")

    # Add alias for 'auto' if it exists
    if "auto" in search_engines and "meta" not in search_engines:
        search_engines["meta"] = search_engines["auto"]

    # Register Library RAG as a search engine
    library_enabled = _get_setting(
        "search.engine.library.enabled",
        True,
        db_session=db_session,
        settings_snapshot=settings_snapshot,
        username=username,
    )
    if not library_enabled:
        return search_engines

    search_engines["library"] = _library_engine_config()
    logger.info("Registered Library RAG as search engine")

    # Register document collections as individual search engines
    try:
        from ..database.models.library import Collection
        from ..database.session_context import get_user_db_session

        # Prefer the username carried by the snapshot, but fall back to the
        # explicit argument when the snapshot has no "_username" entry.
        collection_username = (settings_snapshot or {}).get(
            "_username"
        ) or username

        if collection_username:
            with get_user_db_session(collection_username) as session:
                collections = session.query(Collection).all()

                # Read the ORM attributes while the session is still open.
                for collection in collections:
                    search_engines[f"collection_{collection.id}"] = (
                        _collection_engine_config(
                            collection.id,
                            collection.name,
                            collection.description,
                        )
                    )

            logger.info(
                f"Registered {len(collections)} document collections as search engines"
            )
        else:
            logger.debug("No username available for collection registration")
    except Exception as e:
        logger.warning("Could not register document collections")
        # Keep the cause available for troubleshooting without raising.
        logger.debug(f"Document collection registration failed: {e}")

    return search_engines


def _retriever_engine_config(name: str) -> Dict[str, Any]:
    """Build the engine entry for the registered retriever called *name*."""
    return {
        "module_path": ".engines.search_engine_retriever",
        "class_name": "RetrieverSearchEngine",
        "requires_api_key": False,
        "requires_llm": False,
        "description": f"LangChain retriever: {name}",
        "strengths": [
            "Domain-specific knowledge",
            "No rate limits",
            "Fast retrieval",
        ],
        "weaknesses": ["Limited to indexed content"],
        "supports_full_search": True,
        "is_retriever": True,  # Mark as retriever for identification
    }


def _library_engine_config() -> Dict[str, Any]:
    """Build the engine entry for the all-collections Library RAG search."""
    return {
        "module_path": ".engines.search_engine_library",
        "class_name": "LibraryRAGSearchEngine",
        "requires_llm": True,
        "display_name": "Search All Collections",
        "default_params": {},
        "description": "Search across all your document collections using semantic search",
        "strengths": [
            "Searches all your curated collections of research papers and documents",
            "Uses semantic search for better relevance",
            "Returns documents you've already saved and reviewed",
        ],
        "weaknesses": [
            "Limited to documents already in your collections",
            "Requires documents to be indexed first",
        ],
        "reliability": "High - searches all your collections",
    }


def _collection_engine_config(
    collection_id: Any,
    collection_name: Any,
    description: Any,
) -> Dict[str, Any]:
    """Build the engine entry that searches one document collection only."""
    return {
        "module_path": ".engines.search_engine_collection",
        "class_name": "CollectionSearchEngine",
        "requires_llm": True,
        "is_local": True,
        # Add suffix to distinguish from the all-collections search
        "display_name": f"{collection_name} (Collection)",
        "default_params": {
            "collection_id": collection_id,
            "collection_name": collection_name,
        },
        "description": (
            description
            if description
            else f"Search documents in {collection_name} collection only"
        ),
        "strengths": [
            f"Searches only documents in {collection_name}",
            "Focused semantic search within specific topic area",
            "Returns documents from a curated collection",
        ],
        "weaknesses": [
            "Limited to documents in this collection",
            "Smaller result pool than full library search",
        ],
        "reliability": "High - searches a specific collection",
    }

263 

264 

def get_available_engines(
    settings_snapshot: Optional[Dict[str, Any]] = None,
    use_api_key_services: bool = True,
    exclude_engines: Optional[set] = None,
) -> Dict[str, Any]:
    """
    Return search engines that are actually usable: enabled for auto-search
    and with valid API keys when required.

    This is the single shared filter used by MetaSearchEngine,
    ParallelSearchEngine, and MCPSearchStrategy so they all agree on which
    engines are available.

    Args:
        settings_snapshot: Thread-safe settings snapshot. When missing or
            empty, a warning is logged and an empty dict is returned.
        use_api_key_services: If False, engines that require an API key are
            excluded even when the key is present.
        exclude_engines: Additional engine names to skip (e.g. the caller's
            own name). Any iterable of names is accepted, as is a single
            name given as a string. ``meta``, ``auto`` and ``parallel`` are
            always skipped.

    Returns:
        Dict of engine_name → config for engines that passed all checks.
    """
    if not settings_snapshot:
        logger.warning(
            "get_available_engines called without settings_snapshot, "
            "returning empty dict"
        )
        return {}

    all_engines = search_config(settings_snapshot=settings_snapshot)

    excluded = {"meta", "auto", "parallel"}
    if exclude_engines:
        if isinstance(exclude_engines, str):
            # A bare string is one engine name, not an iterable of characters.
            excluded.add(exclude_engines)
        else:
            # update() accepts lists/tuples too, where ``|=`` would raise.
            excluded.update(exclude_engines)

    available: Dict[str, Any] = {}

    for name, config in all_engines.items():
        if name in excluded:
            continue

        # Check use_in_auto_search setting (default False)
        auto_search_key = f"search.engine.web.{name}.use_in_auto_search"
        use_in_auto = get_setting_from_snapshot(
            auto_search_key, False, settings_snapshot=settings_snapshot
        )
        if not use_in_auto:
            continue

        requires_key = config.get("requires_api_key", False)

        # Honour the use_api_key_services flag
        if requires_key and not use_api_key_services:
            continue

        # Validate the API key is actually present
        if requires_key:
            api_key = _resolve_api_key(name, config, settings_snapshot)
            if not api_key:
                logger.debug(
                    f"Skipping {name} — requires API key but none configured"
                )
                continue

        available[name] = config

    return available

332 

333 

334def _resolve_api_key( 

335 engine_name: str, 

336 engine_config: Dict[str, Any], 

337 settings_snapshot: Dict[str, Any], 

338) -> Optional[str]: 

339 """ 

340 Try to find a valid API key for *engine_name*. 

341 

342 Resolution order (mirrors ``create_search_engine``): 

343 1. ``search.engine.web.<name>.api_key`` in the snapshot 

344 2. ``api_key`` inside the engine config dict 

345 

346 Returns the key string or None. 

347 """ 

348 api_key = None 

349 api_key_path = f"search.engine.web.{engine_name}.api_key" 

350 

351 api_key_setting = settings_snapshot.get(api_key_path) 

352 if api_key_setting: 

353 api_key = ( 

354 api_key_setting.get("value") 

355 if isinstance(api_key_setting, dict) 

356 else api_key_setting 

357 ) 

358 

359 if not api_key: 

360 api_key = engine_config.get("api_key") 

361 

362 if not api_key: 

363 return None 

364 

365 # Reject common placeholder values 

366 api_key_str = str(api_key).strip() 

367 if ( 

368 not api_key_str 

369 or api_key_str in ("None", "PLACEHOLDER", "YOUR_API_KEY_HERE", "null") 

370 or api_key_str.endswith("_API_KEY") 

371 or api_key_str.startswith("YOUR_") 

372 ): 

373 return None 

374 

375 return api_key_str 

376 

377 

def default_search_engine(
    username: Optional[str] = None,
    db_session: Optional[Session] = None,
    settings_snapshot: Optional[Dict[str, Any]] = None,
) -> str:
    """
    Return the name of the configured default search engine.

    Reads the ``search.engine.DEFAULT_SEARCH_ENGINE`` setting and falls
    back to ``"wikipedia"`` when it cannot be retrieved. The value is
    always converted to ``str``.

    Args:
        username: Username for backward compatibility (deprecated)
        db_session: Database session for direct access (preferred for web routes)
        settings_snapshot: Settings snapshot for thread context (preferred for background threads)

    Returns:
        The configured default search engine.
    """
    configured_engine = _get_setting(
        "search.engine.DEFAULT_SEARCH_ENGINE",
        "wikipedia",
        db_session=db_session,
        settings_snapshot=settings_snapshot,
        username=username,
    )
    return str(configured_engine)