Coverage for src / local_deep_research / web_search_engines / search_engines_config.py: 98%

101 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-02-25 01:07 +0000

1""" 

2Configuration file for search engines. 

3Loads search engine definitions from the user's configuration. 

4""" 

5 

6import json 

7from typing import Any, Dict, List, Optional 

8from sqlalchemy.orm import Session 

9 

10from loguru import logger 

11 

12from ..config.thread_settings import get_setting_from_snapshot 

13from ..utilities.db_utils import get_settings_manager 

14 

15 

def _get_setting(
    key: str,
    default_value: Any = None,
    db_session: Optional[Session] = None,
    settings_snapshot: Optional[Dict[str, Any]] = None,
    username: Optional[str] = None,
) -> Any:
    """
    Look up a single setting, preferring the snapshot over the database.

    The sources are consulted in order; the first one that answers without
    raising wins. A source that raises is logged at debug level and the next
    source is tried.

    Args:
        key: The setting key
        default_value: Value handed to each source as its default, and
            returned as-is when no source could be used
        db_session: Database session for direct access
        settings_snapshot: Settings snapshot for thread context
        username: Forwarded to the settings manager together with
            ``db_session``

    Returns:
        The value reported by the first usable source, or ``default_value``
        when neither source was supplied or both raised.
    """
    # 1) Thread-context snapshot. An empty or missing snapshot is skipped.
    if settings_snapshot:
        try:
            snapshot_value = get_setting_from_snapshot(
                key, default_value, settings_snapshot=settings_snapshot
            )
        except Exception as e:
            logger.debug(f"Could not get setting {key} from snapshot: {e}")
        else:
            return snapshot_value

    # 2) Direct database access through the settings manager.
    if db_session:
        try:
            manager = get_settings_manager(db_session, username)
            db_value = manager.get_setting(key, default_value)
        except Exception as e:
            logger.debug(f"Could not get setting {key} from db_session: {e}")
        else:
            return db_value

    # 3) Nothing answered: report it and hand back the caller's default.
    logger.warning(
        f"Could not retrieve setting '{key}', returning default: {default_value}"
    )
    return default_value

58 

59 

60def _extract_per_engine_config( 

61 raw_config: Dict[str, Any], 

62) -> Dict[str, Dict[str, Any]]: 

63 """ 

64 Converts the "flat" configuration loaded from the settings database into 

65 individual settings dictionaries for each engine. 

66 

67 Args: 

68 raw_config: The raw "flat" configuration. 

69 

70 Returns: 

71 Configuration dictionaries indexed by engine name. 

72 

73 """ 

74 nested_config = {} 

75 for key, value in raw_config.items(): 

76 if "." in key: 

77 # This is a higher-level key. 

78 top_level_key = key.split(".")[0] 

79 lower_keys = ".".join(key.split(".")[1:]) 

80 nested_config.setdefault(top_level_key, {})[lower_keys] = value 

81 else: 

82 # This is a low-level key. 

83 nested_config[key] = value 

84 

85 # Expand all the lower-level keys. 

86 for key, value in nested_config.items(): 

87 if isinstance(value, dict): 

88 # Expand the child keys. 

89 nested_config[key] = _extract_per_engine_config(value) 

90 

91 return nested_config 

92 

93 

def search_config(
    username: Optional[str] = None,
    db_session: Optional[Session] = None,
    settings_snapshot: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """
    Returns the search engine configuration loaded from the database or settings snapshot.

    The returned mapping is assembled in this order (later steps overwrite
    earlier entries that share a name):

    1. Engines defined under ``search.engine.web``, with legacy absolute
       ``module_path`` values rewritten to the relative form.
    2. The ``auto`` entry from ``search.engine.auto``.
    3. One entry per retriever listed by the retriever registry.
    4. A ``meta`` alias pointing at the same object as ``auto`` (only when
       no ``meta`` entry exists yet).
    5. One entry per enabled local collection under ``search.engine.local``.
    6. When ``search.engine.library.enabled`` is truthy: the ``library``
       entry plus one ``collection_<id>`` entry per ``Collection`` row in
       the user's database.

    Args:
        username: Username for backward compatibility (deprecated). Also used
            to open the user's database when the snapshot carries no
            ``_username``.
        db_session: Database session for direct access (preferred for web routes)
        settings_snapshot: Settings snapshot for thread context (preferred for background threads)

    Returns:
        The search engine configuration loaded from the database or snapshot.
    """

    def _lookup(key: str, default: Any) -> Any:
        # Every lookup in this function reads from the same settings source.
        return _get_setting(
            key,
            default,
            db_session=db_session,
            settings_snapshot=settings_snapshot,
            username=username,
        )

    # Extract search engine definitions. `or {}` guards against a stored
    # empty/None value, matching how the local collections are loaded below.
    config_data = _lookup("search.engine.web", {}) or {}
    search_engines = _extract_per_engine_config(config_data)

    # Normalize legacy absolute module paths stored in user databases to
    # the relative form required by the security whitelist.
    _ABSOLUTE_PREFIX = "local_deep_research.web_search_engines"
    for engine_data in search_engines.values():
        if not isinstance(engine_data, dict):
            continue
        mp = engine_data.get("module_path")
        if isinstance(mp, str) and mp.startswith(_ABSOLUTE_PREFIX):
            engine_data["module_path"] = mp[len(_ABSOLUTE_PREFIX) :]

    search_engines["auto"] = _lookup("search.engine.auto", {})

    # Add registered retrievers as available search engines.
    # NOTE(review): imported at call time rather than module level,
    # presumably to avoid a circular import - confirm before hoisting.
    from .retriever_registry import retriever_registry

    for name in retriever_registry.list_registered():
        search_engines[name] = {
            "module_path": ".engines.search_engine_retriever",
            "class_name": "RetrieverSearchEngine",
            "requires_api_key": False,
            "requires_llm": False,
            "description": f"LangChain retriever: {name}",
            "strengths": [
                "Domain-specific knowledge",
                "No rate limits",
                "Fast retrieval",
            ],
            "weaknesses": ["Limited to indexed content"],
            "supports_full_search": True,
            "is_retriever": True,  # Mark as retriever for identification
        }

    logger.info(
        f"Loaded {len(search_engines)} search engines from configuration file"
    )
    logger.info(f"\n {', '.join(sorted(search_engines.keys()))} \n")

    # Add 'meta' as an alias for 'auto' unless a 'meta' engine already exists.
    # 'auto' is always present here because it was assigned above.
    if "meta" not in search_engines:
        search_engines["meta"] = search_engines["auto"]

    # Register local document collections
    local_collections_data = _extract_per_engine_config(
        _lookup("search.engine.local", {}) or {}
    )

    for collection, config in local_collections_data.items():
        if not isinstance(config, dict):
            # A flat key without a dot yields a bare value here; it cannot
            # describe a collection, so skip it instead of failing the load.
            logger.warning(
                f"Ignoring malformed local collection entry '{collection}'."
            )
            continue

        if not config.get("enabled", True):
            # Search engine is not enabled. Ignore.
            logger.info(f"Ignoring disabled local collection '{collection}'.")
            continue

        if "paths" in config and isinstance(config["paths"], str):
            # This will be saved as a json array.
            try:
                config["paths"] = json.loads(config["paths"])
            except json.decoder.JSONDecodeError:
                logger.exception(
                    f"Path for local collection '{collection}' is not a valid JSON array: "
                    f"{config['paths']}"
                )
                config["paths"] = []

        # Create a new dictionary with required search engine fields
        engine_config_prefix = f"search.engine.local.{collection}"
        engine_config = {
            "default_params": config,
            "requires_llm": True,
            "module_path": _lookup(
                f"{engine_config_prefix}.module_path",
                ".engines.search_engine_local",
            ),
            "class_name": _lookup(
                f"{engine_config_prefix}.class_name",
                "LocalSearchEngine",
            ),
        }

        # Copy these specific fields to the top level if they exist
        for field in ["strengths", "weaknesses", "reliability", "description"]:
            if field in config:
                engine_config[field] = config[field]

        search_engines[collection] = engine_config

    logger.info("Registered local document collections as search engines")

    # Register Library RAG and the individual document collections.
    library_enabled = _lookup("search.engine.library.enabled", True)

    if library_enabled:
        search_engines["library"] = {
            "module_path": ".engines.search_engine_library",
            "class_name": "LibraryRAGSearchEngine",
            "requires_llm": True,
            "display_name": "Search All Collections",
            "default_params": {},
            "description": "Search across all your document collections using semantic search",
            "strengths": [
                "Searches all your curated collections of research papers and documents",
                "Uses semantic search for better relevance",
                "Returns documents you've already saved and reviewed",
            ],
            "weaknesses": [
                "Limited to documents already in your collections",
                "Requires documents to be indexed first",
            ],
            "reliability": "High - searches all your collections",
        }
        logger.info("Registered Library RAG as search engine")

        # Register document collections as individual search engines.
        # Best effort: any failure is logged and the engines gathered so
        # far are still returned.
        try:
            from ..database.models.library import Collection
            from ..database.session_context import get_user_db_session

            # Prefer the username carried by the snapshot; fall back to the
            # explicit argument so it is not ignored when a snapshot without
            # `_username` is supplied.
            collection_username = (settings_snapshot or {}).get(
                "_username"
            ) or username

            if collection_username:
                with get_user_db_session(collection_username) as session:
                    collections = session.query(Collection).all()

                    # Rows are read while the session is still open.
                    for row in collections:
                        engine_id = f"collection_{row.id}"
                        # Add suffix to distinguish from the all-collections search
                        display_name = f"{row.name} (Collection)"
                        search_engines[engine_id] = {
                            "module_path": ".engines.search_engine_collection",
                            "class_name": "CollectionSearchEngine",
                            "requires_llm": True,
                            "is_local": True,
                            "display_name": display_name,
                            "default_params": {
                                "collection_id": row.id,
                                "collection_name": row.name,
                            },
                            "description": (
                                row.description
                                if row.description
                                else f"Search documents in {row.name} collection only"
                            ),
                            "strengths": [
                                f"Searches only documents in {row.name}",
                                "Focused semantic search within specific topic area",
                                "Returns documents from a curated collection",
                            ],
                            "weaknesses": [
                                "Limited to documents in this collection",
                                "Smaller result pool than full library search",
                            ],
                            "reliability": "High - searches a specific collection",
                        }

                    logger.info(
                        f"Registered {len(collections)} document collections as search engines"
                    )
            else:
                logger.debug(
                    "No username available for collection registration"
                )
        except Exception as e:
            logger.warning(f"Could not register document collections: {e}")

    return search_engines

316 

317 

def default_search_engine(
    username: Optional[str] = None,
    db_session: Optional[Session] = None,
    settings_snapshot: Optional[Dict[str, Any]] = None,
) -> str:
    """
    Look up the name of the default search engine.

    Reads the ``search.engine.DEFAULT_SEARCH_ENGINE`` setting and falls back
    to ``"wikipedia"`` when it cannot be retrieved.

    Args:
        username: Username for backward compatibility (deprecated)
        db_session: Database session for direct access (preferred for web routes)
        settings_snapshot: Settings snapshot for thread context (preferred for background threads)

    Returns:
        The configured default search engine.
    """
    # Bundle the settings-source arguments and forward them unchanged.
    settings_source = {
        "db_session": db_session,
        "settings_snapshot": settings_snapshot,
        "username": username,
    }
    engine_name = _get_setting(
        "search.engine.DEFAULT_SEARCH_ENGINE",
        "wikipedia",
        **settings_source,
    )
    return engine_name

341 

342 

def local_search_engines(
    username: Optional[str] = None,
    db_session: Optional[Session] = None,
    settings_snapshot: Optional[Dict[str, Any]] = None,
) -> List[str]:
    """
    Returns a list of the enabled local search engines.

    Collections are read from the ``search.engine.local`` setting. A
    collection counts as enabled unless its ``enabled`` field is falsy. The
    ``local_all`` collection is never included, and entries that are not
    dictionaries (malformed settings) are skipped rather than raising.

    Args:
        username: Username for backward compatibility (deprecated)
        db_session: Database session for direct access (preferred for web routes)
        settings_snapshot: Settings snapshot for thread context (preferred for background threads)

    Returns:
        A list of the enabled local search engines.
    """
    # `or {}` covers a stored empty/None value.
    raw_local_config = (
        _get_setting(
            "search.engine.local",
            {},
            db_session=db_session,
            settings_snapshot=settings_snapshot,
            username=username,
        )
        or {}
    )
    per_collection = _extract_per_engine_config(raw_local_config)

    enabled_collections = [
        name
        for name, config in per_collection.items()
        # Don't include the `local_all` collection.
        if name != "local_all"
        # A flat key without a dot yields a bare value, not a collection.
        and isinstance(config, dict)
        # Remove disabled collections.
        and config.get("enabled", True)
    ]
    logger.debug(f"Using local collections: {enabled_collections}")
    return enabled_collections