Coverage for src / local_deep_research / web_search_engines / search_engines_config.py: 79%

95 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

1""" 

2Configuration file for search engines. 

3Loads search engine definitions from the user's configuration. 

4""" 

5 

6import json 

7from typing import Any, Dict, List, Optional 

8from sqlalchemy.orm import Session 

9 

10from loguru import logger 

11 

12from ..config.thread_settings import get_setting_from_snapshot 

13from ..utilities.db_utils import get_settings_manager 

14 

15 

def _get_setting(
    key: str,
    default_value: Any = None,
    db_session: Optional[Session] = None,
    settings_snapshot: Optional[Dict[str, Any]] = None,
    username: Optional[str] = None,
) -> Any:
    """
    Look up one setting, preferring the settings snapshot over the database.

    The sources are consulted in order: the snapshot (when one is given),
    then the database session (when one is given). A source that raises is
    logged at debug level and skipped, so a failure never propagates to the
    caller.

    Args:
        key: The setting key
        default_value: Value handed to each source as its fallback, and
            returned as-is when no source could be consulted successfully
        db_session: Database session for direct access
        settings_snapshot: Settings snapshot for thread context
        username: Forwarded to ``get_settings_manager`` together with
            ``db_session``

    Returns:
        Whatever the first successful source returns, otherwise
        ``default_value``.
    """
    # Snapshot first: it is the source available in thread context.
    if settings_snapshot:
        try:
            snapshot_value = get_setting_from_snapshot(
                key, default_value, settings_snapshot=settings_snapshot
            )
        except Exception as exc:
            logger.debug(f"Could not get setting {key} from snapshot: {exc}")
        else:
            return snapshot_value

    # Fall back to the database when a session was supplied.
    if db_session:
        try:
            manager = get_settings_manager(db_session, username)
            db_value = manager.get_setting(key, default_value)
        except Exception as exc:
            logger.debug(f"Could not get setting {key} from db_session: {exc}")
        else:
            return db_value

    # Neither source was usable (or none was provided).
    logger.warning(
        f"Could not retrieve setting '{key}', returning default: {default_value}"
    )
    return default_value

58 

59 

60def _extract_per_engine_config( 

61 raw_config: Dict[str, Any], 

62) -> Dict[str, Dict[str, Any]]: 

63 """ 

64 Converts the "flat" configuration loaded from the settings database into 

65 individual settings dictionaries for each engine. 

66 

67 Args: 

68 raw_config: The raw "flat" configuration. 

69 

70 Returns: 

71 Configuration dictionaries indexed by engine name. 

72 

73 """ 

74 nested_config = {} 

75 for key, value in raw_config.items(): 

76 if "." in key: 

77 # This is a higher-level key. 

78 top_level_key = key.split(".")[0] 

79 lower_keys = ".".join(key.split(".")[1:]) 

80 nested_config.setdefault(top_level_key, {})[lower_keys] = value 

81 else: 

82 # This is a low-level key. 

83 nested_config[key] = value 

84 

85 # Expand all the lower-level keys. 

86 for key, value in nested_config.items(): 

87 if isinstance(value, dict): 

88 # Expand the child keys. 

89 nested_config[key] = _extract_per_engine_config(value) 

90 

91 return nested_config 

92 

93 

def search_config(
    username: Optional[str] = None,
    db_session: Optional[Session] = None,
    settings_snapshot: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """
    Returns the search engine configuration loaded from the database or settings snapshot.

    The result is assembled from several sources, in this order:

    1. Engines defined under ``search.engine.web``.
    2. The ``auto`` engine from ``search.engine.auto`` (also exposed as
       ``meta`` when no engine of that name exists).
    3. Every retriever currently listed by ``retriever_registry``.
    4. Enabled local collections from ``search.engine.local``.
    5. The all-collections ``library`` engine plus one ``collection_<id>``
       engine per stored collection, unless ``search.engine.library.enabled``
       is false.

    Later sources overwrite earlier entries that use the same name.

    Args:
        username: Username for backward compatibility (deprecated)
        db_session: Database session for direct access (preferred for web routes)
        settings_snapshot: Settings snapshot for thread context (preferred for background threads)

    Returns:
        The search engine configuration loaded from the database or snapshot.
    """
    # Every lookup below reads from the same settings sources.
    sources = {
        "db_session": db_session,
        "settings_snapshot": settings_snapshot,
        "username": username,
    }

    # Extract search engine definitions
    config_data = _get_setting("search.engine.web", {}, **sources)
    search_engines = _extract_per_engine_config(config_data)
    search_engines["auto"] = _get_setting("search.engine.auto", {}, **sources)

    # Add registered retrievers as available search engines
    from .retriever_registry import retriever_registry

    for name in retriever_registry.list_registered():
        search_engines[name] = {
            "module_path": ".engines.search_engine_retriever",
            "class_name": "RetrieverSearchEngine",
            "requires_api_key": False,
            "requires_llm": False,
            "description": f"LangChain retriever: {name}",
            "strengths": [
                "Domain-specific knowledge",
                "No rate limits",
                "Fast retrieval",
            ],
            "weaknesses": ["Limited to indexed content"],
            "supports_full_search": True,
            "is_retriever": True,  # Mark as retriever for identification
        }

    logger.info(
        f"Loaded {len(search_engines)} search engines from configuration file"
    )
    logger.info(f"\n {', '.join(sorted(search_engines.keys()))} \n")

    # Add alias for 'auto' if it exists
    if "auto" in search_engines and "meta" not in search_engines:
        search_engines["meta"] = search_engines["auto"]

    # Register local document collections
    local_collections_data = _extract_per_engine_config(
        _get_setting("search.engine.local", {}, **sources)
    )

    for collection, config in local_collections_data.items():
        if not isinstance(config, dict):
            # A bare value directly under `search.engine.local` carries no
            # collection settings; skip it instead of failing on `.get`.
            logger.warning(
                f"Ignoring malformed local collection entry '{collection}'."
            )
            continue

        if not config.get("enabled", True):
            # Search engine is not enabled. Ignore.
            logger.info(f"Ignoring disabled local collection '{collection}'.")
            continue

        if "paths" in config and isinstance(config["paths"], str):
            # This will be saved as a json array.
            try:
                config["paths"] = json.loads(config["paths"])
            except json.decoder.JSONDecodeError:
                logger.exception(
                    f"Path for local collection '{collection}' is not a valid JSON array: "
                    f"{config['paths']}"
                )
                config["paths"] = []

        # Create a new dictionary with required search engine fields
        engine_config = {
            "default_params": config,
            "requires_llm": True,
        }
        engine_config_prefix = f"search.engine.local.{collection}"
        engine_config["module_path"] = _get_setting(
            f"{engine_config_prefix}.module_path",
            "local_deep_research.web_search_engines.engines.search_engine_local",
            **sources,
        )
        engine_config["class_name"] = _get_setting(
            f"{engine_config_prefix}.class_name",
            "LocalSearchEngine",
            **sources,
        )

        # Copy these specific fields to the top level if they exist
        for field in ["strengths", "weaknesses", "reliability", "description"]:
            if field in config:
                engine_config[field] = config[field]

        search_engines[collection] = engine_config

    logger.info("Registered local document collections as search engines")

    library_enabled = _get_setting(
        "search.engine.library.enabled", True, **sources
    )
    if not library_enabled:
        return search_engines

    # Register Library RAG as a search engine
    search_engines["library"] = {
        "module_path": "local_deep_research.web_search_engines.engines.search_engine_library",
        "class_name": "LibraryRAGSearchEngine",
        "requires_llm": True,
        "display_name": "Search All Collections",
        "default_params": {},
        "description": "Search across all your document collections using semantic search",
        "strengths": [
            "Searches all your curated collections of research papers and documents",
            "Uses semantic search for better relevance",
            "Returns documents you've already saved and reviewed",
        ],
        "weaknesses": [
            "Limited to documents already in your collections",
            "Requires documents to be indexed first",
        ],
        "reliability": "High - searches all your collections",
    }
    logger.info("Registered Library RAG as search engine")

    # Register document collections as individual search engines. This is
    # best-effort: any failure is logged and the engines gathered so far are
    # still returned.
    try:
        from ..database.models.library import Collection
        from ..database.session_context import get_user_db_session

        # Prefer the username recorded in the snapshot, but fall back to the
        # explicit argument when the snapshot does not carry one.
        snapshot_username = (
            settings_snapshot.get("_username") if settings_snapshot else None
        )
        collection_username = snapshot_username or username

        if collection_username:
            with get_user_db_session(collection_username) as session:
                collections = session.query(Collection).all()

                for stored_collection in collections:
                    engine_id = f"collection_{stored_collection.id}"
                    # Add suffix to distinguish from the all-collections search
                    display_name = f"{stored_collection.name} (Collection)"
                    search_engines[engine_id] = {
                        "module_path": "local_deep_research.web_search_engines.engines.search_engine_collection",
                        "class_name": "CollectionSearchEngine",
                        "requires_llm": True,
                        "is_local": True,
                        "display_name": display_name,
                        "default_params": {
                            "collection_id": stored_collection.id,
                            "collection_name": stored_collection.name,
                        },
                        "description": (
                            stored_collection.description
                            if stored_collection.description
                            else f"Search documents in {stored_collection.name} collection only"
                        ),
                        "strengths": [
                            f"Searches only documents in {stored_collection.name}",
                            "Focused semantic search within specific topic area",
                            "Returns documents from a curated collection",
                        ],
                        "weaknesses": [
                            "Limited to documents in this collection",
                            "Smaller result pool than full library search",
                        ],
                        "reliability": "High - searches a specific collection",
                    }

                logger.info(
                    f"Registered {len(collections)} document collections as search engines"
                )
        else:
            logger.debug("No username available for collection registration")
    except Exception as e:
        logger.warning(f"Could not register document collections: {e}")

    return search_engines

303 

304 

def default_search_engine(
    username: Optional[str] = None,
    db_session: Optional[Session] = None,
    settings_snapshot: Optional[Dict[str, Any]] = None,
) -> str:
    """
    Look up which search engine is configured as the default.

    Reads ``search.engine.DEFAULT_SEARCH_ENGINE`` and falls back to
    ``"wikipedia"`` when the setting cannot be retrieved.

    Args:
        username: Username for backward compatibility (deprecated)
        db_session: Database session for direct access (preferred for web routes)
        settings_snapshot: Settings snapshot for thread context (preferred for background threads)

    Returns:
        The configured default search engine.
    """
    setting_key = "search.engine.DEFAULT_SEARCH_ENGINE"
    fallback_engine = "wikipedia"
    engine_name = _get_setting(
        setting_key,
        fallback_engine,
        db_session=db_session,
        settings_snapshot=settings_snapshot,
        username=username,
    )
    return engine_name

328 

329 

def local_search_engines(
    username: Optional[str] = None,
    db_session: Optional[Session] = None,
    settings_snapshot: Optional[Dict[str, Any]] = None,
) -> List[str]:
    """
    Returns a list of the enabled local search engines.

    Collections are read from ``search.engine.local``. The ``local_all``
    collection is always left out, as is any collection whose ``enabled``
    setting is false (a missing ``enabled`` counts as enabled). Entries that
    are not settings dictionaries are ignored.

    Args:
        username: Username for backward compatibility (deprecated)
        db_session: Database session for direct access (preferred for web routes)
        settings_snapshot: Settings snapshot for thread context (preferred for background threads)

    Returns:
        A list of the enabled local search engines.
    """
    local_collections_data = _extract_per_engine_config(
        _get_setting(
            "search.engine.local",
            {},
            db_session=db_session,
            settings_snapshot=settings_snapshot,
            username=username,
        )
    )

    enabled_collections = [
        name
        for name, config in local_collections_data.items()
        # Don't include the `local_all` collection.
        if name != "local_all"
        # A bare value has no settings to inspect; calling `.get` on it
        # would raise, so treat it as "not a collection".
        and isinstance(config, dict)
        # Remove disabled collections.
        and config.get("enabled", True)
    ]

    logger.debug(f"Using local collections: {enabled_collections}")
    return enabled_collections
366 return enabled_collections