Coverage for src / local_deep_research / security / module_whitelist.py: 100%

41 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1""" 

2Module whitelist for safe dynamic imports. 

3 

4This module provides secure dynamic import functionality with a strict whitelist 

5of allowed modules and class names. This prevents arbitrary code execution through 

6user-controlled configuration values. 

7 

8Security Features: 

9- Validates module paths against a whitelist of trusted modules 

10- Validates class names against a whitelist of legitimate search engine classes 

11- Prevents loading of arbitrary code through malicious configuration 

12- Logs security-relevant events for auditing 

13""" 

14 

15import importlib 

16from typing import Optional, Type 

17 

18from loguru import logger 

19 

20 

# Module paths that may be imported dynamically; anything absent is rejected.
# SECURITY: every entry is a relative path (leading "."), resolved against
# local_deep_research.web_search_engines, so modules from other packages can
# never be reached through this whitelist. Absolute paths must not be added.
ALLOWED_MODULE_PATHS: frozenset[str] = frozenset(
    {
        # Modules under the ``engines`` subpackage
        ".engines.full_search",
        ".engines.meta_search_engine",
        ".engines.parallel_search_engine",
        ".engines.search_engine_arxiv",
        ".engines.search_engine_brave",
        ".engines.search_engine_collection",
        ".engines.search_engine_ddg",
        ".engines.search_engine_elasticsearch",
        ".engines.search_engine_exa",
        ".engines.search_engine_github",
        ".engines.search_engine_google_pse",
        ".engines.search_engine_guardian",
        ".engines.search_engine_library",
        ".engines.local_embedding_manager",
        ".engines.search_engine_mojeek",
        ".engines.search_engine_nasa_ads",
        ".engines.search_engine_openalex",
        ".engines.search_engine_paperless",
        ".engines.search_engine_pubmed",
        ".engines.search_engine_retriever",
        ".engines.search_engine_scaleserp",
        ".engines.search_engine_searxng",
        ".engines.search_engine_semantic_scholar",
        ".engines.search_engine_serper",
        ".engines.search_engine_serpapi",
        ".engines.search_engine_tavily",
        ".engines.search_engine_gutenberg",
        ".engines.search_engine_openlibrary",
        ".engines.search_engine_pubchem",
        ".engines.search_engine_stackexchange",
        ".engines.search_engine_wayback",
        ".engines.search_engine_wikipedia",
        ".engines.search_engine_wikinews",
        ".engines.search_engine_zenodo",
        # Module directly under the web_search_engines package
        ".search_engine_base",
    }
)

# Older name retained so existing imports keep working.
ALLOWED_MODULES = ALLOWED_MODULE_PATHS

70 

71 

# Class names that may be fetched from a dynamically imported module.
# A name that is not listed here is rejected by the import validation.
ALLOWED_CLASS_NAMES: frozenset[str] = frozenset(
    {
        "ArXivSearchEngine",
        "BaseSearchEngine",
        "BraveSearchEngine",
        "CollectionSearchEngine",
        "DuckDuckGoSearchEngine",
        "ElasticsearchSearchEngine",
        "GutenbergSearchEngine",
        "ExaSearchEngine",
        "FullSearchResults",
        "GitHubSearchEngine",
        "GooglePSESearchEngine",
        "GuardianSearchEngine",
        "LibraryRAGSearchEngine",
        "MetaSearchEngine",
        "MojeekSearchEngine",
        "NasaAdsSearchEngine",
        "OpenAlexSearchEngine",
        "OpenLibrarySearchEngine",
        "PaperlessSearchEngine",
        "ParallelSearchEngine",
        "PubChemSearchEngine",
        "PubMedSearchEngine",
        "RetrieverSearchEngine",
        "ScaleSerpSearchEngine",
        "SearXNGSearchEngine",
        "SemanticScholarSearchEngine",
        "SerpAPISearchEngine",
        "SerperSearchEngine",
        "StackExchangeSearchEngine",
        "TavilySearchEngine",
        "WaybackSearchEngine",
        "WikinewsSearchEngine",
        "WikipediaSearchEngine",
        "ZenodoSearchEngine",
    }
)

113 

114 

class SecurityError(Exception):
    """Signal that a dynamic import was refused by a security validation."""


# Older name retained so existing imports keep working.
ModuleNotAllowedError = SecurityError

123 

124 

def validate_module_import(module_path: str, class_name: str) -> bool:
    """
    Validate that both module_path and class_name are in their respective whitelists.

    This function provides a security check to ensure that dynamically loaded
    search engine modules are from trusted sources only. It validates:
    1. Both values are non-empty strings
    2. Module path starts with "." (relative import only)
    3. Module path is in the whitelist
    4. Class name is in the whitelist

    The function never raises for malformed input; any value that is not an
    acceptable string simply fails validation.

    Args:
        module_path: The Python module path (MUST be relative, starting with ".")
        class_name: The class name to import from the module

    Returns:
        True if all validations pass, False otherwise

    Example:
        >>> validate_module_import(".engines.search_engine_brave", "BraveSearchEngine")
        True
        >>> validate_module_import("os", "system")
        False
    """
    if not module_path or not class_name:
        logger.warning(
            "Module validation failed: empty module_path or class_name"
        )
        return False

    # Inputs originate from configuration and may not be strings at all.
    # Without this guard a truthy non-string (e.g. an int or a list) would
    # raise AttributeError/TypeError below instead of being rejected.
    if not isinstance(module_path, str) or not isinstance(class_name, str):
        logger.warning(
            "Module validation failed: module_path and class_name must be strings"
        )
        return False

    # SECURITY: Only allow relative imports (starting with ".")
    # This ensures imports are relative to local_deep_research.web_search_engines
    # and prevents loading arbitrary modules like "os" or "subprocess"
    if not module_path.startswith("."):
        logger.warning(
            f"Security: Rejected non-relative module path: {module_path}. "
            f"Only relative imports (starting with '.') are allowed."
        )
        return False

    module_valid = module_path in ALLOWED_MODULE_PATHS
    class_valid = class_name in ALLOWED_CLASS_NAMES

    # Both checks are evaluated and logged independently so the audit log
    # shows every reason a request was rejected, not just the first one.
    if not module_valid:
        logger.warning(f"Module path not in whitelist: {module_path}")

    if not class_valid:
        logger.warning(f"Class name not in whitelist: {class_name}")

    return module_valid and class_valid

174 

175 

def get_safe_module_class(
    module_path: str,
    class_name: str,
    package: Optional[str] = None,
) -> Type:
    """
    Safely import a class from a module, validating against both whitelists.

    This function provides secure dynamic import functionality by:
    1. Validating the module path against a strict whitelist
    2. Validating the class name against a strict whitelist
    3. Only allowing imports from trusted local_deep_research modules
    4. Preventing arbitrary code execution through configuration

    Args:
        module_path: The module path to import from
            (e.g., ".engines.search_engine_brave" for relative imports)
        class_name: The class name to retrieve from the module
        package: Optional anchor package for the relative import. If not
            provided, defaults to "local_deep_research.web_search_engines".
            The whitelist only constrains the relative path, so this value
            must come from trusted code, never from user configuration.

    Returns:
        The requested class from the module

    Raises:
        SecurityError: If the module path or class name is not in the whitelist
        ModuleNotFoundError: If the module cannot be found
        ImportError: If the module is found but fails to import
        AttributeError: If the class does not exist in the module

    Example:
        >>> cls = get_safe_module_class(".engines.search_engine_brave", "BraveSearchEngine")
        >>> engine = cls(api_key="...")
    """
    # Validate both module path and class name against whitelists
    if not validate_module_import(module_path, class_name):
        logger.error(
            f"Security: Blocked attempt to import non-whitelisted module/class: "
            f"module_path={module_path!r}, class_name={class_name!r}"
        )
        raise SecurityError(
            f"Import blocked: module_path={module_path!r} or class_name={class_name!r} "
            f"is not in the security whitelist. Only trusted local_deep_research "
            f"modules and classes can be dynamically imported."
        )

    # Validation guarantees module_path is relative (starts with "."), so an
    # anchor package is always needed; fall back to the search-engine package.
    # NOTE(review): a caller-supplied package re-anchors the whitelisted path
    # to a different package tree - confirm no caller derives it from config.
    if package is None:
        package = "local_deep_research.web_search_engines"

    # Import the module
    try:
        # bearer:disable python_lang_code_injection
        module = importlib.import_module(module_path, package=package)
    except ImportError:
        # ImportError also covers ModuleNotFoundError (its subclass), so a
        # whitelisted module that exists but fails to import (e.g. a missing
        # optional dependency) is logged too. The exception is re-raised as-is.
        logger.exception(f"Failed to import whitelisted module {module_path}")
        raise

    # Get the class from the module
    try:
        engine_class = getattr(module, class_name)
    except AttributeError:
        logger.exception(
            f"Class '{class_name}' not found in module '{module_path}'"
        )
        raise

    logger.debug(f"Successfully loaded {class_name} from {module_path}")
    return engine_class
244 return engine_class