Coverage for src / local_deep_research / security / module_whitelist.py: 100%

41 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-02-25 01:07 +0000

1""" 

2Module whitelist for safe dynamic imports. 

3 

4This module provides secure dynamic import functionality with a strict whitelist 

5of allowed modules and class names. This prevents arbitrary code execution through 

6user-controlled configuration values. 

7 

8Security Features: 

9- Validates module paths against a whitelist of trusted modules 

10- Validates class names against a whitelist of legitimate search engine classes 

11- Prevents loading of arbitrary code through malicious configuration 

12- Logs security-relevant events for auditing 

13""" 

14 

15import importlib 

16from typing import Optional, Type 

17 

18from loguru import logger 

19 

20 

# Module paths that may be loaded through dynamic import; anything not
# listed here is refused.
#
# SECURITY: every entry is a relative path (leading "."). Absolute module
# paths are deliberately excluded so that whitelisted names are resolved
# relative to local_deep_research.web_search_engines instead of naming
# arbitrary packages.
ALLOWED_MODULE_PATHS: frozenset[str] = frozenset(
    {
        # Engine implementations living under the ``engines`` sub-package
        ".engines.full_search",
        ".engines.full_serp_search_results_old",
        ".engines.meta_search_engine",
        ".engines.parallel_search_engine",
        ".engines.search_engine_arxiv",
        ".engines.search_engine_brave",
        ".engines.search_engine_collection",
        ".engines.search_engine_ddg",
        ".engines.search_engine_elasticsearch",
        ".engines.search_engine_github",
        ".engines.search_engine_google_pse",
        ".engines.search_engine_guardian",
        ".engines.search_engine_library",
        ".engines.search_engine_local",
        ".engines.search_engine_local_all",
        ".engines.search_engine_mojeek",
        ".engines.search_engine_nasa_ads",
        ".engines.search_engine_openalex",
        ".engines.search_engine_paperless",
        ".engines.search_engine_pubmed",
        ".engines.search_engine_retriever",
        ".engines.search_engine_scaleserp",
        ".engines.search_engine_searxng",
        ".engines.search_engine_semantic_scholar",
        ".engines.search_engine_serper",
        ".engines.search_engine_serpapi",
        ".engines.search_engine_tavily",
        ".engines.search_engine_wayback",
        ".engines.search_engine_wikipedia",
        ".engines.search_engine_wikinews",
        # Shared base module, one level above ``engines``
        ".search_engine_base",
    }
)

# Older name for the same whitelist object, kept so existing imports of
# ALLOWED_MODULES continue to work.
ALLOWED_MODULES = ALLOWED_MODULE_PATHS

66 

67 

# Class names that may be fetched from a dynamically imported module.
# A name that is missing from this set is refused, even when the module it
# lives in is itself whitelisted.
ALLOWED_CLASS_NAMES: frozenset[str] = frozenset(
    {
        # Search engine implementation classes (alphabetical)
        "ArXivSearchEngine",
        "BaseSearchEngine",
        "BraveSearchEngine",
        "CollectionSearchEngine",
        "DuckDuckGoSearchEngine",
        "ElasticsearchSearchEngine",
        "FullSearchResults",
        "GitHubSearchEngine",
        "GooglePSESearchEngine",
        "GuardianSearchEngine",
        "LibraryRAGSearchEngine",
        "LocalAllSearchEngine",
        "LocalSearchEngine",
        "MetaSearchEngine",
        "MojeekSearchEngine",
        "NasaAdsSearchEngine",
        "OpenAlexSearchEngine",
        "PaperlessSearchEngine",
        "ParallelSearchEngine",
        "PubMedSearchEngine",
        "RetrieverSearchEngine",
        "ScaleSerpSearchEngine",
        "SearXNGSearchEngine",
        "SemanticScholarSearchEngine",
        "SerpAPISearchEngine",
        "SerperSearchEngine",
        "TavilySearchEngine",
        "WaybackSearchEngine",
        "WikinewsSearchEngine",
        "WikipediaSearchEngine",
    }
)

105 

106 

class SecurityError(Exception):
    """Signal that a dynamic import was refused by a security validation."""


# Older name for the same exception class, kept so existing
# ``except ModuleNotAllowedError`` handlers continue to work.
ModuleNotAllowedError = SecurityError

115 

116 

def validate_module_import(module_path: str, class_name: str) -> bool:
    """
    Validate that both module_path and class_name are in their respective whitelists.

    This function provides a security check to ensure that dynamically loaded
    search engine modules are from trusted sources only. It validates:
    1. Both values are non-empty strings
    2. Module path starts with "." (relative import only)
    3. Module path is in the whitelist
    4. Class name is in the whitelist

    The function never raises for malformed input; every rejection is
    reported as ``False`` together with a warning in the log.

    Args:
        module_path: The Python module path (MUST be relative, starting with ".")
        class_name: The class name to import from the module

    Returns:
        True if all validations pass, False otherwise

    Example:
        >>> validate_module_import(".engines.search_engine_brave", "BraveSearchEngine")
        True
        >>> validate_module_import("os", "system")
        False
    """
    if not module_path or not class_name:
        logger.warning(
            "Module validation failed: empty module_path or class_name"
        )
        return False

    # Values read from configuration are not guaranteed to be strings.
    # Without this guard a truthy non-string (an int, a list, a dict, ...)
    # would raise AttributeError on ``startswith`` or TypeError on the
    # frozenset lookup instead of being rejected like any other bad input.
    if not isinstance(module_path, str) or not isinstance(class_name, str):
        logger.warning(
            "Module validation failed: module_path and class_name must be "
            f"strings, got {type(module_path).__name__} and "
            f"{type(class_name).__name__}"
        )
        return False

    # SECURITY: Only allow relative imports (starting with ".")
    # This ensures imports are relative to local_deep_research.web_search_engines
    # and prevents loading arbitrary modules like "os" or "subprocess"
    if not module_path.startswith("."):
        logger.warning(
            f"Security: Rejected non-relative module path: {module_path}. "
            f"Only relative imports (starting with '.') are allowed."
        )
        return False

    # Evaluate both membership tests up front (no short-circuit) so that each
    # failing value gets its own log line even when both are invalid.
    module_valid = module_path in ALLOWED_MODULE_PATHS
    class_valid = class_name in ALLOWED_CLASS_NAMES

    if not module_valid:
        logger.warning(f"Module path not in whitelist: {module_path}")

    if not class_valid:
        logger.warning(f"Class name not in whitelist: {class_name}")

    return module_valid and class_valid

166 

167 

def get_safe_module_class(
    module_path: str,
    class_name: str,
    package: Optional[str] = None,
) -> Type:
    """
    Import a module and return one of its attributes, gated by the whitelists.

    The (module_path, class_name) pair is first passed to
    ``validate_module_import``; the import is attempted only when that check
    succeeds. The module is then loaded with ``importlib.import_module`` and
    the attribute named ``class_name`` is read from it.

    Args:
        module_path: Relative module path to import from, e.g.
            ".engines.search_engine_brave". ``validate_module_import``
            rejects paths that do not start with ".".
        class_name: Name of the attribute to fetch from the imported module.
        package: Anchor package used to resolve the relative path. When left
            as None and module_path starts with ".", it defaults to
            "local_deep_research.web_search_engines".

    Returns:
        The object bound to ``class_name`` in the imported module.

    Raises:
        SecurityError: If validation of module_path / class_name fails.
        ModuleNotFoundError: If the module cannot be imported (logged, then
            re-raised).
        AttributeError: If the module has no attribute named class_name
            (logged, then re-raised).

    Example:
        >>> cls = get_safe_module_class(".engines.search_engine_brave", "BraveSearchEngine")
        >>> engine = cls(api_key="...")
    """
    # Gate: nothing is imported unless the pair passes whitelist validation.
    passed_validation = validate_module_import(module_path, class_name)
    if not passed_validation:
        logger.error(
            f"Security: Blocked attempt to import non-whitelisted module/class: "
            f"module_path={module_path!r}, class_name={class_name!r}"
        )
        raise SecurityError(
            f"Import blocked: module_path={module_path!r} or class_name={class_name!r} "
            f"is not in the security whitelist. Only trusted local_deep_research "
            f"modules and classes can be dynamically imported."
        )

    # A relative path needs an anchor package; fall back to the search-engine
    # package when the caller did not name one.
    # NOTE(review): a caller-supplied ``package`` is used as-is and is not
    # itself checked against any whitelist - confirm callers never pass
    # user-controlled values here.
    if module_path.startswith(".") and package is None:
        package = "local_deep_research.web_search_engines"

    try:
        # bearer:disable python_lang_code_injection
        loaded_module = importlib.import_module(module_path, package=package)
    except ModuleNotFoundError:
        # Record the traceback for diagnosis, then let the caller handle it.
        logger.exception(f"Failed to import whitelisted module {module_path}")
        raise

    try:
        resolved_class = getattr(loaded_module, class_name)
    except AttributeError:
        logger.exception(
            f"Class '{class_name}' not found in module '{module_path}'"
        )
        raise
    else:
        logger.debug(f"Successfully loaded {class_name} from {module_path}")
        return resolved_class