Coverage for src / local_deep_research / security / module_whitelist.py: 100%
41 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
1"""
2Module whitelist for safe dynamic imports.
4This module provides secure dynamic import functionality with a strict whitelist
5of allowed modules and class names. This prevents arbitrary code execution through
6user-controlled configuration values.
8Security Features:
9- Validates module paths against a whitelist of trusted modules
10- Validates class names against a whitelist of legitimate search engine classes
11- Prevents loading of arbitrary code through malicious configuration
12- Logs security-relevant events for auditing
13"""
15import importlib
16from typing import Optional, Type
18from loguru import logger
# Module paths that may be imported dynamically. Anything absent from this
# set is refused.
# SECURITY: every entry is a relative path (leading "."), resolved against
# local_deep_research.web_search_engines. Absolute paths are deliberately
# excluded so that modules from other packages can never be requested.
# Entries are kept in alphabetical order to make audits and diffs easier.
ALLOWED_MODULE_PATHS: frozenset[str] = frozenset(
    {
        ".engines.full_search",
        ".engines.local_embedding_manager",
        ".engines.meta_search_engine",
        ".engines.parallel_search_engine",
        ".engines.search_engine_arxiv",
        ".engines.search_engine_brave",
        ".engines.search_engine_collection",
        ".engines.search_engine_ddg",
        ".engines.search_engine_elasticsearch",
        ".engines.search_engine_exa",
        ".engines.search_engine_github",
        ".engines.search_engine_google_pse",
        ".engines.search_engine_guardian",
        ".engines.search_engine_gutenberg",
        ".engines.search_engine_library",
        ".engines.search_engine_mojeek",
        ".engines.search_engine_nasa_ads",
        ".engines.search_engine_openalex",
        ".engines.search_engine_openlibrary",
        ".engines.search_engine_paperless",
        ".engines.search_engine_pubchem",
        ".engines.search_engine_pubmed",
        ".engines.search_engine_retriever",
        ".engines.search_engine_scaleserp",
        ".engines.search_engine_searxng",
        ".engines.search_engine_semantic_scholar",
        ".engines.search_engine_serpapi",
        ".engines.search_engine_serper",
        ".engines.search_engine_stackexchange",
        ".engines.search_engine_tavily",
        ".engines.search_engine_wayback",
        ".engines.search_engine_wikinews",
        ".engines.search_engine_wikipedia",
        ".engines.search_engine_zenodo",
        ".search_engine_base",
    }
)

# Older name for the same whitelist, kept so existing imports keep working.
ALLOWED_MODULES = ALLOWED_MODULE_PATHS
# Class names that may be fetched from a dynamically imported module.
# A name that is not listed here is refused, whichever module it is
# requested from. Entries are kept in alphabetical order.
ALLOWED_CLASS_NAMES: frozenset[str] = frozenset(
    {
        "ArXivSearchEngine",
        "BaseSearchEngine",
        "BraveSearchEngine",
        "CollectionSearchEngine",
        "DuckDuckGoSearchEngine",
        "ElasticsearchSearchEngine",
        "ExaSearchEngine",
        "FullSearchResults",
        "GitHubSearchEngine",
        "GooglePSESearchEngine",
        "GuardianSearchEngine",
        "GutenbergSearchEngine",
        "LibraryRAGSearchEngine",
        "MetaSearchEngine",
        "MojeekSearchEngine",
        "NasaAdsSearchEngine",
        "OpenAlexSearchEngine",
        "OpenLibrarySearchEngine",
        "PaperlessSearchEngine",
        "ParallelSearchEngine",
        "PubChemSearchEngine",
        "PubMedSearchEngine",
        "RetrieverSearchEngine",
        "ScaleSerpSearchEngine",
        "SearXNGSearchEngine",
        "SemanticScholarSearchEngine",
        "SerpAPISearchEngine",
        "SerperSearchEngine",
        "StackExchangeSearchEngine",
        "TavilySearchEngine",
        "WaybackSearchEngine",
        "WikinewsSearchEngine",
        "WikipediaSearchEngine",
        "ZenodoSearchEngine",
    }
)
class SecurityError(Exception):
    """Signal that a dynamic import was refused by a security validation."""


# Older name for the same exception, kept so existing ``except`` clauses
# and imports keep working.
ModuleNotAllowedError = SecurityError
def validate_module_import(module_path: str, class_name: str) -> bool:
    """
    Validate that both module_path and class_name are in their respective whitelists.

    This function provides a security check to ensure that dynamically loaded
    search engine modules are from trusted sources only. It validates:
    1. Neither argument is empty
    2. Both arguments are strings
    3. Module path starts with "." (relative import only)
    4. Module path is in the whitelist
    5. Class name is in the whitelist

    Every rejection is logged as a warning. Malformed input (wrong type,
    empty value) yields False rather than an exception.

    Args:
        module_path: The Python module path (MUST be relative, starting with ".")
        class_name: The class name to import from the module

    Returns:
        True if all validations pass, False otherwise

    Example:
        >>> validate_module_import(".engines.search_engine_brave", "BraveSearchEngine")
        True
        >>> validate_module_import("os", "system")
        False
    """
    if not module_path or not class_name:
        logger.warning(
            "Module validation failed: empty module_path or class_name"
        )
        return False

    # The values originate from configuration, so they are not guaranteed to
    # be strings. Without this guard a truthy non-string would raise
    # AttributeError on .startswith() (or TypeError on the set lookup for
    # unhashable values) instead of being rejected.
    if not isinstance(module_path, str) or not isinstance(class_name, str):
        logger.warning(
            f"Module validation failed: module_path and class_name must be "
            f"strings, got {type(module_path).__name__} and "
            f"{type(class_name).__name__}"
        )
        return False

    # SECURITY: Only allow relative imports (starting with ".")
    # This ensures imports are relative to local_deep_research.web_search_engines
    # and prevents loading arbitrary modules like "os" or "subprocess"
    if not module_path.startswith("."):
        # !r escapes newlines/control characters so a crafted value cannot
        # forge additional log lines.
        logger.warning(
            f"Security: Rejected non-relative module path: {module_path!r}. "
            "Only relative imports (starting with '.') are allowed."
        )
        return False

    module_valid = module_path in ALLOWED_MODULE_PATHS
    class_valid = class_name in ALLOWED_CLASS_NAMES

    # Report each failing check separately so the audit log shows exactly
    # which value was rejected.
    if not module_valid:
        logger.warning(f"Module path not in whitelist: {module_path!r}")

    if not class_valid:
        logger.warning(f"Class name not in whitelist: {class_name!r}")

    return module_valid and class_valid
def get_safe_module_class(
    module_path: str,
    class_name: str,
    package: Optional[str] = None,
) -> Type:
    """
    Import a whitelisted module and return a whitelisted attribute from it.

    The request is first checked with ``validate_module_import``; only when
    that check passes is the module imported and the named attribute looked
    up on it. Refusals and failures are logged before the exception is
    raised or re-raised.

    Args:
        module_path: Relative module path to import
            (e.g., ".engines.search_engine_brave")
        class_name: Name of the class to fetch from the imported module
        package: Anchor package used to resolve the relative path. When
            omitted (None) and module_path starts with ".", it defaults to
            "local_deep_research.web_search_engines"

    Returns:
        The attribute named ``class_name`` on the imported module

    Raises:
        SecurityError: If validation of module_path / class_name fails
        ModuleNotFoundError: If the module cannot be imported
        AttributeError: If the module has no attribute named ``class_name``

    Example:
        >>> cls = get_safe_module_class(".engines.search_engine_brave", "BraveSearchEngine")
        >>> engine = cls(api_key="...")
    """
    # Guard clause: refuse anything the whitelists do not approve
    is_allowed = validate_module_import(module_path, class_name)
    if not is_allowed:
        logger.error(
            f"Security: Blocked attempt to import non-whitelisted module/class: "
            f"module_path={module_path!r}, class_name={class_name!r}"
        )
        raise SecurityError(
            f"Import blocked: module_path={module_path!r} or class_name={class_name!r} "
            f"is not in the security whitelist. Only trusted local_deep_research "
            f"modules and classes can be dynamically imported."
        )

    # Relative paths need an anchor package; fall back to the search-engine
    # package when the caller did not supply one.
    # NOTE(review): a caller-supplied ``package`` changes what the relative
    # path resolves against - confirm callers never pass untrusted values.
    anchor_package = package
    if module_path.startswith(".") and anchor_package is None:
        anchor_package = "local_deep_research.web_search_engines"

    # Step 1: import the module, logging the traceback if it is missing
    try:
        # bearer:disable python_lang_code_injection
        imported = importlib.import_module(module_path, package=anchor_package)
    except ModuleNotFoundError:
        logger.exception(f"Failed to import whitelisted module {module_path}")
        raise

    # Step 2: look the class up on the imported module
    try:
        resolved = getattr(imported, class_name)
    except AttributeError:
        logger.exception(
            f"Class '{class_name}' not found in module '{module_path}'"
        )
        raise

    logger.debug(f"Successfully loaded {class_name} from {module_path}")
    return resolved