Coverage for src / local_deep_research / security / url_validator.py: 91%
120 statements
« prev ^ index » next coverage.py v7.12.0, created at 2026-01-11 00:51 +0000
1"""
2Centralized URL validation utilities for security.
4This module provides secure URL validation to prevent XSS attacks,
5data exfiltration, and other URL-based security vulnerabilities.
6"""
8import re
9from typing import Optional, List
10from urllib.parse import urlparse
11from loguru import logger
class URLValidationError(ValueError):
    """Error raised when a URL cannot be constructed or fails validation."""
class URLValidator:
    """Centralized URL validation for security.

    A stateless namespace of static helpers used to reject URLs that could
    enable XSS, open redirects, or data exfiltration. All checks fail
    closed: anything that cannot be positively validated is treated as
    unsafe.
    """

    # Unsafe URL schemes that could lead to XSS or data exfiltration
    UNSAFE_SCHEMES = (
        "javascript",
        "data",
        "vbscript",
        "about",
        "blob",
        "file",
    )

    # Safe schemes for external links
    SAFE_SCHEMES = ("http", "https", "ftp", "ftps")

    # Email scheme
    EMAIL_SCHEME = "mailto"

    # Common academic/research domains that should be allowed
    TRUSTED_ACADEMIC_DOMAINS = (
        "arxiv.org",
        "pubmed.ncbi.nlm.nih.gov",
        "ncbi.nlm.nih.gov",
        "biorxiv.org",
        "medrxiv.org",
        "doi.org",
        "nature.com",
        "science.org",
        "sciencedirect.com",
        "springer.com",
        "wiley.com",
        "plos.org",
        "pnas.org",
        "ieee.org",
        "acm.org",
    )

    @staticmethod
    def is_unsafe_scheme(url: str) -> bool:
        """
        Check if a URL uses an unsafe scheme.

        Args:
            url: The URL to check

        Returns:
            True if the URL uses an unsafe scheme, False otherwise
        """
        if not url:
            return False

        # Normalize the URL - trim whitespace and convert to lowercase
        normalized_url = url.strip().lower()

        # Fix: browsers remove ASCII tab/newline/CR during URL parsing
        # (WHATWG URL spec), so "java\tscript:alert(1)" is executable even
        # though it does not start with "javascript:".  Strip those
        # characters before matching so the check cannot be bypassed.
        normalized_url = re.sub(r"[\t\n\r]", "", normalized_url)

        # Check for unsafe schemes
        for scheme in URLValidator.UNSAFE_SCHEMES:
            if normalized_url.startswith(f"{scheme}:"):
                logger.warning(
                    f"Unsafe URL scheme detected: {scheme} in URL: {url[:100]}"
                )
                return True

        return False

    @staticmethod
    def is_safe_url(
        url: str,
        require_scheme: bool = True,
        allow_fragments: bool = True,
        allow_mailto: bool = False,
        trusted_domains: Optional[List[str]] = None,
    ) -> bool:
        """
        Validate if a URL is safe to use.

        Args:
            url: The URL to validate
            require_scheme: Whether to require an explicit scheme
            allow_fragments: Whether to allow fragment identifiers (#)
            allow_mailto: Whether to allow mailto: links
            trusted_domains: Optional list of trusted domains

        Returns:
            True if the URL is safe, False otherwise
        """
        if not url or not isinstance(url, str):
            return False

        # Check for unsafe schemes first
        if URLValidator.is_unsafe_scheme(url):
            return False

        # Handle fragment-only URLs
        if url.startswith("#"):
            return allow_fragments

        # Parse the URL
        try:
            parsed = urlparse(url)
        except Exception as e:
            logger.warning(f"Failed to parse URL '{url[:100]}': {e}")
            return False

        # Check scheme
        if not parsed.scheme:
            if require_scheme:
                return False
            # If no scheme is required, assume http/https for URL parsing
            parsed = urlparse(f"http://{url}")  # DevSkim: ignore DS137138

        scheme_lower = parsed.scheme.lower()

        # Check if it's a mailto link
        if scheme_lower == URLValidator.EMAIL_SCHEME:
            return allow_mailto

        # Check if it's a safe scheme
        if scheme_lower not in URLValidator.SAFE_SCHEMES:
            logger.warning(f"Unsafe URL scheme: {scheme_lower}")
            return False

        # Validate domain if trusted domains are specified.
        # Fix: fail closed — previously a URL with no parseable hostname
        # (e.g. "http:///path") skipped this check entirely and was
        # accepted even though it could not be confirmed as trusted.
        if trusted_domains:
            hostname_lower = (parsed.hostname or "").lower()
            if not hostname_lower or not any(
                hostname_lower == domain.lower()
                or hostname_lower.endswith(f".{domain.lower()}")
                for domain in trusted_domains
            ):
                logger.warning(
                    f"URL domain not in trusted list: {parsed.hostname}"
                )
                return False

        # Check for suspicious patterns in the URL
        if URLValidator._has_suspicious_patterns(url):
            return False

        return True

    @staticmethod
    def _has_suspicious_patterns(url: str) -> bool:
        """
        Check for suspicious patterns in URLs that might indicate attacks.

        Args:
            url: The URL to check

        Returns:
            True if suspicious patterns are found, False otherwise
        """
        suspicious_patterns = [
            # Double encoding
            r"%25[0-9a-fA-F]{2}",
            # Null bytes
            r"%00",
            # Unicode encoding bypass attempts
            r"\\u[0-9a-fA-F]{4}",
            # HTML entity encoding
            r"&(#x?[0-9a-fA-F]+|[a-zA-Z]+);",
        ]

        for pattern in suspicious_patterns:
            if re.search(pattern, url, re.IGNORECASE):
                logger.warning(f"Suspicious pattern found in URL: {pattern}")
                return True

        return False

    @staticmethod
    def sanitize_url(url: str, default_scheme: str = "https") -> Optional[str]:
        """
        Sanitize a URL by adding a scheme if missing and validating it.

        Args:
            url: The URL to sanitize
            default_scheme: The default scheme to add if missing

        Returns:
            Sanitized URL or None if the URL is unsafe
        """
        if not url:
            return None

        # Check for unsafe schemes
        if URLValidator.is_unsafe_scheme(url):
            return None

        # Strip whitespace
        url = url.strip()

        try:
            # Add scheme if missing (a second re-parse of the result was
            # dead code and has been removed — only the string is used)
            if not urlparse(url).scheme:
                url = f"{default_scheme}://{url}"

            # Validate the final URL
            if URLValidator.is_safe_url(url, require_scheme=True):
                return url
        except Exception as e:
            logger.warning(f"Failed to sanitize URL '{url[:100]}': {e}")

        return None

    @staticmethod
    def is_academic_url(url: str) -> bool:
        """
        Check if a URL is from a known academic/research domain.

        Args:
            url: The URL to check

        Returns:
            True if the URL is from an academic domain, False otherwise
        """
        try:
            parsed = urlparse(url)
            if parsed.hostname:
                hostname_lower = parsed.hostname.lower()
                # Exact match or subdomain of a trusted academic domain
                return any(
                    hostname_lower == domain
                    or hostname_lower.endswith(f".{domain}")
                    for domain in URLValidator.TRUSTED_ACADEMIC_DOMAINS
                )
        except Exception:
            # Best-effort classifier: unparseable input is simply
            # "not academic", never an error for the caller.
            pass

        return False

    @staticmethod
    def extract_doi(url: str) -> Optional[str]:
        """
        Extract DOI from a URL if present.

        Args:
            url: The URL to extract DOI from

        Returns:
            The DOI if found, None otherwise
        """
        # Common DOI patterns with explicit pattern identification
        doi_patterns = [
            (
                r"10\.\d{4,}(?:\.\d+)*\/[-._;()\/:a-zA-Z0-9]+",
                0,
            ),  # Direct DOI, group 0
            (r"doi\.org\/(10\.\d{4,}[^\s]*)", 1),  # doi.org URL, group 1
        ]

        for pattern, group_index in doi_patterns:
            match = re.search(pattern, url, re.IGNORECASE)
            if match:
                return match.group(group_index)

        return None

    @staticmethod
    def validate_http_url(url: str) -> bool:
        """
        Validate that a callback URL is well-formed and safe for HTTP/HTTPS use.

        This is stricter than is_safe_url() and specifically validates HTTP/HTTPS
        URLs for use as application callbacks (e.g., in notifications, redirects).
        It does NOT validate Apprise service URLs which use other protocols.

        Args:
            url: HTTP/HTTPS callback URL to validate

        Returns:
            True if valid

        Raises:
            URLValidationError: If URL is invalid
        """
        if not url or not isinstance(url, str):
            raise URLValidationError("URL must be a non-empty string")

        # Fix: keep the try body minimal.  Previously the whole method was
        # wrapped in one broad try/except that re-dispatched its own
        # URLValidationErrors via isinstance() and dropped the exception
        # chain.  Only urlparse() can raise unexpectedly here.
        try:
            parsed = urlparse(url)
        except Exception as e:
            raise URLValidationError(f"Failed to validate URL: {e}") from e

        # Must have a scheme
        if not parsed.scheme:
            raise URLValidationError("URL must have a scheme (http or https)")

        # Must be http or https (callback URLs only)
        if parsed.scheme not in ("http", "https"):
            raise URLValidationError(
                f"URL scheme must be http or https, got: {parsed.scheme}"
            )

        # Use the general security validator for additional safety
        if not URLValidator.is_safe_url(url, require_scheme=True):
            raise URLValidationError(f"URL failed security validation: {url}")

        # Must have a netloc (hostname)
        if not parsed.netloc:
            raise URLValidationError("URL must have a hostname")

        # Check for obvious hostname issues
        if parsed.netloc.startswith(".") or parsed.netloc.endswith("."):
            raise URLValidationError(f"Invalid hostname: {parsed.netloc}")

        # Path should be valid if present
        if parsed.path and not parsed.path.startswith("/"):
            raise URLValidationError(
                f"URL path must start with /: {parsed.path}"
            )

        return True
def get_javascript_url_validator() -> str:
    """
    Get JavaScript code for URL validation that matches the Python implementation.

    Returns:
        JavaScript code as a string that can be embedded in web pages

    Fixes relative to the previous version of the embedded JS:
    - `requireScheme` was destructured but never used, so scheme-less URLs
      were resolved against the page's base URL instead of being rejected
      as the Python implementation does by default; it is now enforced.
    - The suspicious-pattern check mirroring
      URLValidator._has_suspicious_patterns() was missing and is now applied.
    """
    return r"""
    // URL validation utilities matching Python URLValidator
    const URLValidator = {
        UNSAFE_SCHEMES: ['javascript', 'data', 'vbscript', 'about', 'blob', 'file'],
        SAFE_SCHEMES: ['http', 'https', 'ftp', 'ftps'],
        EMAIL_SCHEME: 'mailto',
        // Mirrors URLValidator._has_suspicious_patterns in Python
        SUSPICIOUS_PATTERNS: [
            /%25[0-9a-fA-F]{2}/i,              // double encoding
            /%00/i,                            // null bytes
            /\\u[0-9a-fA-F]{4}/i,              // unicode escape bypass attempts
            /&(#x?[0-9a-fA-F]+|[a-zA-Z]+);/i   // HTML entity encoding
        ],

        isUnsafeScheme: function(url) {
            if (!url) return false;

            // Strip tab/newline/CR like browsers do before URL parsing
            const normalizedUrl = url.trim().toLowerCase().replace(/[\t\n\r]/g, '');

            for (const scheme of this.UNSAFE_SCHEMES) {
                if (normalizedUrl.startsWith(scheme + ':')) {
                    console.warn(`Unsafe URL scheme detected: ${scheme}`);
                    return true;
                }
            }

            return false;
        },

        hasSuspiciousPatterns: function(url) {
            return this.SUSPICIOUS_PATTERNS.some(pattern => pattern.test(url));
        },

        isSafeUrl: function(url, options = {}) {
            const {
                requireScheme = true,
                allowFragments = true,
                allowMailto = false,
                trustedDomains = []
            } = options;

            if (!url || typeof url !== 'string') {
                return false;
            }

            // Check for unsafe schemes first
            if (this.isUnsafeScheme(url)) {
                return false;
            }

            // Handle fragment-only URLs
            if (url.startsWith('#')) {
                return allowFragments;
            }

            // Enforce requireScheme like the Python implementation:
            // scheme-less URLs are rejected by default, otherwise assumed http
            const hasScheme = /^[a-zA-Z][a-zA-Z\d+\-.]*:/.test(url);
            if (!hasScheme && requireScheme) {
                return false;
            }

            // Parse the URL
            try {
                const parsed = hasScheme ? new URL(url) : new URL('http://' + url);
                const scheme = parsed.protocol.slice(0, -1).toLowerCase(); // Remove trailing ':'

                // Check if it's a mailto link
                if (scheme === this.EMAIL_SCHEME) {
                    return allowMailto;
                }

                // Check if it's a safe scheme
                if (!this.SAFE_SCHEMES.includes(scheme)) {
                    console.warn(`Unsafe URL scheme: ${scheme}`);
                    return false;
                }

                // Validate domain if trusted domains are specified (fail
                // closed when no hostname could be parsed)
                if (trustedDomains.length > 0) {
                    const hostname = (parsed.hostname || '').toLowerCase();
                    const isTrusted = hostname && trustedDomains.some(domain =>
                        hostname === domain.toLowerCase() ||
                        hostname.endsWith('.' + domain.toLowerCase())
                    );

                    if (!isTrusted) {
                        console.warn(`URL domain not in trusted list: ${parsed.hostname}`);
                        return false;
                    }
                }

                // Reject encoding-bypass patterns, matching Python
                if (this.hasSuspiciousPatterns(url)) {
                    console.warn('Suspicious pattern found in URL');
                    return false;
                }

                return true;
            } catch (e) {
                console.warn(`Failed to parse URL: ${e.message}`);
                return false;
            }
        },

        sanitizeUrl: function(url, defaultScheme = 'https') {
            if (!url) return null;

            // Check for unsafe schemes
            if (this.isUnsafeScheme(url)) {
                return null;
            }

            // Strip whitespace
            url = url.trim();

            // Add scheme if missing
            if (!url.match(/^[a-zA-Z][a-zA-Z\d+\-.]*:/)) {
                url = `${defaultScheme}://${url}`;
            }

            // Validate the final URL
            if (this.isSafeUrl(url, { requireScheme: true })) {
                return url;
            }

            return null;
        }
    };
    """