Coverage for src / local_deep_research / security / url_validator.py: 89%
146 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-25 01:07 +0000
1"""
2Centralized URL validation utilities for security.
4This module provides secure URL validation to prevent XSS attacks,
5data exfiltration, and other URL-based security vulnerabilities.
6"""
8import re
9from typing import Optional, List
10from urllib.parse import urlparse, urljoin
11from loguru import logger
class URLValidationError(ValueError):
    """Signals that a URL failed construction or security validation."""
class URLValidator:
    """Centralized URL validation for security.

    All methods are static; the class acts as a namespace bundling the
    scheme/domain policy constants with the validation helpers that use them.
    """

    # Unsafe URL schemes that could lead to XSS or data exfiltration
    UNSAFE_SCHEMES = (
        "javascript",
        "data",
        "vbscript",
        "about",
        "blob",
        "file",
    )

    # Safe schemes for external links
    SAFE_SCHEMES = ("http", "https", "ftp", "ftps")

    # Email scheme
    EMAIL_SCHEME = "mailto"

    # Common academic/research domains that should be allowed
    TRUSTED_ACADEMIC_DOMAINS = (
        "arxiv.org",
        "pubmed.ncbi.nlm.nih.gov",
        "ncbi.nlm.nih.gov",
        "biorxiv.org",
        "medrxiv.org",
        "doi.org",
        "nature.com",
        "science.org",
        "sciencedirect.com",
        "springer.com",
        "wiley.com",
        "plos.org",
        "pnas.org",
        "ieee.org",
        "acm.org",
    )

    @staticmethod
    def _hostname_matches(hostname: str, domains) -> bool:
        """
        Check whether a hostname equals, or is a subdomain of, any listed domain.

        Shared by is_safe_url() and is_academic_url() so both apply identical
        matching semantics (exact match or dot-separated subdomain suffix).

        Args:
            hostname: The hostname to test (any case)
            domains: Iterable of domain strings to match against

        Returns:
            True if the hostname matches one of the domains, False otherwise
        """
        hostname_lower = hostname.lower()
        return any(
            hostname_lower == domain.lower()
            or hostname_lower.endswith(f".{domain.lower()}")
            for domain in domains
        )

    @staticmethod
    def is_unsafe_scheme(url: str) -> bool:
        """
        Check if a URL uses an unsafe scheme.

        Args:
            url: The URL to check

        Returns:
            True if the URL uses an unsafe scheme, False otherwise
        """
        if not url:
            return False

        # Normalize the URL - trim whitespace and convert to lowercase
        normalized_url = url.strip().lower()

        # Check for unsafe schemes
        for scheme in URLValidator.UNSAFE_SCHEMES:
            if normalized_url.startswith(f"{scheme}:"):
                logger.warning(
                    f"Unsafe URL scheme detected: {scheme} in URL: {url[:100]}"
                )
                return True

        return False

    @staticmethod
    def is_safe_url(
        url: str,
        require_scheme: bool = True,
        allow_fragments: bool = True,
        allow_mailto: bool = False,
        trusted_domains: Optional[List[str]] = None,
    ) -> bool:
        """
        Validate if a URL is safe to use.

        Args:
            url: The URL to validate
            require_scheme: Whether to require an explicit scheme
            allow_fragments: Whether to allow fragment identifiers (#)
            allow_mailto: Whether to allow mailto: links
            trusted_domains: Optional list of trusted domains

        Returns:
            True if the URL is safe, False otherwise
        """
        if not url or not isinstance(url, str):
            return False

        # Check for unsafe schemes first
        if URLValidator.is_unsafe_scheme(url):
            return False

        # Handle fragment-only URLs
        if url.startswith("#"):
            return allow_fragments

        # Parse the URL
        try:
            parsed = urlparse(url)
        except Exception as e:
            logger.warning(f"Failed to parse URL '{url[:100]}': {e}")
            return False

        # Check scheme
        if not parsed.scheme:
            if require_scheme:
                return False
            # If no scheme is required, assume http/https for URL parsing
            parsed = urlparse(f"http://{url}")  # DevSkim: ignore DS137138

        scheme_lower = parsed.scheme.lower()

        # Check if it's a mailto link
        if scheme_lower == URLValidator.EMAIL_SCHEME:
            return allow_mailto

        # Check if it's a safe scheme
        if scheme_lower not in URLValidator.SAFE_SCHEMES:
            logger.warning(f"Unsafe URL scheme: {scheme_lower}")
            return False

        # Validate domain if trusted domains are specified
        if trusted_domains and parsed.hostname:
            if not URLValidator._hostname_matches(
                parsed.hostname, trusted_domains
            ):
                logger.warning(
                    f"URL domain not in trusted list: {parsed.hostname}"
                )
                return False

        # Check for suspicious patterns in the URL
        if URLValidator._has_suspicious_patterns(url):
            return False

        return True

    @staticmethod
    def _has_suspicious_patterns(url: str) -> bool:
        """
        Check for suspicious patterns in URLs that might indicate attacks.

        Args:
            url: The URL to check

        Returns:
            True if suspicious patterns are found, False otherwise
        """
        suspicious_patterns = [
            # Double encoding
            r"%25[0-9a-fA-F]{2}",
            # Null bytes
            r"%00",
            # Unicode encoding bypass attempts
            r"\\u[0-9a-fA-F]{4}",
            # HTML entity encoding
            r"&(#x?[0-9a-fA-F]+|[a-zA-Z]+);",
        ]

        for pattern in suspicious_patterns:
            if re.search(pattern, url, re.IGNORECASE):
                logger.warning(f"Suspicious pattern found in URL: {pattern}")
                return True

        return False

    @staticmethod
    def sanitize_url(url: str, default_scheme: str = "https") -> Optional[str]:
        """
        Sanitize a URL by adding a scheme if missing and validating it.

        Args:
            url: The URL to sanitize
            default_scheme: The default scheme to add if missing

        Returns:
            Sanitized URL or None if the URL is unsafe
        """
        if not url:
            return None

        # Check for unsafe schemes
        if URLValidator.is_unsafe_scheme(url):
            return None

        # Strip whitespace
        url = url.strip()

        # Parse the URL
        try:
            parsed = urlparse(url)

            # Add scheme if missing
            if not parsed.scheme:
                url = f"{default_scheme}://{url}"
                parsed = urlparse(url)

            # Validate the final URL
            if URLValidator.is_safe_url(url, require_scheme=True):
                return url

        except Exception as e:
            logger.warning(f"Failed to sanitize URL '{url[:100]}': {e}")

        return None

    @staticmethod
    def is_academic_url(url: str) -> bool:
        """
        Check if a URL is from a known academic/research domain.

        Args:
            url: The URL to check

        Returns:
            True if the URL is from an academic domain, False otherwise
        """
        try:
            parsed = urlparse(url)
            if parsed.hostname:
                return URLValidator._hostname_matches(
                    parsed.hostname, URLValidator.TRUSTED_ACADEMIC_DOMAINS
                )
        except Exception:
            pass

        return False

    @staticmethod
    def extract_doi(url: str) -> Optional[str]:
        """
        Extract DOI from a URL if present.

        Args:
            url: The URL to extract DOI from

        Returns:
            The DOI if found, None otherwise
        """
        # Common DOI patterns with explicit pattern identification
        doi_patterns = [
            (
                r"10\.\d{4,}(?:\.\d+)*\/[-._;()\/:a-zA-Z0-9]+",
                0,
            ),  # Direct DOI, group 0
            (r"doi\.org\/(10\.\d{4,}[^\s]*)", 1),  # doi.org URL, group 1
        ]

        for pattern, group_index in doi_patterns:
            match = re.search(pattern, url, re.IGNORECASE)
            if match:
                return match.group(group_index)

        return None

    @staticmethod
    def validate_http_url(url: str) -> bool:
        """
        Validate that a callback URL is well-formed and safe for HTTP/HTTPS use.

        This is stricter than is_safe_url() and specifically validates HTTP/HTTPS
        URLs for use as application callbacks (e.g., in notifications, redirects).
        It does NOT validate Apprise service URLs which use other protocols.

        Args:
            url: HTTP/HTTPS callback URL to validate

        Returns:
            True if valid

        Raises:
            URLValidationError: If URL is invalid
        """
        if not url or not isinstance(url, str):
            raise URLValidationError("URL must be a non-empty string")

        try:
            parsed = urlparse(url)

            # Must have a scheme
            if not parsed.scheme:
                raise URLValidationError(
                    "URL must have a scheme (http or https)"
                )

            # Must be http or https (callback URLs only)
            if parsed.scheme not in ("http", "https"):
                raise URLValidationError(
                    f"URL scheme must be http or https, got: {parsed.scheme}"
                )

            # Use the general security validator for additional safety
            if not URLValidator.is_safe_url(url, require_scheme=True):
                raise URLValidationError(
                    f"URL failed security validation: {url}"
                )

            # Must have a netloc (hostname)
            if not parsed.netloc:
                raise URLValidationError("URL must have a hostname")

            # Check for obvious hostname issues
            if parsed.netloc.startswith(".") or parsed.netloc.endswith("."):
                raise URLValidationError(f"Invalid hostname: {parsed.netloc}")

            # Path should be valid if present
            if parsed.path and not parsed.path.startswith("/"):
                raise URLValidationError(
                    f"URL path must start with /: {parsed.path}"
                )

            return True

        except Exception as e:
            if isinstance(e, URLValidationError):
                raise
            # Chain the original parsing failure for easier debugging (PEP 3134)
            raise URLValidationError(f"Failed to validate URL: {e}") from e

    @staticmethod
    def is_safe_redirect_url(target: str, host_url: str) -> bool:
        """
        Validate that a redirect target is safe (same host).

        Prevents open redirect attacks by ensuring the target URL
        is either relative or points to the same host as the application.
        Also prevents CRLF injection attacks by rejecting URLs containing
        carriage return or line feed characters.

        Uses the standard Flask pattern from:
        https://github.com/fengsp/flask-snippets/blob/master/security/redirect_back.py

        Security protections implemented:
        - CRLF injection prevention
        - Protocol-relative URL bypass (//evil.com)
        - Triple-slash bypass (///evil.com)
        - URL-encoded bypass attempts
        - Backslash bypass (treated as forward slash by some browsers)
        - Path traversal blocking

        Args:
            target: The redirect URL to validate (can be relative or absolute)
            host_url: The application's host URL (e.g., request.host_url)

        Returns:
            True if the URL is safe to redirect to, False otherwise
        """
        if not target:
            return False

        # Prevent CRLF injection by rejecting URLs with CR or LF characters
        if "\r" in target or "\n" in target:
            logger.warning("CRLF injection attempt detected in redirect URL")
            return False

        # Normalize and decode URL to detect encoded bypass attempts
        # This catches %2f%2f (encoded //) and similar tricks
        try:
            from urllib.parse import unquote

            decoded_target = unquote(target)
        except Exception:
            decoded_target = target

        # Check for backslash (treated as forward slash by some browsers)
        # Prevents \evil.com and \\evil.com bypasses
        if "\\" in decoded_target:
            logger.warning("Backslash detected in redirect URL")
            return False

        # Check for protocol-relative URLs before stripping slashes
        # //evil.com would be treated as absolute URL by urljoin()
        if decoded_target.startswith("//"):
            logger.warning("Protocol-relative URL detected in redirect")
            return False

        # Block path traversal patterns in redirect URL path
        # Check both original and decoded to catch single-encoded traversal (%2e%2e)
        original_path = urlparse(target).path
        decoded_path = urlparse(decoded_target).path
        if ".." in original_path or ".." in decoded_path:
            logger.warning(
                "Path traversal detected in redirect URL: %s", target
            )
            return False

        ref_url = urlparse(host_url)
        test_url = urlparse(urljoin(host_url, target))
        return (
            test_url.scheme in ("http", "https")
            and ref_url.netloc == test_url.netloc
        )
def get_javascript_url_validator() -> str:
    """
    Get JavaScript code for URL validation that matches the Python implementation.

    Returns:
        JavaScript code as a string that can be embedded in web pages
    """
    js_source = r"""
    // URL validation utilities matching Python URLValidator
    const URLValidator = {
        UNSAFE_SCHEMES: ['javascript', 'data', 'vbscript', 'about', 'blob', 'file'],
        SAFE_SCHEMES: ['http', 'https', 'ftp', 'ftps'],
        EMAIL_SCHEME: 'mailto',

        isUnsafeScheme: function(url) {
            if (!url) return false;

            const normalizedUrl = url.trim().toLowerCase();

            for (const scheme of this.UNSAFE_SCHEMES) {
                if (normalizedUrl.startsWith(scheme + ':')) {
                    console.warn(`Unsafe URL scheme detected: ${scheme}`);
                    return true;
                }
            }

            return false;
        },

        isSafeUrl: function(url, options = {}) {
            const {
                requireScheme = true,
                allowFragments = true,
                allowMailto = false,
                trustedDomains = []
            } = options;

            if (!url || typeof url !== 'string') {
                return false;
            }

            // Check for unsafe schemes first
            if (this.isUnsafeScheme(url)) {
                return false;
            }

            // Handle fragment-only URLs
            if (url.startsWith('#')) {
                return allowFragments;
            }

            // Parse the URL
            try {
                const parsed = new URL(url, window.location.href);
                const scheme = parsed.protocol.slice(0, -1).toLowerCase(); // Remove trailing ':'

                // Check if it's a mailto link
                if (scheme === this.EMAIL_SCHEME) {
                    return allowMailto;
                }

                // Check if it's a safe scheme
                if (!this.SAFE_SCHEMES.includes(scheme)) {
                    console.warn(`Unsafe URL scheme: ${scheme}`);
                    return false;
                }

                // Validate domain if trusted domains are specified
                if (trustedDomains.length > 0 && parsed.hostname) {
                    const hostname = parsed.hostname.toLowerCase();
                    const isTrusted = trustedDomains.some(domain =>
                        hostname === domain.toLowerCase() ||
                        hostname.endsWith('.' + domain.toLowerCase())
                    );

                    if (!isTrusted) {
                        console.warn(`URL domain not in trusted list: ${parsed.hostname}`);
                        return false;
                    }
                }

                return true;
            } catch (e) {
                console.warn(`Failed to parse URL: ${e.message}`);
                return false;
            }
        },

        sanitizeUrl: function(url, defaultScheme = 'https') {
            if (!url) return null;

            // Check for unsafe schemes
            if (this.isUnsafeScheme(url)) {
                return null;
            }

            // Strip whitespace
            url = url.trim();

            // Add scheme if missing
            if (!url.match(/^[a-zA-Z][a-zA-Z\d+\-.]*:/)) {
                url = `${defaultScheme}://${url}`;
            }

            // Validate the final URL
            if (this.isSafeUrl(url, { requireScheme: true })) {
                return url;
            }

            return null;
        }
    };
    """
    return js_source