Coverage for src / local_deep_research / security / safe_requests.py: 98%
150 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
1"""
2Safe HTTP Requests Wrapper
4Wraps requests library to add SSRF protection and security best practices.
5"""
7import requests
8from typing import Any, Optional
9from urllib.parse import urljoin
11from loguru import logger
13from . import ssrf_validator
# Timeout, in seconds, applied to every HTTP request that does not specify
# its own, so a stalled peer cannot hang the caller.
DEFAULT_TIMEOUT = 30

# Upper bound on a response body (1 GiB), guarding against memory exhaustion.
# Deliberately generous: this is a local research tool and users intentionally
# fetch large documents (annual reports, PDFs, datasets).
MAX_RESPONSE_SIZE = 1 << 30

# Status codes that are treated as redirects.
_REDIRECT_STATUS_CODES = frozenset((301, 302, 303, 307, 308))

# Redirect hops followed before giving up.
_MAX_REDIRECTS = 10
def _install_body_guard(response: requests.Response) -> None:
    """Install bounded readers that enforce MAX_RESPONSE_SIZE on the body.

    Wraps ``response.raw.read`` and, when the raw object provides it,
    ``response.raw.read_chunked``. Both wrappers share one cumulative byte
    counter and raise once it exceeds MAX_RESPONSE_SIZE.

    ``read_chunked`` has to be wrapped as well because urllib3's
    ``HTTPResponse.stream()`` (used by ``iter_content`` and therefore by
    ``.content``/``.text``/``.json()``) serves chunked bodies through
    ``read_chunked()`` and never calls ``read()`` for them. Guarding only
    ``read`` would leave chunked responses — the usual case when
    Content-Length is absent — completely unbounded.

    Always installs; callers decide when the guard is needed.

    Args:
        response: Response whose ``raw`` stream should be guarded.

    Raises:
        ValueError: Raised later, from the wrapped readers, when the
            cumulative body size exceeds MAX_RESPONSE_SIZE. The response
            is closed before raising.
    """
    raw = response.raw
    bytes_read = 0

    def account(chunk) -> None:
        # Shared accounting for both read paths.
        nonlocal bytes_read
        bytes_read += len(chunk)
        if bytes_read > MAX_RESPONSE_SIZE:
            response.close()
            raise ValueError(
                f"Response body too large: >{bytes_read} bytes "
                f"(max {MAX_RESPONSE_SIZE}, Content-Length absent or invalid)"
            )

    original_read = raw.read

    def bounded_read(amt=None, *args, **kwargs):
        data = original_read(amt, *args, **kwargs)
        account(data)
        return data

    raw.read = bounded_read  # type: ignore[method-assign]

    original_read_chunked = getattr(raw, "read_chunked", None)
    if callable(original_read_chunked):

        def bounded_read_chunked(*args, **kwargs):
            # Generator, like the method it replaces; counts each chunk
            # before handing it to the consumer.
            for chunk in original_read_chunked(*args, **kwargs):
                account(chunk)
                yield chunk

        raw.read_chunked = bounded_read_chunked  # type: ignore[method-assign]
def _check_response_size(response: requests.Response) -> None:
    """Reject responses whose Content-Length exceeds MAX_RESPONSE_SIZE.

    Handles comma-separated values per RFC 7230 §3.3.2: identical
    duplicates (from proxies) are normalized; differing values are
    rejected as invalid framing. Empty parts from malformed headers
    (trailing/doubled commas) are filtered before parsing.

    A body guard (``_install_body_guard``) is installed whenever the
    declared length cannot be relied on:

    * Content-Length is absent, unparseable, negative, or consists only
      of commas/whitespace;
    * a Transfer-Encoding header is present. Per RFC 7230 §3.3.3 the
      transfer coding overrides Content-Length, so a small declared
      length says nothing about how many bytes will actually arrive.

    Must be called before returning a response to the caller. On
    rejection the response is closed to avoid leaking the connection.

    Args:
        response: Response to inspect (headers only; the body is not read).

    Raises:
        ValueError: If Content-Length values conflict or exceed
            MAX_RESPONSE_SIZE.
    """
    content_length = response.headers.get("Content-Length")
    if not content_length:
        # No Content-Length header at all — enforce the limit while reading.
        _install_body_guard(response)
        return

    try:
        stripped = (value.strip() for value in content_length.split(","))
        sizes = [int(part) for part in stripped if part]
    except (ValueError, TypeError):
        _install_body_guard(response)
        return  # Content-Length not a valid number

    if not sizes:
        _install_body_guard(response)
        return  # Only commas/whitespace — treat as absent

    if len(set(sizes)) > 1:
        response.close()
        raise ValueError(
            f"Conflicting Content-Length values: {content_length}"
        )

    size = sizes[0]
    if size < 0:
        _install_body_guard(response)
        return  # Malformed Content-Length, treat as absent

    if size > MAX_RESPONSE_SIZE:
        response.close()
        raise ValueError(
            f"Response too large: {size} bytes (max {MAX_RESPONSE_SIZE})"
        )

    if response.headers.get("Transfer-Encoding"):
        # The transfer coding, not Content-Length, frames this body, so the
        # declared size cannot be trusted as an upper bound.
        _install_body_guard(response)
    # Otherwise: valid Content-Length within limit — no body guard needed.
116def _resolve_redirect_method(method: str, status_code: int) -> str:
117 """Determine HTTP method after redirect, per RFC 7231."""
118 if status_code == 303 and method != "HEAD":
119 method = "GET"
120 elif status_code == 302 and method == "POST":
121 method = "GET"
122 elif status_code == 301 and method == "POST":
123 method = "GET"
124 # 307, 308: preserve original method (no change needed)
125 return method
def _should_strip_auth(old_url: str, new_url: str) -> bool:
    """Decide whether credentials must be dropped when redirecting.

    Follows the same rule ``requests`` applies to its own redirects:
    credentials survive only when the host is unchanged and the
    scheme/port either stay the same (treating an omitted port as the
    scheme default) or the hop is a plain http -> https upgrade on the
    standard ports.
    """
    # Local import: the module itself only imports ``urljoin``.
    from urllib.parse import urlparse

    default_ports = {"http": 80, "https": 443}
    try:
        old, new = urlparse(old_url), urlparse(new_url)
        old_port, new_port = old.port, new.port
    except ValueError:
        return True  # Unparseable port — fail closed.

    if old.hostname != new.hostname:
        return True
    if (
        old.scheme == "http"
        and old_port in (80, None)
        and new.scheme == "https"
        and new_port in (443, None)
    ):
        return False
    if old.scheme != new.scheme:
        return True
    implicit = (default_ports.get(old.scheme), None)
    if old_port in implicit and new_port in implicit:
        return False
    return old_port != new_port


def _without_credentials(request_kwargs: dict) -> dict:
    """Return a copy of ``request_kwargs`` without ``auth``/Authorization."""

    def is_authorization(name) -> bool:
        if isinstance(name, bytes):
            name = name.decode("latin-1")
        return str(name).lower() == "authorization"

    cleaned = {k: v for k, v in request_kwargs.items() if k != "auth"}
    headers = cleaned.get("headers")
    if headers and any(is_authorization(name) for name in headers):
        # Build a new mapping so the caller's headers object is not mutated.
        cleaned["headers"] = {
            name: value
            for name, value in headers.items()
            if not is_authorization(name)
        }
    return cleaned


def _request_with_validated_redirects(
    method: str,
    url: str,
    *,
    params: Optional[dict] = None,
    data: Optional[Any] = None,
    json: Optional[dict] = None,
    timeout: int,
    allow_localhost: bool,
    allow_private_ips: bool,
    request_kwargs: dict,
) -> requests.Response:
    """Shared implementation of ``safe_get`` and ``safe_post``.

    Validates ``url``, issues the request via ``requests.get`` /
    ``requests.post`` and follows redirects manually so every hop is
    checked against the SSRF rules. ``request_kwargs`` is the caller's
    ``**kwargs`` dict and is modified in place.
    """
    if not ssrf_validator.validate_url(
        url,
        allow_localhost=allow_localhost,
        allow_private_ips=allow_private_ips,
    ):
        raise ValueError(
            f"URL failed security validation (possible SSRF): {url}"
        )

    # Ensure timeout is set
    request_kwargs.setdefault("timeout", timeout)

    # Redirects are handled here so each target can be validated.
    caller_wants_redirects = request_kwargs.pop("allow_redirects", True)
    request_kwargs["allow_redirects"] = False

    # Always stream on the wire. Without this, requests downloads the whole
    # body before returning, i.e. before the size check below can reject it.
    # Non-streaming callers get the body preloaded after the check.
    caller_wants_stream = bool(request_kwargs.pop("stream", False))
    request_kwargs["stream"] = True

    current_url = url
    try:
        if method == "GET":
            response = requests.get(url, params=params, **request_kwargs)
        else:
            response = requests.post(
                url, data=data, json=json, **request_kwargs
            )

        # Each hop is a fresh sessionless request, so cookies set by
        # intermediate responses are not carried forward. Callers needing
        # cookie persistence should use SafeSession instead.
        if caller_wants_redirects:
            redirects_followed = 0
            while (
                response.status_code in _REDIRECT_STATUS_CODES
                and redirects_followed < _MAX_REDIRECTS
            ):
                location = (response.headers.get("Location") or "").strip()
                if not location:
                    break

                # Resolve relative redirects
                previous_url = response.url or current_url
                redirect_url = urljoin(previous_url, location)

                # Validate redirect target against SSRF rules
                if not ssrf_validator.validate_url(
                    redirect_url,
                    allow_localhost=allow_localhost,
                    allow_private_ips=allow_private_ips,
                ):
                    logger.warning(
                        f"Redirect to {redirect_url} blocked by SSRF validation "
                        f"(from {url}, hop {redirects_followed + 1})"
                    )
                    response.close()
                    raise ValueError(
                        f"Redirect target failed SSRF validation: {redirect_url}"
                    )

                # Never hand the caller's credentials to a different origin.
                # Once dropped they stay dropped for the remaining hops.
                if _should_strip_auth(previous_url, redirect_url):
                    request_kwargs = _without_credentials(request_kwargs)

                method = _resolve_redirect_method(method, response.status_code)
                current_url = redirect_url
                response.close()

                # Original query params are intentionally NOT forwarded: the
                # Location header already contains the complete target URL.
                if method == "GET":
                    # 301/302/303 after POST (or any GET hop): no body.
                    response = requests.get(redirect_url, **request_kwargs)
                else:
                    # 307/308: preserve method and body.
                    response = requests.post(
                        redirect_url, data=data, json=json, **request_kwargs
                    )
                redirects_followed += 1

            if (
                response.status_code in _REDIRECT_STATUS_CODES
                and redirects_followed >= _MAX_REDIRECTS
            ):
                response.close()
                # Note: raises ValueError here, while SafeSession raises
                # requests.TooManyRedirects (delegated to the base class).
                raise ValueError(
                    f"Too many redirects ({_MAX_REDIRECTS}) from {url}"
                )

        _check_response_size(response)

        if not caller_wants_stream:
            # Preload the body, as a non-streaming requests call would, but
            # only after the size check has run.
            _ = response.content

        return response

    except requests.Timeout:
        logger.warning(f"Request timeout after {timeout}s: {current_url}")
        raise
    except requests.RequestException:
        logger.warning(f"Request failed for {current_url}")
        raise


def safe_get(
    url: str,
    params: Optional[dict] = None,
    timeout: int = DEFAULT_TIMEOUT,
    allow_localhost: bool = False,
    allow_private_ips: bool = False,
    **kwargs,
) -> requests.Response:
    """
    Make a safe HTTP GET request with SSRF protection.

    Redirects are followed manually (up to ``_MAX_REDIRECTS``) with SSRF
    validation on every hop. ``auth`` and ``Authorization`` headers are
    dropped when a redirect leaves the original origin. The response size
    limit is checked before the body is downloaded.

    Args:
        url: URL to request
        params: URL parameters (sent with the initial request only)
        timeout: Request timeout in seconds
        allow_localhost: Whether to allow localhost/loopback addresses.
            Set to True for trusted internal services like self-hosted
            search engines (e.g., searxng). Default False.
        allow_private_ips: Whether to allow all private/internal IPs plus
            localhost. This includes RFC1918 (10.x, 172.16-31.x, 192.168.x),
            CGNAT (100.64.x.x used by Podman/rootless containers), link-local
            (169.254.x.x), and IPv6 private ranges (fc00::/7, fe80::/10).
            Use for trusted self-hosted services like SearXNG or Ollama in
            containerized environments.
            Note: AWS metadata endpoint (169.254.169.254) is ALWAYS blocked.
        **kwargs: Additional arguments to pass to requests.get().
            ``allow_redirects`` and ``stream`` are honoured but implemented
            by this wrapper.

    Returns:
        Response object

    Raises:
        ValueError: If the URL or a redirect target fails SSRF validation,
            if there are too many redirects, or if the response exceeds
            MAX_RESPONSE_SIZE.
        requests.RequestException: If request fails
    """
    return _request_with_validated_redirects(
        "GET",
        url,
        params=params,
        timeout=timeout,
        allow_localhost=allow_localhost,
        allow_private_ips=allow_private_ips,
        request_kwargs=kwargs,
    )


def safe_post(
    url: str,
    data: Optional[Any] = None,
    json: Optional[dict] = None,
    timeout: int = DEFAULT_TIMEOUT,
    allow_localhost: bool = False,
    allow_private_ips: bool = False,
    **kwargs,
) -> requests.Response:
    """
    Make a safe HTTP POST request with SSRF protection.

    Redirects are followed manually (up to ``_MAX_REDIRECTS``) with SSRF
    validation on every hop. 301/302/303 turn the request into a body-less
    GET; 307/308 re-send the POST with its body. ``auth`` and
    ``Authorization`` headers are dropped when a redirect leaves the
    original origin. The response size limit is checked before the body is
    downloaded.

    Args:
        url: URL to request
        data: Data to send in request body
        json: JSON data to send in request body
        timeout: Request timeout in seconds
        allow_localhost: Whether to allow localhost/loopback addresses.
            Set to True for trusted internal services like self-hosted
            search engines (e.g., searxng). Default False.
        allow_private_ips: Whether to allow all private/internal IPs plus
            localhost. This includes RFC1918 (10.x, 172.16-31.x, 192.168.x),
            CGNAT (100.64.x.x used by Podman/rootless containers), link-local
            (169.254.x.x), and IPv6 private ranges (fc00::/7, fe80::/10).
            Use for trusted self-hosted services like SearXNG or Ollama in
            containerized environments.
            Note: AWS metadata endpoint (169.254.169.254) is ALWAYS blocked.
        **kwargs: Additional arguments to pass to requests.post().
            ``allow_redirects`` and ``stream`` are honoured but implemented
            by this wrapper.

    Returns:
        Response object

    Raises:
        ValueError: If the URL or a redirect target fails SSRF validation,
            if there are too many redirects, or if the response exceeds
            MAX_RESPONSE_SIZE.
        requests.RequestException: If request fails
    """
    return _request_with_validated_redirects(
        "POST",
        url,
        data=data,
        json=json,
        timeout=timeout,
        allow_localhost=allow_localhost,
        allow_private_ips=allow_private_ips,
        request_kwargs=kwargs,
    )
# Session variant of the SSRF-protected request helpers.
class SafeSession(requests.Session):
    """
    Session with built-in SSRF protection.

    Redirect validation relies on ``requests.Session.resolve_redirects()``
    calling ``self.send()`` for each hop — an internal implementation detail
    of the ``requests`` library. This is simpler than re-implementing the
    redirect loop (as ``safe_get``/``safe_post`` do) and keeps session-level
    features (cookies, auth) working. The trade-off is coupling to the
    ``requests`` internals; if a future version stops routing hops through
    ``send()``, redirect targets would no longer be validated.

    Usage:
        with SafeSession() as session:
            response = session.get(url)

        # For trusted internal services (e.g., searxng on localhost):
        with SafeSession(allow_localhost=True) as session:
            response = session.get(url)

        # For trusted internal services on any private network IP:
        with SafeSession(allow_private_ips=True) as session:
            response = session.get(url)

    Raises:
        ValueError: If a URL (initial or redirect target) fails SSRF
            validation, or if the response exceeds MAX_RESPONSE_SIZE.
            Note: ``safe_get``/``safe_post`` also raise ``ValueError`` for
            too-many-redirects, but ``SafeSession`` raises
            ``requests.TooManyRedirects`` for that case since it delegates
            redirect counting to the ``requests`` library.
        requests.RequestException: On transport-level failures.
    """

    def __init__(
        self, allow_localhost: bool = False, allow_private_ips: bool = False
    ):
        """
        Initialize SafeSession.

        Args:
            allow_localhost: Whether to allow localhost/loopback addresses.
            allow_private_ips: Whether to allow all private/internal IPs plus
                localhost. This includes RFC1918, CGNAT (100.64.x.x used by
                Podman), link-local, and IPv6 private ranges. Use for trusted
                self-hosted services like SearXNG or Ollama in containerized
                environments.
                Note: AWS metadata endpoint (169.254.169.254) is ALWAYS blocked.
        """
        super().__init__()
        self.max_redirects = _MAX_REDIRECTS
        self.allow_localhost = allow_localhost
        self.allow_private_ips = allow_private_ips

    def request(self, method: str, url: str, **kwargs) -> requests.Response:  # type: ignore[override]
        """Validate ``url`` against SSRF rules, then delegate to the base class.

        Applies DEFAULT_TIMEOUT when the caller supplies no ``timeout``.

        Raises:
            ValueError: If ``url`` fails SSRF validation.
        """
        if not ssrf_validator.validate_url(
            url,
            allow_localhost=self.allow_localhost,
            allow_private_ips=self.allow_private_ips,
        ):
            raise ValueError(
                f"URL failed security validation (possible SSRF): {url}"
            )

        # Ensure timeout is set
        kwargs.setdefault("timeout", DEFAULT_TIMEOUT)

        return super().request(method, url, **kwargs)

    def send(
        self, request: requests.PreparedRequest, **kwargs
    ) -> requests.Response:
        """Validate and size-check every outgoing request.

        This runs on **all** calls — both the initial request (routed
        here by ``requests.Session.request()``) and each redirect hop
        (routed here by ``resolve_redirects()``). The initial URL is
        therefore validated twice (once in ``request()``, once here);
        this is intentional defense-in-depth.

        The base ``send()`` is always invoked with ``stream=True``. With
        ``stream`` false it would read the entire body before returning,
        so the size check would only run after the memory was already
        consumed. For non-streaming callers the body is preloaded here,
        after the check, so the returned response looks the same to them.

        Raises:
            ValueError: If the URL fails SSRF validation or the response
                exceeds MAX_RESPONSE_SIZE.
        """
        if request.url and not ssrf_validator.validate_url(
            request.url,
            allow_localhost=self.allow_localhost,
            allow_private_ips=self.allow_private_ips,
        ):
            logger.warning(
                f"Request to {request.url} blocked by SSRF validation"
            )
            # Note: This error says "security validation" while safe_get/
            # safe_post say "SSRF validation". The difference indicates the
            # source (session vs standalone function) in logs.
            raise ValueError(
                f"Redirect target failed security validation (possible SSRF): {request.url}"
            )

        # Same default the base class would apply for a missing/None value.
        caller_wants_stream = kwargs.get("stream")
        if caller_wants_stream is None:
            caller_wants_stream = self.stream
        kwargs["stream"] = True

        response = super().send(request, **kwargs)
        _check_response_size(response)

        if not caller_wants_stream:
            # Preload the body, now that the size check has run.
            # NOTE(review): the body of a redirect response that the base
            # class follows is still consumed inside super().send() before
            # this check runs.
            _ = response.content

        return response