Coverage for src/local_deep_research/security/safe_requests.py: 98%
223 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
1"""
2Safe HTTP Requests Wrapper
4Wraps requests library to add SSRF protection and security best practices.
5"""
7import datetime
8import email.utils
9import time
10from typing import Any, Optional
11from urllib.parse import urljoin
13import requests
14from loguru import logger
16from . import ssrf_validator
17from ..constants import USER_AGENT
18from ..utilities.resource_utils import safe_close
# Timeout applied to every HTTP request by default, so a stalled peer
# cannot hang the caller indefinitely.
DEFAULT_TIMEOUT = 30  # seconds

# Largest response body accepted (1 GiB); anything bigger is rejected to
# prevent memory exhaustion. Deliberately generous: this is a local
# research tool and users intentionally download large documents
# (annual reports, PDFs, datasets).
MAX_RESPONSE_SIZE = 1024**3

# HTTP status codes that are treated as redirects.
_REDIRECT_STATUS_CODES = frozenset((301, 302, 303, 307, 308))

# Number of redirect hops followed before giving up.
_MAX_REDIRECTS = 10

# Leading part of the requests library's stock User-Agent
# (e.g. "python-requests/2.32.5"). `SafeSession.request` uses it to
# detect a session that still carries the upstream default and, in that
# case, substitutes the project `USER_AGENT` so academic API endpoints
# (arXiv, OpenAlex, PubMed, …) can identify us. Kept as a constant so a
# future rename in requests is a one-line edit.
_DEFAULT_REQUESTS_UA_PREFIX = "python-requests"
def _install_body_guard(response: "requests.Response") -> None:
    """Install bounded readers that enforce MAX_RESPONSE_SIZE.

    Wraps ``response.raw.read()`` and, when the raw object provides it,
    ``response.raw.read_chunked()`` so that the cumulative number of
    bytes handed out is tracked and ``ValueError`` is raised as soon as
    MAX_RESPONSE_SIZE is exceeded during body consumption. This
    transparently protects both streamed (``.iter_content``) and
    non-streamed (``.text``, ``.json()``, ``.content``) access patterns.

    Both wrappers share one counter. ``read_chunked`` must be covered
    because urllib3's ``HTTPResponse.stream()`` serves chunked bodies
    through ``read_chunked()`` instead of ``read()``; guarding ``read``
    alone would leave chunked responses (the typical case when
    Content-Length is absent) unbounded.

    Always installs — callers (currently only ``_check_response_size``)
    are responsible for deciding when to call this function.

    Args:
        response: Response whose ``raw`` stream should be bounded.

    Raises:
        ValueError: Not at install time, but later from the wrapped
            readers once more than MAX_RESPONSE_SIZE bytes were read.
            The response is closed before the error is raised.
    """
    raw = response.raw
    original_read = raw.read
    # Not every raw object (test doubles, file-like bodies) has this.
    original_read_chunked = getattr(raw, "read_chunked", None)
    bytes_read = 0

    def _account(chunk) -> None:
        """Add *chunk* to the running total and enforce the limit."""
        nonlocal bytes_read
        bytes_read += len(chunk)
        if bytes_read > MAX_RESPONSE_SIZE:
            response.close()
            raise ValueError(
                f"Response body too large: >{bytes_read} bytes "
                f"(max {MAX_RESPONSE_SIZE}, Content-Length absent or invalid)"
            )

    def bounded_read(amt=None, *args, **kwargs):
        data = original_read(amt, *args, **kwargs)
        _account(data)
        return data

    raw.read = bounded_read  # type: ignore[method-assign]

    if callable(original_read_chunked):

        def bounded_read_chunked(*args, **kwargs):
            # Count each chunk before handing it to the consumer so an
            # oversized body is cut off at the offending chunk.
            for chunk in original_read_chunked(*args, **kwargs):
                _account(chunk)
                yield chunk

        raw.read_chunked = bounded_read_chunked  # type: ignore[method-assign]
def _check_response_size(response: requests.Response) -> None:
    """Reject responses whose Content-Length exceeds MAX_RESPONSE_SIZE.

    Handles comma-separated values per RFC 7230 §3.3.2: identical
    duplicates (from proxies) are normalized; differing values are
    rejected as invalid framing. Empty parts from malformed headers
    (trailing/doubled commas) are filtered before parsing. Non-integer
    or negative values cause the header to be treated as absent.

    A body guard (see ``_install_body_guard``) that enforces the limit
    during body consumption is installed whenever the declared length
    cannot be trusted to bound the body:

    * Content-Length is absent, unparseable, negative, or consists only
      of commas/whitespace;
    * a ``Transfer-Encoding`` header is present — per RFC 7230 §3.3.3 it
      overrides Content-Length, so the declared length says nothing
      about how many bytes will actually arrive;
    * a ``Content-Encoding`` header is present — Content-Length then
      describes the encoded payload, while the decoded body handed to
      the caller can be far larger.

    Must be called before returning a response to the caller. On
    rejection the response is closed to avoid leaking the connection.

    Raises:
        ValueError: If Content-Length values conflict or exceed
            MAX_RESPONSE_SIZE.
    """
    headers = response.headers
    content_length = headers.get("Content-Length")
    if not content_length:
        # No Content-Length header at all — install body guard
        _install_body_guard(response)
        return

    try:
        # Multiple identical values may be sent by proxies; empty parts
        # come from trailing/doubled commas and are ignored.
        parts = [p for p in (v.strip() for v in content_length.split(",")) if p]
        sizes = [int(p) for p in parts]
    except (ValueError, TypeError):
        _install_body_guard(response)
        return  # Content-Length not a valid number

    if not sizes:
        _install_body_guard(response)
        return  # Only commas/whitespace — treat as absent

    if len(set(sizes)) > 1:
        # Differing values indicate invalid framing and must be rejected.
        response.close()
        raise ValueError(f"Conflicting Content-Length values: {content_length}")

    size = sizes[0]
    if size < 0:
        _install_body_guard(response)
        return  # Malformed Content-Length, treat as absent

    if size > MAX_RESPONSE_SIZE:
        response.close()
        raise ValueError(
            f"Response too large: {size} bytes (max {MAX_RESPONSE_SIZE})"
        )

    # The declared length is within the limit, but it only bounds the
    # body when no transfer/content coding is applied on top of it.
    if headers.get("Transfer-Encoding") or headers.get("Content-Encoding"):
        _install_body_guard(response)
def _resolve_redirect_method(method: str, status_code: int) -> str:
    """Return the HTTP method to use for the request following a redirect.

    Follows RFC 7231: a 303 turns everything except HEAD into GET, and a
    301/302 turns POST into GET. Every other combination — notably
    307/308 — keeps the method unchanged.
    """
    see_other = status_code == 303 and method != "HEAD"
    rewritten_post = status_code in (301, 302) and method == "POST"
    return "GET" if see_other or rewritten_post else method
def _headers_with_user_agent(headers: Optional[dict]) -> dict:
    """Return a copy of *headers* that carries a User-Agent.

    A caller-supplied ``User-Agent`` (matched case-insensitively) is
    kept; otherwise the project ``USER_AGENT`` is added. The caller's
    mapping is copied, never mutated.
    """
    merged = dict(headers or {})
    if not any(name.lower() == "user-agent" for name in merged):
        merged["User-Agent"] = USER_AGENT
    return merged


def _is_cross_origin_redirect(old_url: str, new_url: str) -> bool:
    """Return True when credentials must not follow a redirect hop.

    Credentials are kept only when host, scheme and effective port are
    unchanged, or for an ``http`` → ``https`` upgrade on the standard
    ports of the same host. URLs whose port cannot be parsed are treated
    as cross-origin (fail closed).
    """
    # Local import keeps this block independent of the file-level imports.
    from urllib.parse import urlsplit

    default_ports = {"http": 80, "https": 443}
    try:
        old, new = urlsplit(old_url), urlsplit(new_url)
        old_port = old.port or default_ports.get(old.scheme)
        new_port = new.port or default_ports.get(new.scheme)
    except ValueError:
        return True
    if old.hostname != new.hostname:
        return True
    if (old.scheme, old_port) == (new.scheme, new_port):
        return False
    is_https_upgrade = (
        old.scheme == "http"
        and old_port == 80
        and new.scheme == "https"
        and new_port == 443
    )
    return not is_https_upgrade


def _without_credentials(kwargs: dict) -> dict:
    """Return a copy of request *kwargs* without ``auth``/``Authorization``."""
    cleaned = dict(kwargs)
    cleaned.pop("auth", None)
    cleaned["headers"] = {
        name: value
        for name, value in (kwargs.get("headers") or {}).items()
        if name.lower() != "authorization"
    }
    return cleaned


def _validated_redirect_target(
    response: requests.Response,
    current_url: str,
    origin_url: str,
    hop: int,
    allow_localhost: bool,
    allow_private_ips: bool,
) -> Optional[str]:
    """Return the absolute, SSRF-validated redirect target of *response*.

    Returns ``None`` when the response has no usable ``Location`` header.
    When the target fails SSRF validation the response is closed, a
    warning is logged and ``ValueError`` is raised.
    """
    location = (response.headers.get("Location") or "").strip()
    if not location:
        return None

    # Resolve relative redirects
    redirect_url = urljoin(response.url or current_url, location)

    # Validate redirect target against SSRF rules
    if not ssrf_validator.validate_url(
        redirect_url,
        allow_localhost=allow_localhost,
        allow_private_ips=allow_private_ips,
    ):
        logger.warning(
            f"Redirect to {redirect_url} blocked by SSRF validation "
            f"(from {origin_url}, hop {hop})"
        )
        response.close()
        raise ValueError(
            f"Redirect target failed SSRF validation: {redirect_url}"
        )
    return redirect_url


def safe_get(
    url: str,
    params: Optional[dict] = None,
    timeout: int = DEFAULT_TIMEOUT,
    allow_localhost: bool = False,
    allow_private_ips: bool = False,
    **kwargs,
) -> requests.Response:
    """
    Make a safe HTTP GET request with SSRF protection.

    Redirects are followed manually so that every hop is validated
    against the SSRF rules. Credentials supplied via ``auth=`` or an
    ``Authorization`` header are dropped as soon as a redirect leaves
    the original origin, so they are never sent to a host the caller did
    not address.

    Args:
        url: URL to request
        params: URL parameters
        timeout: Request timeout in seconds
        allow_localhost: Whether to allow localhost/loopback addresses.
            Set to True for trusted internal services like self-hosted
            search engines (e.g., searxng). Default False.
        allow_private_ips: Whether to allow all private/internal IPs plus localhost.
            This includes RFC1918 (10.x, 172.16-31.x, 192.168.x), CGNAT (100.64.x.x
            used by Podman/rootless containers), link-local (169.254.x.x), and IPv6
            private ranges (fc00::/7, fe80::/10). Use for trusted self-hosted services
            like SearXNG or Ollama in containerized environments.
            Note: cloud metadata endpoints (AWS / Azure / OCI / DigitalOcean /
            AlibabaCloud / Tencent / ECS) are ALWAYS blocked — see
            ``ssrf_validator.ALWAYS_BLOCKED_METADATA_IPS``.
        **kwargs: Additional arguments to pass to requests.get()

    Returns:
        Response object

    Raises:
        ValueError: If the URL or a redirect target fails SSRF validation,
            if there are too many redirects, or if the response is too
            large.
        requests.RequestException: If request fails
    """
    # Validate URL to prevent SSRF
    if not ssrf_validator.validate_url(
        url,
        allow_localhost=allow_localhost,
        allow_private_ips=allow_private_ips,
    ):
        raise ValueError(
            f"URL failed security validation (possible SSRF): {url}"
        )

    # `timeout` is a named parameter, so it can never arrive via **kwargs;
    # always forward it.
    kwargs["timeout"] = timeout

    # Inject the project User-Agent if the caller didn't supply one
    # (works on a copy of the caller's headers).
    kwargs["headers"] = _headers_with_user_agent(kwargs.get("headers"))

    # Intercept allow_redirects — we handle redirects manually to validate
    # each redirect target against SSRF rules
    caller_wants_redirects = kwargs.pop("allow_redirects", True)
    kwargs["allow_redirects"] = False

    current_url = url
    try:
        response = requests.get(url, params=params, **kwargs)

        # Follow redirects manually with SSRF validation on each hop.
        # Each hop uses a fresh requests.get() call without a session,
        # so cookies set by intermediate responses are not carried
        # forward. Callers needing cookie persistence across redirects
        # should use SafeSession instead.
        if caller_wants_redirects:
            redirects_followed = 0
            while (
                response.status_code in _REDIRECT_STATUS_CODES
                and redirects_followed < _MAX_REDIRECTS
            ):
                redirect_url = _validated_redirect_target(
                    response,
                    current_url,
                    url,
                    redirects_followed + 1,
                    allow_localhost,
                    allow_private_ips,
                )
                if redirect_url is None:
                    break

                # Never forward credentials to a different origin; once
                # dropped they stay dropped for the remaining hops.
                if _is_cross_origin_redirect(current_url, redirect_url):
                    kwargs = _without_credentials(kwargs)

                current_url = redirect_url
                response.close()
                # Note: params are intentionally NOT forwarded to redirect
                # hops. The server's Location header contains the complete
                # target URL; re-appending the original query params
                # would corrupt it.
                response = requests.get(redirect_url, **kwargs)
                redirects_followed += 1

            if (
                response.status_code in _REDIRECT_STATUS_CODES
                and redirects_followed >= _MAX_REDIRECTS
            ):
                response.close()
                # Note: raises ValueError here, while SafeSession raises
                # requests.TooManyRedirects (delegated to the base class).
                raise ValueError(
                    f"Too many redirects ({_MAX_REDIRECTS}) from {url}"
                )

        # NOTE(review): unless the caller passes stream=True, requests
        # appears to have downloaded the whole body before returning, so
        # this check can then only reject after the fact — confirm.
        _check_response_size(response)

        return response

    except requests.Timeout:
        logger.warning(f"Request timeout after {timeout}s: {current_url}")
        raise
    except requests.RequestException:
        logger.warning(f"Request failed for {current_url}")
        raise


def safe_post(
    url: str,
    data: Optional[Any] = None,
    json: Optional[dict] = None,
    timeout: int = DEFAULT_TIMEOUT,
    allow_localhost: bool = False,
    allow_private_ips: bool = False,
    **kwargs,
) -> requests.Response:
    """
    Make a safe HTTP POST request with SSRF protection.

    Redirects are followed manually so that every hop is validated
    against the SSRF rules. The method for each hop is chosen by
    ``_resolve_redirect_method``; when it becomes GET the request body
    (``data``, ``json``, ``files``) is dropped. Credentials supplied via
    ``auth=`` or an ``Authorization`` header are dropped as soon as a
    redirect leaves the original origin.

    Args:
        url: URL to request
        data: Data to send in request body
        json: JSON data to send in request body
        timeout: Request timeout in seconds
        allow_localhost: Whether to allow localhost/loopback addresses.
            Set to True for trusted internal services like self-hosted
            search engines (e.g., searxng). Default False.
        allow_private_ips: Whether to allow all private/internal IPs plus localhost.
            This includes RFC1918 (10.x, 172.16-31.x, 192.168.x), CGNAT (100.64.x.x
            used by Podman/rootless containers), link-local (169.254.x.x), and IPv6
            private ranges (fc00::/7, fe80::/10). Use for trusted self-hosted services
            like SearXNG or Ollama in containerized environments.
            Note: cloud metadata endpoints (AWS / Azure / OCI / DigitalOcean /
            AlibabaCloud / Tencent / ECS) are ALWAYS blocked — see
            ``ssrf_validator.ALWAYS_BLOCKED_METADATA_IPS``.
        **kwargs: Additional arguments to pass to requests.post()

    Returns:
        Response object

    Raises:
        ValueError: If the URL or a redirect target fails SSRF validation,
            if there are too many redirects, or if the response is too
            large.
        requests.RequestException: If request fails
    """
    # Validate URL to prevent SSRF
    if not ssrf_validator.validate_url(
        url,
        allow_localhost=allow_localhost,
        allow_private_ips=allow_private_ips,
    ):
        raise ValueError(
            f"URL failed security validation (possible SSRF): {url}"
        )

    # `timeout` is a named parameter, so it can never arrive via **kwargs;
    # always forward it.
    kwargs["timeout"] = timeout

    # Inject the project User-Agent if the caller didn't supply one
    # (works on a copy of the caller's headers).
    kwargs["headers"] = _headers_with_user_agent(kwargs.get("headers"))

    # Intercept allow_redirects — we handle redirects manually to validate
    # each redirect target against SSRF rules
    caller_wants_redirects = kwargs.pop("allow_redirects", True)
    kwargs["allow_redirects"] = False

    current_url = url
    try:
        response = requests.post(url, data=data, json=json, **kwargs)

        # Follow redirects manually with SSRF validation on each hop.
        # Each hop uses a fresh request without a session, so cookies
        # set by intermediate responses are not carried forward. Callers
        # needing cookie persistence should use SafeSession instead.
        if caller_wants_redirects:
            redirect_method = "POST"
            redirects_followed = 0
            while (
                response.status_code in _REDIRECT_STATUS_CODES
                and redirects_followed < _MAX_REDIRECTS
            ):
                redirect_url = _validated_redirect_target(
                    response,
                    current_url,
                    url,
                    redirects_followed + 1,
                    allow_localhost,
                    allow_private_ips,
                )
                if redirect_url is None:
                    break

                redirect_method = _resolve_redirect_method(
                    redirect_method, response.status_code
                )

                # Never forward credentials to a different origin; once
                # dropped they stay dropped for the remaining hops.
                if _is_cross_origin_redirect(current_url, redirect_url):
                    kwargs = _without_credentials(kwargs)

                current_url = redirect_url
                response.close()

                if redirect_method == "GET":
                    # 301/302/303: convert to GET, drop every form of body
                    data = None
                    json = None
                    kwargs.pop("files", None)
                    response = requests.get(redirect_url, **kwargs)
                else:
                    # 307/308: preserve current method and body
                    response = requests.post(
                        redirect_url, data=data, json=json, **kwargs
                    )
                redirects_followed += 1

            if (
                response.status_code in _REDIRECT_STATUS_CODES
                and redirects_followed >= _MAX_REDIRECTS
            ):
                response.close()
                # Note: raises ValueError here, while SafeSession raises
                # requests.TooManyRedirects (delegated to the base class).
                raise ValueError(
                    f"Too many redirects ({_MAX_REDIRECTS}) from {url}"
                )

        _check_response_size(response)

        return response

    except requests.Timeout:
        logger.warning(f"Request timeout after {timeout}s: {current_url}")
        raise
    except requests.RequestException:
        logger.warning(f"Request failed for {current_url}")
        raise
# Session variant of the safe request helpers
class SafeSession(requests.Session):
    """
    ``requests.Session`` subclass with built-in SSRF protection.

    Redirect hops are validated by overriding ``send()``: the stock
    ``requests.Session.resolve_redirects()`` routes every hop through
    ``self.send()``. That is an internal implementation detail of the
    ``requests`` library, but relying on it is simpler than
    re-implementing the redirect loop (as ``safe_get``/``safe_post`` do)
    and keeps session-level features (cookies, auth) working. The price
    is coupling to ``requests`` internals: should a future version stop
    routing hops through ``send()``, redirect targets would no longer be
    validated.

    Usage:
        with SafeSession() as session:
            response = session.get(url)

        # For trusted internal services (e.g., searxng on localhost):
        with SafeSession(allow_localhost=True) as session:
            response = session.get(url)

        # For trusted internal services on any private network IP:
        with SafeSession(allow_private_ips=True) as session:
            response = session.get(url)

    Raises:
        ValueError: If a URL (initial or redirect target) fails SSRF
            validation, or if the response Content-Length exceeds
            MAX_RESPONSE_SIZE. Note: ``safe_get``/``safe_post`` also raise
            ``ValueError`` for too-many-redirects, but ``SafeSession`` raises
            ``requests.TooManyRedirects`` for that case since it delegates
            redirect counting to the ``requests`` library.
        requests.RequestException: On transport-level failures.
    """

    def __init__(
        self, allow_localhost: bool = False, allow_private_ips: bool = False
    ):
        """
        Create a session and remember its SSRF policy.

        Args:
            allow_localhost: Whether to allow localhost/loopback addresses.
            allow_private_ips: Whether to allow all private/internal IPs plus localhost.
                This includes RFC1918, CGNAT (100.64.x.x used by Podman), link-local, and
                IPv6 private ranges. Use for trusted self-hosted services like SearXNG or
                Ollama in containerized environments.
                Note: cloud metadata endpoints (AWS / Azure / OCI / DigitalOcean /
                AlibabaCloud / Tencent / ECS) are ALWAYS blocked — see
                ``ssrf_validator.ALWAYS_BLOCKED_METADATA_IPS``.
        """
        super().__init__()
        self.allow_localhost = allow_localhost
        self.allow_private_ips = allow_private_ips
        self.max_redirects = _MAX_REDIRECTS

    def request(self, method: str, url: str, **kwargs) -> requests.Response:  # type: ignore[override]
        """Validate *url*, apply default timeout and User-Agent, then send."""
        url_is_allowed = ssrf_validator.validate_url(
            url,
            allow_localhost=self.allow_localhost,
            allow_private_ips=self.allow_private_ips,
        )
        if not url_is_allowed:
            raise ValueError(
                f"URL failed security validation (possible SSRF): {url}"
            )

        # Only fills in a timeout when the caller passed none at all.
        kwargs.setdefault("timeout", DEFAULT_TIMEOUT)

        # Per-request headers are copied so caller state is never mutated;
        # the session-level headers (self.headers) are left alone.
        request_headers = dict(kwargs.get("headers") or {})
        caller_set_ua = any(
            name.lower() == "user-agent" for name in request_headers
        )
        if not caller_set_ua:
            # Replace only a missing or stock ("python-requests/…")
            # session User-Agent with the project one.
            session_ua = self.headers.get("User-Agent", "")
            if not session_ua or session_ua.startswith(
                _DEFAULT_REQUESTS_UA_PREFIX
            ):
                request_headers["User-Agent"] = USER_AGENT
        kwargs["headers"] = request_headers

        return super().request(method, url, **kwargs)

    def send(
        self, request: requests.PreparedRequest, **kwargs
    ) -> requests.Response:
        """Validate every outgoing request against SSRF, then send it.

        Runs for **all** calls — the initial request (routed here by
        ``requests.Session.request()``) as well as each redirect hop
        (routed here by ``resolve_redirects()``). The initial URL is
        consequently validated twice (in ``request()`` and here), which
        is intentional defense-in-depth.
        """
        url_is_blocked = bool(request.url) and not ssrf_validator.validate_url(
            request.url,
            allow_localhost=self.allow_localhost,
            allow_private_ips=self.allow_private_ips,
        )
        if url_is_blocked:
            logger.warning(
                f"Request to {request.url} blocked by SSRF validation"
            )
            # This message says "security validation" where safe_get/
            # safe_post say "SSRF validation"; the wording tells the two
            # sources (session vs standalone function) apart in logs.
            raise ValueError(
                f"Redirect target failed security validation (possible SSRF): {request.url}"
            )

        response = super().send(request, **kwargs)
        _check_response_size(response)
        return response
# Exponential backoff schedule, in seconds. Intentionally short:
# journal-quality downloads are triggered by a user request or a
# scheduled job rather than a time-sensitive hot path, so three retries
# spread over ~7 seconds are plenty without adding real latency.
_RETRY_BACKOFF_SECONDS = (1, 2, 4)

# HTTP status codes considered transient (server / rate-limit errors)
# and therefore worth retrying.
_RETRYABLE_STATUS_CODES = frozenset((429, 500, 502, 503, 504))

# Ceiling, in seconds, on a Retry-After value we are willing to honor.
# RFC 7231 sets no upper limit on the header, so a hostile or
# misconfigured upstream could pin a worker with an arbitrarily large
# value. Legitimate waits (seconds to low minutes) pass through.
_MAX_RETRY_AFTER_SECONDS = 5 * 60
def _parse_retry_after(retry_after_raw: Optional[str]) -> Optional[int]:
    """Parse a ``Retry-After`` header value, clamped to ``[0, MAX]``.

    Accepts both RFC 7231 forms: delay-seconds (integer) and HTTP-date.
    An HTTP-date without usable zone information (for example a
    ``-0000`` offset, which ``email.utils.parsedate_to_datetime`` returns
    as a naive datetime) is interpreted as UTC.

    Args:
        retry_after_raw: Raw header value, or ``None`` if the header is
            missing.

    Returns:
        Seconds to wait, clamped to ``[0, _MAX_RETRY_AFTER_SECONDS]``, or
        ``None`` if the header is missing or unparseable so the caller
        can fall back to the exponential-backoff schedule.
    """
    if retry_after_raw is None:
        return None
    try:
        seconds = int(retry_after_raw)
    except ValueError:
        try:
            retry_dt = email.utils.parsedate_to_datetime(retry_after_raw)
        except (ValueError, TypeError):
            logger.debug(
                f"Unparseable Retry-After {retry_after_raw!r}; "
                f"using backoff schedule"
            )
            return None
        if retry_dt.tzinfo is None:
            # Naive result: subtracting it from an aware "now" would
            # raise TypeError, so pin it to UTC first.
            retry_dt = retry_dt.replace(tzinfo=datetime.timezone.utc)
        now_utc = datetime.datetime.now(datetime.timezone.utc)
        seconds = int((retry_dt - now_utc).total_seconds())
    return max(0, min(seconds, _MAX_RETRY_AFTER_SECONDS))
def safe_get_with_retries(
    url: str,
    params: Optional[dict] = None,
    timeout: int = DEFAULT_TIMEOUT,
    allow_localhost: bool = False,
    allow_private_ips: bool = False,
    max_retries: int = 3,
    backoff_times: tuple = _RETRY_BACKOFF_SECONDS,
    consume_body: bool = False,
    **kwargs,
) -> requests.Response:
    """`safe_get` plus exponential-backoff retry on transient errors.

    Retries on:
      * ``requests.ConnectionError``
      * ``requests.Timeout``
      * the HTTP statuses in ``_RETRYABLE_STATUS_CODES`` (``429`` rate
        limit and ``500``/``502``/``503``/``504`` server errors)
      * (when ``consume_body=True``) body-read failures —
        ``ChunkedEncodingError``, ``ReadTimeout``, mid-stream
        ``ConnectionError``

    Honors the ``Retry-After`` header when present (falls back to the
    backoff schedule otherwise). SSRF-validation errors (``ValueError``)
    and non-retryable HTTP responses are not retried.

    Without ``consume_body``, only failures raised inside ``safe_get``
    itself (DNS, connect, header timeout, retryable status) trigger a
    retry. The body isn't read until the caller touches ``.content`` /
    ``.text`` / ``.json()``, by which point this wrapper has already
    returned — so a mid-stream S3 hiccup (``ChunkedEncodingError``)
    propagates uncaught. ``consume_body=True`` reads the body inside
    the retry loop so those transient body-read failures are also
    retried. The cached body is still available to the caller via
    ``response.content`` after the wrapper returns.

    Args:
        url: Target URL.
        params: Query parameters.
        timeout: Per-attempt socket timeout.
        allow_localhost: Forwarded to ``safe_get``.
        allow_private_ips: Forwarded to ``safe_get``.
        max_retries: Maximum retry attempts after the initial try.
        backoff_times: Per-attempt sleep seconds; the last entry is
            reused once the schedule is exhausted. An empty sequence
            means "retry without waiting".
        consume_body: If True, read ``response.content`` inside the
            retry loop so body-read transients are retried. Use for
            large or chunk-transferred bodies (~MB+) where mid-stream
            disconnects are realistic. The body-guard's ``ValueError``
            (oversized body) is NOT retried — it propagates immediately.
        **kwargs: Forwarded to ``safe_get``.

    Returns:
        The first successful (or final-attempt) ``requests.Response``.
        When ``consume_body=True``, the body has already been read and
        is cached on the response.

    Raises:
        ValueError: If SSRF validation fails or, with
            ``consume_body=True``, the body-guard rejects an oversized
            response. Retries do not help in either case.
        requests.RequestException: If every attempt fails.
    """

    def _scheduled_wait(attempt_index: int):
        """Return the backoff delay for the given zero-based attempt."""
        if not backoff_times:
            # Indexing an empty schedule would raise IndexError.
            return 0
        return backoff_times[min(attempt_index, len(backoff_times) - 1)]

    attempt = 0
    while True:
        try:
            response = safe_get(
                url,
                params=params,
                timeout=timeout,
                allow_localhost=allow_localhost,
                allow_private_ips=allow_private_ips,
                **kwargs,
            )
        except (requests.ConnectionError, requests.Timeout) as exc:
            if attempt >= max_retries:
                raise
            wait = _scheduled_wait(attempt)
            logger.warning(
                f"{exc.__class__.__name__} on {url}; "
                f"retrying in {wait}s "
                f"(attempt {attempt + 1}/{max_retries})"
            )
            time.sleep(wait)
            attempt += 1
            continue

        if response.status_code in _RETRYABLE_STATUS_CODES:
            if attempt >= max_retries:
                # Out of retries: hand the last response to the caller.
                return response
            parsed = _parse_retry_after(response.headers.get("Retry-After"))
            wait = parsed if parsed is not None else _scheduled_wait(attempt)
            logger.warning(
                f"HTTP {response.status_code} on {url}; "
                f"retrying in {wait}s "
                f"(attempt {attempt + 1}/{max_retries})"
            )
            response.close()
            time.sleep(wait)
            attempt += 1
            continue

        if consume_body:
            try:
                # Force body read while still inside the retry loop.
                # ChunkedEncodingError / ReadTimeout / mid-stream
                # ConnectionError can fire here on large responses
                # from flaky upstreams. ReadTimeout is a Timeout
                # subclass but NOT a ConnectionError subclass, so both
                # ConnectionError and Timeout have to be listed.
                _ = response.content
            except (
                requests.exceptions.ChunkedEncodingError,
                requests.exceptions.ConnectionError,
                requests.exceptions.Timeout,
            ) as exc:
                # safe_close instead of bare close: a close() that
                # raises here would mask the original body-read error
                # we actually want to surface / retry on.
                safe_close(response, "response")
                if attempt >= max_retries:
                    raise
                wait = _scheduled_wait(attempt)
                logger.warning(
                    f"{exc.__class__.__name__} reading body of {url}; "
                    f"retrying in {wait}s "
                    f"(attempt {attempt + 1}/{max_retries})"
                )
                time.sleep(wait)
                attempt += 1
                continue

        return response