Coverage for src/local_deep_research/security/safe_requests.py: 98%

223 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-03 23:15 +0000

1""" 

2Safe HTTP Requests Wrapper 

3 

4Wraps requests library to add SSRF protection and security best practices. 

5""" 

6 

7import datetime 

8import email.utils 

9import time 

10from typing import Any, Optional 

11from urllib.parse import urljoin 

12 

13import requests 

14from loguru import logger 

15 

16from . import ssrf_validator 

17from ..constants import USER_AGENT 

18from ..utilities.resource_utils import safe_close 

19 

20 

# Fallback timeout (seconds) applied to every HTTP request made through
# this module, so a stalled peer cannot hang the caller forever.
DEFAULT_TIMEOUT = 30

# Largest response body accepted, in bytes (1 GiB). Deliberately generous:
# this is a local research tool and users intentionally fetch large
# documents (annual reports, PDFs, datasets). The limit exists only to
# stop memory exhaustion.
MAX_RESPONSE_SIZE = 1024**3

# Status codes treated as redirects by the manual redirect loops.
_REDIRECT_STATUS_CODES = frozenset((301, 302, 303, 307, 308))

# Upper bound on redirect hops followed for a single request.
_MAX_REDIRECTS = 10

# Leading part of the requests library's stock User-Agent string
# (e.g. "python-requests/2.32.5"). `SafeSession.request` compares the
# session User-Agent against it to decide whether the upstream default is
# still in place; if so the project `USER_AGENT` is sent instead, so that
# academic API endpoints (arXiv, OpenAlex, PubMed, …) can identify us.
# Kept as a constant so an upstream rename is a one-line change.
_DEFAULT_REQUESTS_UA_PREFIX = "python-requests"

42 

43 

def _install_body_guard(response: requests.Response) -> None:
    """Install bounded readers that enforce MAX_RESPONSE_SIZE.

    Wraps the body-reading entry points of ``response.raw`` so that the
    cumulative number of bytes handed out is tracked, and a ``ValueError``
    is raised (after closing the response) as soon as the total exceeds
    ``MAX_RESPONSE_SIZE``.

    Two entry points are wrapped, sharing one byte counter:

    * ``raw.read`` — used for bodies that are not chunk-encoded.
    * ``raw.read_chunked`` (when the raw object has one) — urllib3's
      ``stream()``, which requests uses for ``iter_content`` and therefore
      ``.content`` / ``.text`` / ``.json()``, serves
      ``Transfer-Encoding: chunked`` bodies through ``read_chunked()``
      instead of ``read()``. Wrapping only ``read`` would leave chunked
      responses, which by definition have no Content-Length, unguarded.

    The guard only sees bytes read *after* it is installed; it cannot
    account for a body that was already consumed.

    Always installs — callers (currently only _check_response_size)
    are responsible for deciding when to call this function.

    Args:
        response: Response whose ``raw`` stream should be bounded.

    Raises:
        ValueError: Not from this call itself, but later from the
            installed readers once more than ``MAX_RESPONSE_SIZE`` bytes
            have been read.
    """
    raw = response.raw
    bytes_read = 0

    def _account(chunk) -> None:
        # Single counter shared by both wrappers so mixing read paths
        # cannot be used to exceed the limit.
        nonlocal bytes_read
        bytes_read += len(chunk) if chunk else 0
        if bytes_read > MAX_RESPONSE_SIZE:
            # Close first so the connection is not left half-read.
            response.close()
            raise ValueError(
                f"Response body too large: >{bytes_read} bytes "
                f"(max {MAX_RESPONSE_SIZE}, Content-Length absent or invalid)"
            )

    original_read = raw.read

    def bounded_read(amt=None, *args, **kwargs):
        data = original_read(amt, *args, **kwargs)
        _account(data)
        return data

    raw.read = bounded_read  # type: ignore[method-assign]

    original_read_chunked = getattr(raw, "read_chunked", None)
    if callable(original_read_chunked):

        def bounded_read_chunked(*args, **kwargs):
            # read_chunked is a generator: account for each chunk before
            # handing it on so the limit trips mid-stream.
            for chunk in original_read_chunked(*args, **kwargs):
                _account(chunk)
                yield chunk

        raw.read_chunked = bounded_read_chunked  # type: ignore[method-assign]

71 

72 

def _check_response_size(response: requests.Response) -> None:
    """Enforce MAX_RESPONSE_SIZE using the declared Content-Length.

    The header may carry several comma-separated values (RFC 7230
    §3.3.2). Empty pieces produced by stray commas are ignored; repeated
    identical values are accepted as one; values that disagree are
    rejected as invalid framing.

    Outcomes:

    * Declared size within the limit — returns without touching the
      response.
    * Declared size above the limit, or conflicting values — the
      response is closed and ``ValueError`` is raised.
    * Header missing, empty, only commas/whitespace, non-integer, or
      negative — treated as "size unknown": a body guard is installed so
      the limit is enforced while the body is read.

    Must be called before a response is handed back to the caller.

    Args:
        response: The response to vet.

    Raises:
        ValueError: If Content-Length values conflict or exceed
            MAX_RESPONSE_SIZE.
    """
    header_value = response.headers.get("Content-Length")
    if not header_value:
        # No usable header at all: fall back to counting bytes as read.
        _install_body_guard(response)
        return

    try:
        tokens = [piece.strip() for piece in header_value.split(",")]
        declared = [int(token) for token in tokens if token]
    except (ValueError, TypeError):
        # At least one piece is not an integer.
        declared = []

    if not declared:
        # Unparseable, or nothing but commas/whitespace: size unknown.
        _install_body_guard(response)
        return

    if len(set(declared)) > 1:
        response.close()
        raise ValueError(f"Conflicting Content-Length values: {header_value}")

    size = declared[0]
    if size < 0:
        # Malformed value: treat the header as absent.
        _install_body_guard(response)
        return

    if size > MAX_RESPONSE_SIZE:
        response.close()
        raise ValueError(
            f"Response too large: {size} bytes (max {MAX_RESPONSE_SIZE})"
        )
    # Valid Content-Length within the limit: no body guard needed.

127 

128 

def _resolve_redirect_method(method: str, status_code: int) -> str:
    """Return the HTTP method to use for the request that follows a redirect.

    Rules applied (RFC 7231):

    * 303 turns every method except HEAD into GET.
    * 301 and 302 turn POST into GET; other methods are kept.
    * Anything else (notably 307 and 308) keeps the method unchanged.

    Args:
        method: Method of the request that received the redirect.
        status_code: Status code of the redirect response.

    Returns:
        ``"GET"`` when the redirect downgrades the method, otherwise
        ``method`` unchanged.
    """
    see_other = status_code == 303 and method != "HEAD"
    moved_post = status_code in (301, 302) and method == "POST"
    return "GET" if (see_other or moved_post) else method

139 

140 

def _prepare_request_kwargs(kwargs: dict, timeout: int) -> tuple:
    """Normalise caller kwargs for the standalone safe_* functions.

    Returns ``(request_kwargs, follow_redirects, caller_streams)``:

    * ``request_kwargs`` — a copy of ``kwargs`` with the timeout set, the
      project User-Agent injected when the caller supplied none (on a
      copy of the caller's headers dict), ``allow_redirects`` forced to
      ``False`` (redirects are followed manually so every hop can be
      SSRF-validated) and ``stream`` forced to ``True`` (so the size
      check runs before any body is downloaded).
    * ``follow_redirects`` — what the caller asked for via
      ``allow_redirects`` (default ``True``).
    * ``caller_streams`` — whether the caller asked for ``stream=True``.
    """
    prepared = dict(kwargs)
    # `timeout` is a named parameter of the public functions, so it can
    # never arrive inside **kwargs; set it unconditionally.
    prepared["timeout"] = timeout

    headers = dict(prepared.get("headers") or {})
    if not any(name.lower() == "user-agent" for name in headers):
        headers["User-Agent"] = USER_AGENT
    prepared["headers"] = headers

    follow_redirects = prepared.pop("allow_redirects", True)
    prepared["allow_redirects"] = False

    caller_streams = bool(prepared.get("stream"))
    prepared["stream"] = True
    return prepared, follow_redirects, caller_streams


def _redirect_changes_origin(previous_url: str, next_url: str) -> bool:
    """Return True when a redirect leaves the origin of ``previous_url``.

    Mirrors the rule requests applies before re-sending an Authorization
    header: a different host always counts; a plain http -> https upgrade
    on the default ports does not; otherwise any change of scheme or
    effective port does. URLs that cannot be parsed count as a change
    (fail closed).
    """
    # Function-scope import: the module itself only imports `urljoin`.
    from urllib.parse import urlsplit

    try:
        previous, target = urlsplit(previous_url), urlsplit(next_url)
        previous_port, target_port = previous.port, target.port
    except (ValueError, TypeError, AttributeError):
        return True

    if previous.hostname != target.hostname:
        return True

    default_ports = {"http": 80, "https": 443}
    if previous_port is None:
        previous_port = default_ports.get(previous.scheme)
    if target_port is None:
        target_port = default_ports.get(target.scheme)

    old_origin = (previous.scheme, previous_port)
    new_origin = (target.scheme, target_port)
    if old_origin == ("http", 80) and new_origin == ("https", 443):
        return False
    return old_origin != new_origin


def _without_credentials(kwargs: dict) -> dict:
    """Copy request kwargs, dropping everything that authenticates the caller.

    Removes the ``auth`` and ``cookies`` kwargs and the ``Authorization``
    and ``Cookie`` headers. The input dict is not modified.
    """
    stripped = {
        key: value
        for key, value in kwargs.items()
        if key not in ("auth", "cookies")
    }
    stripped["headers"] = {
        name: value
        for name, value in (kwargs.get("headers") or {}).items()
        if name.lower() not in ("authorization", "cookie")
    }
    return stripped


def _next_redirect_target(
    response: requests.Response,
    current_url: str,
    origin_url: str,
    hop: int,
    allow_localhost: bool,
    allow_private_ips: bool,
) -> Optional[str]:
    """Resolve and SSRF-validate the Location of a redirect response.

    Returns the absolute target URL, or ``None`` when the response has no
    usable Location header. Closes the response and raises ``ValueError``
    when the target fails SSRF validation.
    """
    location = (response.headers.get("Location") or "").strip()
    if not location:
        return None

    # Location may be relative; resolve it against the URL just fetched.
    target = urljoin(response.url or current_url, location)

    if not ssrf_validator.validate_url(
        target,
        allow_localhost=allow_localhost,
        allow_private_ips=allow_private_ips,
    ):
        logger.warning(
            f"Redirect to {target} blocked by SSRF validation "
            f"(from {origin_url}, hop {hop})"
        )
        response.close()
        raise ValueError(f"Redirect target failed SSRF validation: {target}")
    return target


def _finish_response(
    response: requests.Response,
    redirects_followed: int,
    origin_url: str,
    caller_streams: bool,
) -> requests.Response:
    """Apply the redirect limit and size limit, then hand the response back.

    When the caller did not ask for streaming, the body is read here —
    after the size check, and under the body guard if one was installed —
    so the returned response has its content cached exactly as a
    non-streamed requests call would.
    """
    if (
        response.status_code in _REDIRECT_STATUS_CODES
        and redirects_followed >= _MAX_REDIRECTS
    ):
        response.close()
        # Note: raises ValueError here, while SafeSession raises
        # requests.TooManyRedirects (delegated to the base class).
        # Callers should catch ValueError for standalone functions.
        raise ValueError(f"Too many redirects ({_MAX_REDIRECTS}) from {origin_url}")

    _check_response_size(response)

    if not caller_streams:
        try:
            response.content  # noqa: B018 - preload the body
        except Exception:
            # Do not leak the connection; safe_close so a failing close
            # cannot mask the read error.
            safe_close(response, "response")
            raise
    return response


def safe_get(
    url: str,
    params: Optional[dict] = None,
    timeout: int = DEFAULT_TIMEOUT,
    allow_localhost: bool = False,
    allow_private_ips: bool = False,
    **kwargs,
) -> requests.Response:
    """
    Make a safe HTTP GET request with SSRF protection.

    Redirects are followed manually (at most ``_MAX_REDIRECTS`` hops) so
    that every target is SSRF-validated. Each hop is a fresh
    ``requests.get()`` call without a session, so cookies set by
    intermediate responses are not carried forward; callers needing that
    should use SafeSession. Caller-supplied credentials (``auth``,
    ``cookies``, ``Authorization``/``Cookie`` headers) are dropped as soon
    as a redirect leaves the original origin and are not restored.

    The request is always issued in streaming mode internally so the
    size limit is checked before the body is downloaded. Unless the
    caller passed ``stream=True``, the body is then read before
    returning, so ``.content`` / ``.text`` / ``.json()`` behave as usual.

    Args:
        url: URL to request
        params: URL parameters. Sent with the initial request only; a
            redirect's Location header already is the complete target.
        timeout: Request timeout in seconds
        allow_localhost: Whether to allow localhost/loopback addresses.
            Set to True for trusted internal services like self-hosted
            search engines (e.g., searxng). Default False.
        allow_private_ips: Whether to allow all private/internal IPs plus localhost.
            This includes RFC1918 (10.x, 172.16-31.x, 192.168.x), CGNAT (100.64.x.x
            used by Podman/rootless containers), link-local (169.254.x.x), and IPv6
            private ranges (fc00::/7, fe80::/10). Use for trusted self-hosted services
            like SearXNG or Ollama in containerized environments.
            Note: cloud metadata endpoints (AWS / Azure / OCI / DigitalOcean /
            AlibabaCloud / Tencent / ECS) are ALWAYS blocked — see
            ``ssrf_validator.ALWAYS_BLOCKED_METADATA_IPS``.
        **kwargs: Additional arguments to pass to requests.get()

    Returns:
        Response object

    Raises:
        ValueError: If the URL or a redirect target fails SSRF
            validation, if there are too many redirects, or if the
            response exceeds MAX_RESPONSE_SIZE.
        requests.RequestException: If request fails
    """
    if not ssrf_validator.validate_url(
        url,
        allow_localhost=allow_localhost,
        allow_private_ips=allow_private_ips,
    ):
        raise ValueError(
            f"URL failed security validation (possible SSRF): {url}"
        )

    kwargs, follow_redirects, caller_streams = _prepare_request_kwargs(
        kwargs, timeout
    )

    current_url = url
    redirects_followed = 0
    try:
        response = requests.get(url, params=params, **kwargs)

        while (
            follow_redirects
            and response.status_code in _REDIRECT_STATUS_CODES
            and redirects_followed < _MAX_REDIRECTS
        ):
            target = _next_redirect_target(
                response,
                current_url,
                url,
                redirects_followed + 1,
                allow_localhost,
                allow_private_ips,
            )
            if target is None:
                break

            # Never hand the caller's credentials to another origin.
            if _redirect_changes_origin(current_url, target):
                kwargs = _without_credentials(kwargs)

            current_url = target
            response.close()
            response = requests.get(target, **kwargs)
            redirects_followed += 1

        return _finish_response(
            response, redirects_followed, url, caller_streams
        )

    except requests.Timeout:
        logger.warning(f"Request timeout after {timeout}s: {current_url}")
        raise
    except requests.RequestException:
        logger.warning(f"Request failed for {current_url}")
        raise


def safe_post(
    url: str,
    data: Optional[Any] = None,
    json: Optional[dict] = None,
    timeout: int = DEFAULT_TIMEOUT,
    allow_localhost: bool = False,
    allow_private_ips: bool = False,
    **kwargs,
) -> requests.Response:
    """
    Make a safe HTTP POST request with SSRF protection.

    Redirects are followed manually (at most ``_MAX_REDIRECTS`` hops) so
    that every target is SSRF-validated. A 301/302/303 turns the request
    into a body-less GET for that hop and all later ones; 307/308 repeat
    the POST with the same body. Each hop is a fresh request without a
    session, so cookies set by intermediate responses are not carried
    forward; callers needing that should use SafeSession.
    Caller-supplied credentials (``auth``, ``cookies``,
    ``Authorization``/``Cookie`` headers) are dropped as soon as a
    redirect leaves the original origin and are not restored.

    The request is always issued in streaming mode internally so the
    size limit is checked before the body is downloaded. Unless the
    caller passed ``stream=True``, the body is then read before
    returning.

    Args:
        url: URL to request
        data: Data to send in request body
        json: JSON data to send in request body
        timeout: Request timeout in seconds
        allow_localhost: Whether to allow localhost/loopback addresses.
            Set to True for trusted internal services like self-hosted
            search engines (e.g., searxng). Default False.
        allow_private_ips: Whether to allow all private/internal IPs plus localhost.
            This includes RFC1918 (10.x, 172.16-31.x, 192.168.x), CGNAT (100.64.x.x
            used by Podman/rootless containers), link-local (169.254.x.x), and IPv6
            private ranges (fc00::/7, fe80::/10). Use for trusted self-hosted services
            like SearXNG or Ollama in containerized environments.
            Note: cloud metadata endpoints (AWS / Azure / OCI / DigitalOcean /
            AlibabaCloud / Tencent / ECS) are ALWAYS blocked — see
            ``ssrf_validator.ALWAYS_BLOCKED_METADATA_IPS``.
        **kwargs: Additional arguments to pass to requests.post()

    Returns:
        Response object

    Raises:
        ValueError: If the URL or a redirect target fails SSRF
            validation, if there are too many redirects, or if the
            response exceeds MAX_RESPONSE_SIZE.
        requests.RequestException: If request fails
    """
    if not ssrf_validator.validate_url(
        url,
        allow_localhost=allow_localhost,
        allow_private_ips=allow_private_ips,
    ):
        raise ValueError(
            f"URL failed security validation (possible SSRF): {url}"
        )

    kwargs, follow_redirects, caller_streams = _prepare_request_kwargs(
        kwargs, timeout
    )

    current_url = url
    redirects_followed = 0
    redirect_method = "POST"
    try:
        response = requests.post(url, data=data, json=json, **kwargs)

        while (
            follow_redirects
            and response.status_code in _REDIRECT_STATUS_CODES
            and redirects_followed < _MAX_REDIRECTS
        ):
            target = _next_redirect_target(
                response,
                current_url,
                url,
                redirects_followed + 1,
                allow_localhost,
                allow_private_ips,
            )
            if target is None:
                break

            redirect_method = _resolve_redirect_method(
                redirect_method, response.status_code
            )

            # Never hand the caller's credentials to another origin.
            if _redirect_changes_origin(current_url, target):
                kwargs = _without_credentials(kwargs)

            current_url = target
            response.close()

            if redirect_method == "GET":
                # 301/302/303: convert to GET, drop body
                data = None
                json = None
                response = requests.get(target, **kwargs)
            else:
                # 307/308: preserve current method and body
                response = requests.post(
                    target, data=data, json=json, **kwargs
                )
            redirects_followed += 1

        return _finish_response(
            response, redirects_followed, url, caller_streams
        )

    except requests.Timeout:
        logger.warning(f"Request timeout after {timeout}s: {current_url}")
        raise
    except requests.RequestException:
        logger.warning(f"Request failed for {current_url}")
        raise

419 

420 

421# Create a safe session class 

class SafeSession(requests.Session):
    """
    Session with built-in SSRF protection.

    Redirect validation relies on ``requests.Session.resolve_redirects()``
    calling ``self.send()`` for each hop — an internal implementation detail
    of the ``requests`` library. This is simpler than re-implementing the
    redirect loop (as ``safe_get``/``safe_post`` do) and keeps session-level
    features (cookies, auth) working. The trade-off is coupling to the
    ``requests`` internals; if a future version stops routing hops through
    ``send()``, redirect targets would no longer be validated.

    Usage:
        with SafeSession() as session:
            response = session.get(url)

        # For trusted internal services (e.g., searxng on localhost):
        with SafeSession(allow_localhost=True) as session:
            response = session.get(url)

        # For trusted internal services on any private network IP:
        with SafeSession(allow_private_ips=True) as session:
            response = session.get(url)

    Raises:
        ValueError: If a URL (initial or redirect target) fails SSRF
            validation, or if the response exceeds MAX_RESPONSE_SIZE.
            Note: ``safe_get``/``safe_post`` also raise
            ``ValueError`` for too-many-redirects, but ``SafeSession`` raises
            ``requests.TooManyRedirects`` for that case since it delegates
            redirect counting to the ``requests`` library.
        requests.RequestException: On transport-level failures.
    """

    def __init__(
        self, allow_localhost: bool = False, allow_private_ips: bool = False
    ):
        """
        Initialize SafeSession.

        Args:
            allow_localhost: Whether to allow localhost/loopback addresses.
            allow_private_ips: Whether to allow all private/internal IPs plus localhost.
                This includes RFC1918, CGNAT (100.64.x.x used by Podman), link-local, and
                IPv6 private ranges. Use for trusted self-hosted services like SearXNG or
                Ollama in containerized environments.
                Note: cloud metadata endpoints (AWS / Azure / OCI / DigitalOcean /
                AlibabaCloud / Tencent / ECS) are ALWAYS blocked — see
                ``ssrf_validator.ALWAYS_BLOCKED_METADATA_IPS``.
        """
        super().__init__()
        self.max_redirects = _MAX_REDIRECTS
        self.allow_localhost = allow_localhost
        self.allow_private_ips = allow_private_ips

    def request(self, method: str, url: str, **kwargs) -> requests.Response:  # type: ignore[override]
        """Validate ``url`` against SSRF rules, then delegate to requests.

        Also applies ``DEFAULT_TIMEOUT`` when the caller gave no timeout,
        and injects the project ``USER_AGENT`` when neither the request
        nor the session carries a caller-chosen User-Agent.

        Raises:
            ValueError: If ``url`` fails SSRF validation.
        """
        if not ssrf_validator.validate_url(
            url,
            allow_localhost=self.allow_localhost,
            allow_private_ips=self.allow_private_ips,
        ):
            raise ValueError(
                f"URL failed security validation (possible SSRF): {url}"
            )

        kwargs.setdefault("timeout", DEFAULT_TIMEOUT)

        # Inject project User-Agent if the caller didn't already set one.
        # Session-level User-Agent (self.headers) is left alone — only
        # per-request headers are copied so we never mutate caller state.
        headers = dict(kwargs.get("headers") or {})
        session_ua = self.headers.get("User-Agent", "")
        has_per_request_ua = any(k.lower() == "user-agent" for k in headers)
        # A session UA that is empty or still the requests default counts
        # as "not chosen by the caller".
        session_ua_is_default = not session_ua or session_ua.startswith(
            _DEFAULT_REQUESTS_UA_PREFIX
        )
        if not has_per_request_ua and session_ua_is_default:
            headers["User-Agent"] = USER_AGENT
        kwargs["headers"] = headers

        return super().request(method, url, **kwargs)

    def send(
        self, request: requests.PreparedRequest, **kwargs
    ) -> requests.Response:
        """Override send to validate every outgoing request against SSRF.

        This runs on **all** calls — both the initial request (routed
        here by ``requests.Session.request()``) and each redirect hop
        (routed here by ``resolve_redirects()``). The initial URL is
        therefore validated twice (once in ``request()``, once here);
        this is intentional defense-in-depth.

        The request is always sent in streaming mode internally.
        Without that, ``requests`` downloads the whole body inside
        ``super().send()``, before the size check below could reject it
        or install a body guard. When the caller did not ask for
        streaming, the body is read here afterwards, so the returned
        response has its content cached as usual.

        Raises:
            ValueError: If the request URL fails SSRF validation or the
                response exceeds MAX_RESPONSE_SIZE.
        """
        if request.url and not ssrf_validator.validate_url(
            request.url,
            allow_localhost=self.allow_localhost,
            allow_private_ips=self.allow_private_ips,
        ):
            logger.warning(
                f"Request to {request.url} blocked by SSRF validation"
            )
            # Note: This error says "security validation" while safe_get/
            # safe_post say "SSRF validation". The difference indicates the
            # source (session vs standalone function) in logs.
            raise ValueError(
                f"Redirect target failed security validation (possible SSRF): {request.url}"
            )

        # Remember what the caller wanted, then force streaming so no
        # body bytes are pulled before the size check. Redirect hops
        # re-enter this method with stream=True already set, so their
        # bodies are left for requests' redirect handling to consume.
        kwargs.setdefault("stream", self.stream)
        caller_streams = kwargs["stream"]
        kwargs["stream"] = True

        response = super().send(request, **kwargs)
        _check_response_size(response)

        if not caller_streams:
            try:
                response.content  # noqa: B018 - preload the body
            except Exception:
                # Do not leak the connection; safe_close so a failing
                # close cannot mask the read error.
                safe_close(response, "response")
                raise
        return response

536 

537 

# Exponential backoff schedule (seconds). Kept short: journal-quality
# downloads are run from a user request or a scheduled job, not from a
# time-sensitive hot path, so three retries over ~7 seconds is plenty
# without adding real latency.
_RETRY_BACKOFF_SECONDS = (1, 2, 4)

# HTTP status codes worth retrying (transient server / rate-limit errors).
_RETRYABLE_STATUS_CODES = frozenset({429, 500, 502, 503, 504})

# Upper bound on a honored Retry-After (seconds). RFC 7231 puts no
# ceiling on the header, so a hostile or misconfigured upstream could
# pin a worker via an arbitrarily large value. Cap here to bound the
# damage; legitimate waits (seconds to low minutes) pass through.
_MAX_RETRY_AFTER_SECONDS = 300


def _parse_retry_after(retry_after_raw: Optional[str]) -> Optional[int]:
    """Parse a ``Retry-After`` header value, clamped to ``[0, MAX]``.

    Accepts both RFC 7231 forms: delay-seconds (integer) and HTTP-date.
    A date is converted to the number of whole seconds between now and
    that instant; dates in the past therefore yield ``0``.

    Args:
        retry_after_raw: Raw header value, or ``None`` when the header
            is absent.

    Returns:
        The wait in seconds, clamped to
        ``[0, _MAX_RETRY_AFTER_SECONDS]``, or ``None`` if the header is
        missing or unparseable so the caller can fall back to the
        exponential-backoff schedule.
    """
    if retry_after_raw is None:
        return None
    try:
        seconds = int(retry_after_raw)
    except (ValueError, TypeError):
        # Not an integer (TypeError: not even a string/number) — try the
        # HTTP-date form before giving up.
        try:
            retry_dt = email.utils.parsedate_to_datetime(retry_after_raw)
        except (ValueError, TypeError):
            logger.debug(
                f"Unparseable Retry-After {retry_after_raw!r}; "
                f"using backoff schedule"
            )
            return None
        if retry_dt.tzinfo is None:
            # parsedate_to_datetime returns a naive datetime for a
            # "-0000" zone; per its documentation that value is UTC.
            # Subtracting it from an aware "now" would raise TypeError.
            retry_dt = retry_dt.replace(tzinfo=datetime.timezone.utc)
        now_utc = datetime.datetime.now(datetime.timezone.utc)
        seconds = int((retry_dt - now_utc).total_seconds())
    return max(0, min(seconds, _MAX_RETRY_AFTER_SECONDS))

577 

578 

def safe_get_with_retries(
    url: str,
    params: Optional[dict] = None,
    timeout: int = DEFAULT_TIMEOUT,
    allow_localhost: bool = False,
    allow_private_ips: bool = False,
    max_retries: int = 3,
    backoff_times: tuple = _RETRY_BACKOFF_SECONDS,
    consume_body: bool = False,
    **kwargs,
) -> requests.Response:
    """`safe_get` plus exponential-backoff retry on transient errors.

    Retries on:
      * ``requests.ConnectionError``
      * ``requests.Timeout``
      * HTTP ``429`` (rate limit) and ``5xx`` (server error)
      * (when ``consume_body=True``) body-read failures —
        ``ChunkedEncodingError``, ``ReadTimeout``, mid-stream
        ``ConnectionError``

    Honors the ``Retry-After`` header when present (falls back to the
    backoff schedule otherwise). SSRF-validation errors (``ValueError``)
    and non-retryable HTTP 4xx responses are not retried.

    Where a body-read failure surfaces depends on streaming. With
    ``stream=True`` in ``kwargs`` the body is only read when something
    touches ``.content`` / ``.text`` / ``.json()``; otherwise
    ``requests`` reads it before ``safe_get`` returns, so the failure is
    raised by the ``safe_get`` call itself. ``consume_body=True`` covers
    both: ``ChunkedEncodingError`` raised by ``safe_get`` is retried, and
    the body is additionally read inside the retry loop so a streamed
    body's mid-stream failures (e.g. an S3 hiccup) are retried too. The
    cached body is still available to the caller via
    ``response.content`` after the wrapper returns. Without
    ``consume_body``, ``ChunkedEncodingError`` is never retried.

    Args:
        url: Target URL.
        params: Query parameters.
        timeout: Per-attempt socket timeout.
        allow_localhost: Forwarded to ``safe_get``.
        allow_private_ips: Forwarded to ``safe_get``.
        max_retries: Maximum retry attempts after the initial try.
        backoff_times: Per-attempt sleep seconds. The last entry is
            reused once the schedule is exhausted; an empty schedule
            means "retry without sleeping".
        consume_body: If True, read ``response.content`` inside the
            retry loop so body-read transients are retried. Use for
            large or chunk-transferred bodies (~MB+) where mid-stream
            disconnects are realistic. The body-guard's ``ValueError``
            (oversized body) is NOT retried — it propagates immediately.
        **kwargs: Forwarded to ``safe_get``.

    Returns:
        The first successful (or final-attempt) ``requests.Response``.
        When ``consume_body=True``, the body has already been read and
        is cached on the response.

    Raises:
        ValueError: If SSRF validation fails or the response is
            rejected as oversized. Retries do not help in either case.
        requests.RequestException: If every attempt fails.
    """

    def _backoff_for(attempt_index: int):
        # Clamp to the last entry; tolerate an empty schedule instead of
        # failing with IndexError in the middle of error handling.
        if not backoff_times:
            return 0
        return backoff_times[min(attempt_index, len(backoff_times) - 1)]

    # ReadTimeout is a Timeout subclass but NOT a ConnectionError
    # subclass, so both must be listed; ChunkedEncodingError is neither.
    body_read_errors = (
        requests.exceptions.ChunkedEncodingError,
        requests.exceptions.ConnectionError,
        requests.exceptions.Timeout,
    )
    # With consume_body the caller asked for body-read failures to be
    # retried, and for non-streamed requests those are raised from
    # inside safe_get.
    request_errors = (
        body_read_errors
        if consume_body
        else (requests.ConnectionError, requests.Timeout)
    )

    attempt = 0
    while True:
        try:
            response = safe_get(
                url,
                params=params,
                timeout=timeout,
                allow_localhost=allow_localhost,
                allow_private_ips=allow_private_ips,
                **kwargs,
            )
        except request_errors as exc:
            if attempt >= max_retries:
                raise
            wait = _backoff_for(attempt)
            logger.warning(
                f"{exc.__class__.__name__} on {url}; "
                f"retrying in {wait}s "
                f"(attempt {attempt + 1}/{max_retries})"
            )
            time.sleep(wait)
            attempt += 1
            continue

        if response.status_code in _RETRYABLE_STATUS_CODES:
            if attempt >= max_retries:
                # Out of retries: hand the error response to the caller.
                return response
            parsed = _parse_retry_after(response.headers.get("Retry-After"))
            wait = parsed if parsed is not None else _backoff_for(attempt)
            logger.warning(
                f"HTTP {response.status_code} on {url}; "
                f"retrying in {wait}s "
                f"(attempt {attempt + 1}/{max_retries})"
            )
            response.close()
            time.sleep(wait)
            attempt += 1
            continue

        if consume_body:
            try:
                # Force the body read while still inside the retry loop;
                # a no-op when the body is already cached.
                _ = response.content
            except body_read_errors as exc:
                # safe_close instead of bare close: a close() that
                # raises here would mask the original body-read error
                # we actually want to surface / retry on.
                safe_close(response, "response")
                if attempt >= max_retries:
                    raise
                wait = _backoff_for(attempt)
                logger.warning(
                    f"{exc.__class__.__name__} reading body of {url}; "
                    f"retrying in {wait}s "
                    f"(attempt {attempt + 1}/{max_retries})"
                )
                time.sleep(wait)
                attempt += 1
                continue

        return response