Coverage for src / local_deep_research / security / safe_requests.py: 98%

150 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1""" 

2Safe HTTP Requests Wrapper 

3 

4Wraps requests library to add SSRF protection and security best practices. 

5""" 

6 

7import requests 

8from typing import Any, Optional 

9from urllib.parse import urljoin 

10 

11from loguru import logger 

12 

13from . import ssrf_validator 

14 

15 

# Default timeout, in seconds, applied to every HTTP request so that a
# stalled server cannot hang the caller.
DEFAULT_TIMEOUT = 30

# Maximum accepted response size in bytes (1 GiB), to prevent memory
# exhaustion. Set high to accommodate large documents (annual reports,
# PDFs, datasets). This is a local research tool — users intentionally
# download these files.
MAX_RESPONSE_SIZE = 1024 * 1024 * 1024

# HTTP status codes that indicate a redirect
_REDIRECT_STATUS_CODES = frozenset({301, 302, 303, 307, 308})

# Maximum number of redirects to follow
_MAX_REDIRECTS = 10

29 

30 

def _install_body_guard(response: requests.Response) -> None:
    """Install bounded readers that enforce MAX_RESPONSE_SIZE.

    Wraps ``response.raw.read()`` and, when the raw object provides it,
    ``response.raw.read_chunked()``. Both wrappers share one running byte
    count and raise ValueError as soon as the cumulative number of bytes
    handed out exceeds MAX_RESPONSE_SIZE.

    ``read_chunked`` must be covered as well: urllib3's ``stream()`` (used
    by ``Response.iter_content``) routes chunked bodies through
    ``read_chunked()`` rather than ``read()``, and chunked responses are
    exactly the ones that carry no Content-Length.

    The guard only sees data consumed *after* it is installed, i.e.
    ``.iter_content`` or ``.content``/``.text``/``.json()`` on a response
    that was requested with ``stream=True``. A body that ``requests``
    already downloaded (non-streamed request) is not affected.

    Always installs — callers (currently only _check_response_size)
    are responsible for deciding when to call this function.

    Args:
        response: Response whose ``raw`` stream should be bounded.

    Raises:
        ValueError: Raised later, from the wrapped readers, once more than
            MAX_RESPONSE_SIZE bytes have been read. The response is
            closed before raising.
    """
    raw = response.raw
    bytes_read = 0

    def account(chunk):
        """Count *chunk* against the limit; abort once it is exceeded."""
        nonlocal bytes_read
        bytes_read += len(chunk)
        if bytes_read > MAX_RESPONSE_SIZE:
            response.close()
            raise ValueError(
                f"Response body too large: >{bytes_read} bytes "
                f"(max {MAX_RESPONSE_SIZE}, Content-Length absent or invalid)"
            )
        return chunk

    original_read = raw.read

    def bounded_read(amt=None, *args, **kwargs):
        # NOTE: a single read(amt=None) is only checked after the data has
        # been returned by the underlying reader.
        return account(original_read(amt, *args, **kwargs))

    raw.read = bounded_read  # type: ignore[method-assign]

    # Not every raw object has read_chunked (e.g. simple file-like fakes).
    original_read_chunked = getattr(raw, "read_chunked", None)
    if callable(original_read_chunked):

        def bounded_read_chunked(*args, **kwargs):
            for chunk in original_read_chunked(*args, **kwargs):
                yield account(chunk)

        raw.read_chunked = bounded_read_chunked  # type: ignore[method-assign]

58 

59 

def _check_response_size(response: requests.Response) -> None:
    """Reject responses whose Content-Length exceeds MAX_RESPONSE_SIZE.

    Handles comma-separated values per RFC 7230 §3.3.2: identical
    duplicates (from proxies) are normalized; differing values are
    rejected as invalid framing. Empty parts from malformed headers
    (trailing/doubled commas) are filtered before parsing. Non-integer
    or negative values cause the header to be treated as absent.

    A body guard (see _install_body_guard) is installed whenever the
    declared length cannot be trusted to bound what the caller will read:

    * Content-Length is absent, unparseable, negative, or consists only
      of commas/whitespace;
    * a Transfer-Encoding header is present (it overrides Content-Length,
      RFC 7230 §3.3.3);
    * a Content-Encoding header is present (Content-Length then describes
      the encoded payload, not the decoded bytes handed to the caller).

    Must be called before returning a response to the caller. On
    rejection the response is closed to avoid leaking the connection.

    Args:
        response: Response to inspect; its headers are read and its raw
            stream may be wrapped.

    Raises:
        ValueError: If Content-Length values conflict or exceed
            MAX_RESPONSE_SIZE.
    """
    headers = response.headers
    content_length = headers.get("Content-Length")
    declared_size = None  # stays None while the header is absent/unusable

    if content_length:
        try:
            # Multiple identical values may be sent by proxies; differing
            # values indicate invalid framing and must be rejected.
            stripped = (v.strip() for v in content_length.split(","))
            sizes = {int(p) for p in stripped if p}
        except (ValueError, TypeError):
            sizes = set()  # Content-Length not a valid number

        if len(sizes) > 1:
            response.close()
            raise ValueError(
                f"Conflicting Content-Length values: {content_length}"
            )

        if sizes:
            (size,) = sizes
            if size > MAX_RESPONSE_SIZE:
                response.close()
                raise ValueError(
                    f"Response too large: {size} bytes (max {MAX_RESPONSE_SIZE})"
                )
            if size >= 0:  # negative values are malformed: treat as absent
                declared_size = size

    length_is_authoritative = declared_size is not None and not (
        headers.get("Transfer-Encoding") or headers.get("Content-Encoding")
    )
    if length_is_authoritative:
        # Valid Content-Length within limit — no body guard needed
        return

    _install_body_guard(response)

114 

115 

def _resolve_redirect_method(method: str, status_code: int) -> str:
    """Determine the HTTP method to use after a redirect, per RFC 7231.

    Args:
        method: Method of the request that received the redirect. Compared
            case-sensitively against ``"POST"`` and ``"HEAD"``.
        status_code: Status code of the redirect response.

    Returns:
        ``"GET"`` for a 303 (unless the method is HEAD) and for a 301/302
        answering a POST; otherwise *method* unchanged (this includes
        307/308, which preserve the original method).
    """
    if status_code == 303 and method != "HEAD":
        return "GET"
    if status_code in (301, 302) and method == "POST":
        return "GET"
    return method

126 

127 

def safe_get(
    url: str,
    params: Optional[dict] = None,
    timeout: int = DEFAULT_TIMEOUT,
    allow_localhost: bool = False,
    allow_private_ips: bool = False,
    **kwargs,
) -> requests.Response:
    """
    Make a safe HTTP GET request with SSRF protection.

    The initial URL and every redirect target are validated with
    ``ssrf_validator.validate_url``. Redirects are followed manually (up to
    ``_MAX_REDIRECTS`` hops) so that each hop can be validated.

    The request is always issued with ``stream=True`` internally so that
    the size check runs *before* the body is downloaded. When the caller
    did not ask for streaming, the body is then loaded eagerly, so the
    returned response behaves like a normal non-streamed one.

    Args:
        url: URL to request
        params: URL parameters (sent with the initial request only)
        timeout: Request timeout in seconds
        allow_localhost: Whether to allow localhost/loopback addresses.
            Set to True for trusted internal services like self-hosted
            search engines (e.g., searxng). Default False.
        allow_private_ips: Whether to allow all private/internal IPs plus localhost.
            This includes RFC1918 (10.x, 172.16-31.x, 192.168.x), CGNAT (100.64.x.x
            used by Podman/rootless containers), link-local (169.254.x.x), and IPv6
            private ranges (fc00::/7, fe80::/10). Use for trusted self-hosted services
            like SearXNG or Ollama in containerized environments.
            Note: AWS metadata endpoint (169.254.169.254) is ALWAYS blocked.
        **kwargs: Additional arguments to pass to requests.get().
            ``allow_redirects`` (default True) controls whether redirects
            are followed; ``stream`` (default False) controls whether the
            body is left unread for the caller.

    Returns:
        Response object

    Raises:
        ValueError: If the URL or a redirect target fails SSRF validation,
            if there are too many redirects, or if the response exceeds
            MAX_RESPONSE_SIZE (or has conflicting Content-Length values).
        requests.RequestException: If request fails
    """
    # Validate URL to prevent SSRF
    if not ssrf_validator.validate_url(
        url,
        allow_localhost=allow_localhost,
        allow_private_ips=allow_private_ips,
    ):
        raise ValueError(
            f"URL failed security validation (possible SSRF): {url}"
        )

    # ``timeout`` is a named parameter, so it can never arrive in **kwargs;
    # set it unconditionally.
    kwargs["timeout"] = timeout

    # Intercept allow_redirects — we handle redirects manually to validate
    # each redirect target against SSRF rules
    caller_wants_redirects = kwargs.pop("allow_redirects", True)
    kwargs["allow_redirects"] = False

    # Always stream: requests downloads the whole body of a non-streamed
    # request before returning, which would make the size check too late.
    caller_wants_stream = kwargs.pop("stream", False)
    kwargs["stream"] = True

    current_url = url
    try:
        response = requests.get(url, params=params, **kwargs)

        # Follow redirects manually with SSRF validation on each hop.
        # Each hop uses a fresh requests.get() call without a session,
        # so cookies set by intermediate responses are not carried
        # forward. This is acceptable for current callers (all stateless).
        # Callers needing cookie persistence across redirects should use
        # SafeSession instead, which preserves cookies via its cookie jar.
        # NOTE(review): caller-supplied headers/auth/cookies in kwargs are
        # forwarded unchanged to every hop, including other hosts.
        if caller_wants_redirects:
            redirects_followed = 0
            while (
                response.status_code in _REDIRECT_STATUS_CODES
                and redirects_followed < _MAX_REDIRECTS
            ):
                redirect_url = (response.headers.get("Location") or "").strip()
                if not redirect_url:
                    break

                # Resolve relative redirects
                redirect_url = urljoin(
                    response.url or current_url, redirect_url
                )

                # Validate redirect target against SSRF rules
                if not ssrf_validator.validate_url(
                    redirect_url,
                    allow_localhost=allow_localhost,
                    allow_private_ips=allow_private_ips,
                ):
                    logger.warning(
                        f"Redirect to {redirect_url} blocked by SSRF validation "
                        f"(from {url}, hop {redirects_followed + 1})"
                    )
                    response.close()
                    raise ValueError(
                        f"Redirect target failed SSRF validation: {redirect_url}"
                    )

                current_url = redirect_url
                response.close()
                # Note: params are intentionally NOT forwarded to redirect
                # hops. Per HTTP spec, the server's Location header contains
                # the complete target URL. Re-appending original query params
                # would corrupt it.
                response = requests.get(redirect_url, **kwargs)
                redirects_followed += 1

            if (
                response.status_code in _REDIRECT_STATUS_CODES
                and redirects_followed >= _MAX_REDIRECTS
            ):
                response.close()
                # Note: raises ValueError here, while SafeSession raises
                # requests.TooManyRedirects (delegated to the base class).
                # Callers should catch ValueError for standalone functions.
                raise ValueError(
                    f"Too many redirects ({_MAX_REDIRECTS}) from {url}"
                )

        _check_response_size(response)

        if not caller_wants_stream:
            # Preserve non-streaming semantics: load the body now, after
            # the size check (and any body guard) is in place.
            try:
                _ = response.content
            except Exception:
                response.close()
                raise

        return response

    except requests.Timeout:
        logger.warning(f"Request timeout after {timeout}s: {current_url}")
        raise
    except requests.RequestException:
        logger.warning(f"Request failed for {current_url}")
        raise

251 

252 

def safe_post(
    url: str,
    data: Optional[Any] = None,
    json: Optional[dict] = None,
    timeout: int = DEFAULT_TIMEOUT,
    allow_localhost: bool = False,
    allow_private_ips: bool = False,
    **kwargs,
) -> requests.Response:
    """
    Make a safe HTTP POST request with SSRF protection.

    The initial URL and every redirect target are validated with
    ``ssrf_validator.validate_url``. Redirects are followed manually (up to
    ``_MAX_REDIRECTS`` hops); the method for each hop is chosen by
    ``_resolve_redirect_method`` (301/302/303 switch to GET and drop the
    body, 307/308 repeat the POST).

    The request is always issued with ``stream=True`` internally so that
    the size check runs *before* the body is downloaded. When the caller
    did not ask for streaming, the body is then loaded eagerly, so the
    returned response behaves like a normal non-streamed one.

    Args:
        url: URL to request
        data: Data to send in request body
        json: JSON data to send in request body
        timeout: Request timeout in seconds
        allow_localhost: Whether to allow localhost/loopback addresses.
            Set to True for trusted internal services like self-hosted
            search engines (e.g., searxng). Default False.
        allow_private_ips: Whether to allow all private/internal IPs plus localhost.
            This includes RFC1918 (10.x, 172.16-31.x, 192.168.x), CGNAT (100.64.x.x
            used by Podman/rootless containers), link-local (169.254.x.x), and IPv6
            private ranges (fc00::/7, fe80::/10). Use for trusted self-hosted services
            like SearXNG or Ollama in containerized environments.
            Note: AWS metadata endpoint (169.254.169.254) is ALWAYS blocked.
        **kwargs: Additional arguments to pass to requests.post().
            ``allow_redirects`` (default True) controls whether redirects
            are followed; ``stream`` (default False) controls whether the
            body is left unread for the caller.

    Returns:
        Response object

    Raises:
        ValueError: If the URL or a redirect target fails SSRF validation,
            if there are too many redirects, or if the response exceeds
            MAX_RESPONSE_SIZE (or has conflicting Content-Length values).
        requests.RequestException: If request fails
    """
    # Validate URL to prevent SSRF
    if not ssrf_validator.validate_url(
        url,
        allow_localhost=allow_localhost,
        allow_private_ips=allow_private_ips,
    ):
        raise ValueError(
            f"URL failed security validation (possible SSRF): {url}"
        )

    # ``timeout`` is a named parameter, so it can never arrive in **kwargs;
    # set it unconditionally.
    kwargs["timeout"] = timeout

    # Intercept allow_redirects — we handle redirects manually to validate
    # each redirect target against SSRF rules
    caller_wants_redirects = kwargs.pop("allow_redirects", True)
    kwargs["allow_redirects"] = False

    # Always stream: requests downloads the whole body of a non-streamed
    # request before returning, which would make the size check too late.
    caller_wants_stream = kwargs.pop("stream", False)
    kwargs["stream"] = True

    current_url = url
    try:
        response = requests.post(url, data=data, json=json, **kwargs)

        # Follow redirects manually with SSRF validation on each hop.
        # Each hop uses a fresh request without a session, so cookies
        # set by intermediate responses are not carried forward. Callers
        # needing cookie persistence should use SafeSession instead.
        # NOTE(review): caller-supplied headers/auth/cookies in kwargs are
        # forwarded unchanged to every hop, including other hosts.
        if caller_wants_redirects:
            redirect_method = "POST"
            redirects_followed = 0
            while (
                response.status_code in _REDIRECT_STATUS_CODES
                and redirects_followed < _MAX_REDIRECTS
            ):
                redirect_url = (response.headers.get("Location") or "").strip()
                if not redirect_url:
                    break

                # Resolve relative redirects
                redirect_url = urljoin(
                    response.url or current_url, redirect_url
                )

                # Validate redirect target against SSRF rules
                if not ssrf_validator.validate_url(
                    redirect_url,
                    allow_localhost=allow_localhost,
                    allow_private_ips=allow_private_ips,
                ):
                    logger.warning(
                        f"Redirect to {redirect_url} blocked by SSRF validation "
                        f"(from {url}, hop {redirects_followed + 1})"
                    )
                    response.close()
                    raise ValueError(
                        f"Redirect target failed SSRF validation: {redirect_url}"
                    )

                redirect_method = _resolve_redirect_method(
                    redirect_method, response.status_code
                )
                current_url = redirect_url
                response.close()

                if redirect_method == "GET":
                    # 301/302/303: convert to GET, drop body. Once GET, the
                    # method stays GET for all later hops.
                    data = None
                    json = None
                    # An upload passed via ``files`` is part of the body too.
                    kwargs.pop("files", None)
                    response = requests.get(redirect_url, **kwargs)
                else:
                    # 307/308: preserve current method and body
                    response = requests.post(
                        redirect_url, data=data, json=json, **kwargs
                    )
                redirects_followed += 1

            if (
                response.status_code in _REDIRECT_STATUS_CODES
                and redirects_followed >= _MAX_REDIRECTS
            ):
                response.close()
                # Note: raises ValueError here, while SafeSession raises
                # requests.TooManyRedirects (delegated to the base class).
                # Callers should catch ValueError for standalone functions.
                raise ValueError(
                    f"Too many redirects ({_MAX_REDIRECTS}) from {url}"
                )

        _check_response_size(response)

        if not caller_wants_stream:
            # Preserve non-streaming semantics: load the body now, after
            # the size check (and any body guard) is in place.
            try:
                _ = response.content
            except Exception:
                response.close()
                raise

        return response

    except requests.Timeout:
        logger.warning(f"Request timeout after {timeout}s: {current_url}")
        raise
    except requests.RequestException:
        logger.warning(f"Request failed for {current_url}")
        raise

386 

387 

388# Create a safe session class 

class SafeSession(requests.Session):
    """
    Session with built-in SSRF protection.

    Redirect validation relies on ``requests.Session.resolve_redirects()``
    calling ``self.send()`` for each hop — an internal implementation detail
    of the ``requests`` library. This is simpler than re-implementing the
    redirect loop (as ``safe_get``/``safe_post`` do) and keeps session-level
    features (cookies, auth) working. The trade-off is coupling to the
    ``requests`` internals; if a future version stops routing hops through
    ``send()``, redirect targets would no longer be validated.

    Usage:
        with SafeSession() as session:
            response = session.get(url)

        # For trusted internal services (e.g., searxng on localhost):
        with SafeSession(allow_localhost=True) as session:
            response = session.get(url)

        # For trusted internal services on any private network IP:
        with SafeSession(allow_private_ips=True) as session:
            response = session.get(url)

    Raises:
        ValueError: If a URL (initial or redirect target) fails SSRF
            validation, or if the response Content-Length exceeds
            MAX_RESPONSE_SIZE. Note: ``safe_get``/``safe_post`` also raise
            ``ValueError`` for too-many-redirects, but ``SafeSession`` raises
            ``requests.TooManyRedirects`` for that case since it delegates
            redirect counting to the ``requests`` library.
        requests.RequestException: On transport-level failures.
    """

    def __init__(
        self, allow_localhost: bool = False, allow_private_ips: bool = False
    ):
        """
        Initialize SafeSession.

        Args:
            allow_localhost: Whether to allow localhost/loopback addresses.
            allow_private_ips: Whether to allow all private/internal IPs plus localhost.
                This includes RFC1918, CGNAT (100.64.x.x used by Podman), link-local, and
                IPv6 private ranges. Use for trusted self-hosted services like SearXNG or
                Ollama in containerized environments.
                Note: AWS metadata endpoint (169.254.169.254) is ALWAYS blocked.
        """
        super().__init__()
        self.max_redirects = _MAX_REDIRECTS
        self.allow_localhost = allow_localhost
        self.allow_private_ips = allow_private_ips

    def request(self, method: str, url: str, **kwargs) -> requests.Response:  # type: ignore[override]
        """Validate *url* against SSRF rules, then delegate to the base class.

        Applies DEFAULT_TIMEOUT when the caller passes no ``timeout``.

        Raises:
            ValueError: If *url* fails SSRF validation.
        """
        # Validate URL
        if not ssrf_validator.validate_url(
            url,
            allow_localhost=self.allow_localhost,
            allow_private_ips=self.allow_private_ips,
        ):
            raise ValueError(
                f"URL failed security validation (possible SSRF): {url}"
            )

        # Ensure timeout is set (an explicit timeout=None is left alone)
        kwargs.setdefault("timeout", DEFAULT_TIMEOUT)

        return super().request(method, url, **kwargs)

    def send(
        self, request: requests.PreparedRequest, **kwargs
    ) -> requests.Response:
        """Override send to validate every outgoing request against SSRF.

        This runs on **all** calls — both the initial request (routed
        here by ``requests.Session.request()``) and each redirect hop
        (routed here by ``resolve_redirects()``). The initial URL is
        therefore validated twice (once in ``request()``, once here);
        this is intentional defense-in-depth.

        The request is sent with ``stream=True`` so that the size check
        runs before the body is downloaded; when the caller did not ask
        for streaming the body is loaded afterwards, so the returned
        response behaves like a normal non-streamed one.

        Raises:
            ValueError: If the request URL fails SSRF validation or the
                response exceeds MAX_RESPONSE_SIZE.
        """
        if request.url and not ssrf_validator.validate_url(
            request.url,
            allow_localhost=self.allow_localhost,
            allow_private_ips=self.allow_private_ips,
        ):
            logger.warning(
                f"Request to {request.url} blocked by SSRF validation"
            )
            # Note: This error says "security validation" while safe_get/
            # safe_post say "SSRF validation". The difference indicates the
            # source (session vs standalone function) in logs.
            raise ValueError(
                f"Redirect target failed security validation (possible SSRF): {request.url}"
            )

        # Direct send() calls bypass request(); give them the default too.
        kwargs.setdefault("timeout", DEFAULT_TIMEOUT)

        # Always stream: the base class downloads the whole body of a
        # non-streamed request before returning, which would make the size
        # check too late. Redirect hops issued by the base class inherit
        # stream=True from these kwargs, so only the outermost call
        # preloads the final body.
        caller_wants_stream = kwargs.get("stream", self.stream)
        kwargs["stream"] = True

        response = super().send(request, **kwargs)
        _check_response_size(response)

        if not caller_wants_stream:
            try:
                _ = response.content
            except Exception:
                response.close()
                raise

        return response