Coverage for src/local_deep_research/security/log_sanitizer.py: 100%
19 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
1"""Sanitize raw strings before writing them to log output.
3``data_sanitizer.py`` handles dict-key redaction (e.g. stripping API keys
4from structured data by key name). This module handles different
5concerns:
7* :func:`strip_control_chars` / :func:`sanitize_for_log` — make a single
8 string value safe to include in a log line by removing non-printable
9 characters and truncating to a reasonable length.
10* :func:`redact_secrets` — scrub known sensitive *values* (API keys,
11 passwords, session tokens) from an arbitrary string before it is
12 logged, returned in an error message, or persisted.
13"""
15import re
16from typing import Optional
19# Strip C0/C1 control characters and dangerous Unicode format characters,
20# but preserve visible Unicode (accented, CJK, emoji, etc.)
21_UNSAFE_CHAR_RE = re.compile(
22 r"[\x00-\x1f\x7f-\x9f" # C0/C1 control chars
23 r"\u061c" # Arabic letter mark
24 r"\u200b-\u200f" # Zero-width chars + LTR/RTL marks
25 r"\u202a-\u202e" # Embedding/override (incl. RLO)
26 r"\u2060-\u2064" # Word joiner + math invisible operators
27 r"\u2066-\u2069" # Isolate chars
28 r"\u206a-\u206f" # Digit shape controls
29 r"\ufeff" # BOM / zero-width no-break space
30 r"]"
31)
33# Default minimum length for a value to be considered a redactable secret.
34# Values shorter than this are skipped because a literal ``str.replace`` on
35# a short string would produce false positives in normal message content
36# (e.g. redacting the 3-char string ``key`` would scrub the word "key"
37# everywhere it appears).
38_MIN_SECRET_LENGTH = 8
40# Replacement token written in place of any redacted secret.
41_REDACTION_TOKEN = "***REDACTED***" # noqa: S105 # gitleaks:allow
def strip_control_chars(value: str) -> str:
    """Return *value* with every character matched by ``_UNSAFE_CHAR_RE`` removed.

    Control and invisible format characters are deleted outright (not
    replaced by a placeholder); all other characters are kept as-is.
    """
    stripped = _UNSAFE_CHAR_RE.sub("", value)
    return stripped
def sanitize_for_log(value: str, max_length: int = 50) -> str:
    """Return a log-safe version of *value*.

    * Control and format characters are stripped; valid Unicode is preserved.
    * The result is truncated to at most *max_length* characters. When
      truncation happens and *max_length* is greater than 3, the result ends
      with ``...`` (counted within the limit); for limits of 3 or less the
      value is cut hard with no ellipsis.

    Args:
        value: Raw string that is about to be written to a log line.
        max_length: Maximum length of the returned string. Zero or a
            negative number yields an empty string.

    Returns:
        The cleaned and, if necessary, truncated string.
    """
    # Clamp the limit: a negative value used directly as a slice bound would
    # count from the end of the string and return more than the limit allows.
    limit = max(max_length, 0)
    cleaned = strip_control_chars(value)
    if len(cleaned) <= limit:
        return cleaned
    if limit > 3:
        # Reserve three characters of the budget for the ellipsis.
        return cleaned[: limit - 3] + "..."
    return cleaned[:limit]
def redact_secrets(
    message: str,
    *secrets: Optional[str],
    min_length: int = _MIN_SECRET_LENGTH,
    replacement: str = _REDACTION_TOKEN,
) -> str:
    """Scrub known secret values out of *message*.

    Intended for strings that are about to be logged, returned in an error
    response, or persisted, and that may embed a value the caller already
    knows to be sensitive (for example because the string was built from an
    upstream exception message or a URL).

    Matching is a plain literal substring replacement (``str.replace``).
    No encoding normalization is performed: a secret that shows up
    URL-encoded or otherwise transformed in *message* is left alone unless
    the caller passes that transformed form as well.

    Qualifying secrets are processed from longest to shortest, so a short
    secret that is itself a substring of a longer one cannot eat part of the
    longer match first. For instance, with ``"abc12345"`` and
    ``"sk-abc12345"`` the longer value is replaced before the shorter one.

    Args:
        message: The text to scrub. A falsy value is returned unchanged.
        *secrets: Candidate secret values. ``None``, empty strings and
            values shorter than *min_length* are skipped silently — noticing
            missing configuration is the caller's job.
        min_length: Smallest secret length that is redacted. Shorter values
            are ignored because replacing them would mangle ordinary
            message text.
        replacement: Text written in place of each redacted secret.

    Returns:
        *message* with every occurrence of each qualifying secret replaced
        by *replacement*.

    See ``tests/security/test_log_sanitizer.py::TestRedactSecrets`` for
    worked examples (doctest examples are omitted because the repository's
    gitleaks rule flags any token-shaped literal in docstrings).
    """
    if not message:
        return message

    # Keep only usable candidates, then order them longest-first so that an
    # overlapping shorter secret cannot break up a longer one.
    candidates = [s for s in secrets if s and len(s) >= min_length]
    candidates.sort(key=len, reverse=True)

    scrubbed = message
    for candidate in candidates:
        scrubbed = scrubbed.replace(candidate, replacement)
    return scrubbed