Coverage for src / local_deep_research / security / log_sanitizer.py: 100%
9 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
1"""Sanitize raw strings before writing them to log output.
3``data_sanitizer.py`` handles dict-key redaction (e.g. stripping API keys
4from structured data). This module handles a different concern: making a
5single *string value* safe to include in a log line by removing
6non-printable characters and truncating to a reasonable length.
7"""
9import re
# Characters that must never reach a log line:
#   * C0/C1 control characters (this range includes CR, LF, TAB and NEL),
#   * Unicode line/paragraph separators, which many consumers treat as line
#     breaks and which could therefore be used to split or forge log entries,
#   * invisible format characters that can hide or visually reorder text
#     (zero-width characters, bidi embeddings/overrides/isolates, BOM).
# Visible Unicode (accented letters, CJK, emoji, etc.) is left untouched.
# Note that U+200D (zero-width joiner) falls inside the zero-width range, so
# joined emoji sequences are reduced to their individual component emoji.
_UNSAFE_CHAR_RE = re.compile(
    r"[\x00-\x1f\x7f-\x9f"  # C0/C1 control chars
    r"\u061c"  # Arabic letter mark
    r"\u200b-\u200f"  # Zero-width chars + LTR/RTL marks
    r"\u2028\u2029"  # Line separator / paragraph separator
    r"\u202a-\u202e"  # Embedding/override (incl. RLO)
    r"\u2060-\u2064"  # Word joiner + math invisible operators
    r"\u2066-\u2069"  # Isolate chars
    r"\u206a-\u206f"  # Digit shape controls
    r"\ufeff"  # BOM / zero-width no-break space
    r"]"
)


def strip_control_chars(value: str) -> str:
    """Return *value* with control, line-breaking and format characters removed.

    Every character matched by ``_UNSAFE_CHAR_RE`` is deleted (not replaced
    with a placeholder), so the result may be shorter than the input.
    Printable characters, including non-ASCII ones, pass through unchanged.

    Args:
        value: The raw string to clean.

    Returns:
        A copy of *value* without any of the unsafe characters.
    """
    return _UNSAFE_CHAR_RE.sub("", value)
def sanitize_for_log(value: str, max_length: int = 50) -> str:
    """Return a log-safe version of *value*.

    * Control and format characters are stripped; valid Unicode is preserved.
    * The result is truncated to at most *max_length* characters. When
      truncation happens and *max_length* is greater than 3, the last three
      characters of the result are ``"..."`` to signal that text was cut.
    * A *max_length* of zero or less yields an empty string whenever the
      cleaned text is non-empty.

    Args:
        value: The raw string to make safe for logging.
        max_length: Upper bound on the length of the returned string.

    Returns:
        The cleaned and, if necessary, shortened string.
    """
    # Clamp the limit: a negative value used directly as a slice bound would
    # count from the end of the string and return more text than requested.
    limit = max(max_length, 0)

    # Strip first, then measure, so removed characters do not count towards
    # the length budget.
    cleaned = strip_control_chars(value)
    if len(cleaned) <= limit:
        return cleaned

    # Only add an ellipsis when there is room for at least one real character
    # in front of it; otherwise fall back to a hard cut.
    if limit > 3:
        return cleaned[: limit - 3] + "..."
    return cleaned[:limit]