Coverage for src/local_deep_research/security/log_sanitizer.py: 100%

19 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-03 23:15 +0000

"""Sanitize raw strings before writing them to log output.

``data_sanitizer.py`` handles dict-key redaction (e.g. stripping API keys
from structured data by key name). This module handles different
concerns:

* :func:`strip_control_chars` / :func:`sanitize_for_log` — make a single
  string value safe to include in a log line by removing non-printable
  characters and truncating to a reasonable length.
* :func:`redact_secrets` — scrub known sensitive *values* (API keys,
  passwords, session tokens) from an arbitrary string before it is
  logged, returned in an error message, or persisted.
"""

14 

15import re 

16from typing import Optional 

17 

18 

# Strip C0/C1 control characters and dangerous Unicode format characters,
# but preserve visible Unicode (accented, CJK, emoji, etc.)
#
# U+2028 / U+2029 are included because they are invisible line breaks:
# ``str.splitlines()`` and many log viewers treat them exactly like "\n",
# so leaving them in would still allow a forged log line.
_UNSAFE_CHAR_RE = re.compile(
    r"[\x00-\x1f\x7f-\x9f"  # C0/C1 control chars
    r"\u061c"  # Arabic letter mark
    r"\u200b-\u200f"  # Zero-width chars + LTR/RTL marks
    r"\u2028\u2029"  # Line separator / paragraph separator
    r"\u202a-\u202e"  # Embedding/override (incl. RLO)
    r"\u2060-\u2064"  # Word joiner + math invisible operators
    r"\u2066-\u2069"  # Isolate chars
    r"\u206a-\u206f"  # Digit shape controls
    r"\ufeff"  # BOM / zero-width no-break space
    r"]"
)

# Default minimum length for a value to be considered a redactable secret.
# Values shorter than this are skipped because a literal ``str.replace`` on
# a short string would produce false positives in normal message content
# (e.g. redacting the 3-char string ``key`` would scrub the word "key"
# everywhere it appears).
_MIN_SECRET_LENGTH = 8

# Replacement token written in place of any redacted secret.
_REDACTION_TOKEN = "***REDACTED***"  # noqa: S105  # gitleaks:allow

42 

43 

def strip_control_chars(value: str) -> str:
    """Return *value* with every character matched by ``_UNSAFE_CHAR_RE`` deleted.

    Matched characters are dropped outright (not replaced with a
    placeholder); every other character, including non-ASCII text, is
    returned unchanged and in its original order.
    """
    return re.sub(_UNSAFE_CHAR_RE, "", value)

47 

48 

def sanitize_for_log(value: str, max_length: int = 50) -> str:
    """Return a log-safe version of *value*.

    * Control and format characters are stripped (see
      :func:`strip_control_chars`); valid Unicode is preserved.
    * The result is truncated to at most *max_length* characters. When
      truncation happens and *max_length* is greater than 3, the last
      three characters of the result are ``"..."`` so the reader can see
      that text was cut; for smaller limits the text is cut hard because
      an ellipsis would not fit.

    Args:
        value: The raw string to clean.
        max_length: Maximum length of the returned string. Negative
            values are treated as 0.

    Returns:
        The cleaned, possibly truncated string.
    """
    # A negative limit would otherwise reach ``cleaned[:max_length]`` and
    # slice from the *end* of the string, returning almost all of it
    # instead of truncating.
    limit = max(max_length, 0)
    cleaned = strip_control_chars(value)
    if len(cleaned) <= limit:
        return cleaned
    if limit > 3:
        return cleaned[: limit - 3] + "..."
    return cleaned[:limit]

63 

64 

def redact_secrets(
    message: str,
    *secrets: Optional[str],
    min_length: int = _MIN_SECRET_LENGTH,
    replacement: str = _REDACTION_TOKEN,
) -> str:
    """Replace each occurrence of any *secret* in *message* with *replacement*.

    Use this before writing a string to a log sink, returning it in an
    error response, or persisting it, whenever the string may have been
    constructed from upstream exception messages, URLs, or other
    sources that could contain a value the caller already knows is
    sensitive.

    Each *secret* is matched as a literal substring, scanning left to
    right for non-overlapping occurrences (the same occurrences
    ``str.replace`` would find). The function does not normalize
    encodings: if a secret appears URL-encoded or otherwise transformed
    in *message*, the transformed form is NOT redacted unless the caller
    also passes that transformed form.

    All matches are located in the ORIGINAL *message* and substituted in
    a single pass, which gives two guarantees when several secrets are
    passed:

    * Matches that overlap (one secret contained in, or partially
      overlapping, another) are merged and replaced by a single
      *replacement*, so no fragment of either secret is left behind.
    * Text inserted as *replacement* is never searched again, so a
      secret that happens to be a substring of *replacement* cannot
      corrupt an earlier redaction.

    Args:
        message: The string to scrub. Returned unchanged if falsy.
        *secrets: Zero or more candidate secret values. ``None``, empty
            strings, and values shorter than *min_length* are silently
            skipped; the caller is responsible for noticing missing
            config.
        min_length: Minimum secret length to redact. Values shorter than
            this are skipped to avoid corrupting normal message content
            (a 1- or 2-character secret would match too aggressively).
            Defaults to 8. Real API keys and session tokens are
            typically 16+ characters.
        replacement: String written in place of each redacted secret.
            Defaults to ``"***REDACTED***"``.

    Returns:
        *message* with every occurrence of each qualifying secret
        replaced.

    See ``tests/security/test_log_sanitizer.py::TestRedactSecrets`` for
    worked examples (doctest examples are omitted because the
    repository's gitleaks rule flags any token-shaped literal in
    docstrings).
    """
    if not message:
        return message

    # A set drops duplicate secrets; the truthiness check also guarantees
    # every candidate is non-empty, which the search loop below relies on
    # to always make progress.
    candidates = {s for s in secrets if s and len(s) >= min_length}

    # Collect the [start, end) span of every match in the original text.
    spans = []
    for secret in candidates:
        width = len(secret)
        start = message.find(secret)
        while start != -1:
            spans.append((start, start + width))
            start = message.find(secret, start + width)
    if not spans:
        return message

    # Walk the spans in order, merging any that overlap, and rebuild the
    # message with one replacement token per merged span.
    spans.sort()
    pieces = []
    cursor = 0  # index in ``message`` up to which output has been emitted
    span_start, span_end = spans[0]
    for start, end in spans[1:]:
        if start < span_end:
            # Overlaps the span being built: extend it.
            span_end = max(span_end, end)
        else:
            pieces.append(message[cursor:span_start])
            pieces.append(replacement)
            cursor = span_end
            span_start, span_end = start, end
    pieces.append(message[cursor:span_start])
    pieces.append(replacement)
    pieces.append(message[span_end:])
    return "".join(pieces)