Coverage for src / local_deep_research / security / log_sanitizer.py: 100%

9 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1"""Sanitize raw strings before writing them to log output. 

2 

3``data_sanitizer.py`` handles dict-key redaction (e.g. stripping API keys 

4from structured data). This module handles a different concern: making a 

5single *string value* safe to include in a log line by removing 

6non-printable characters and truncating to a reasonable length. 

7""" 

8 

9import re 

10 

# Strip C0/C1 control characters, Unicode line/paragraph separators and
# dangerous Unicode format characters, but preserve visible Unicode
# (accented, CJK, emoji, etc.)
_UNSAFE_CHAR_RE = re.compile(
    r"[\x00-\x1f\x7f-\x9f"  # C0/C1 control chars
    r"\u061c"  # Arabic letter mark
    r"\u200b-\u200f"  # Zero-width chars + LTR/RTL marks
    # U+2028/U+2029 are not control or format characters, but
    # str.splitlines() and many log viewers treat them as line breaks, so
    # leaving them in would let a value split one log line into several.
    r"\u2028\u2029"  # Line separator + paragraph separator
    r"\u202a-\u202e"  # Embedding/override (incl. RLO)
    r"\u2060-\u2064"  # Word joiner + math invisible operators
    r"\u2066-\u2069"  # Isolate chars
    r"\u206a-\u206f"  # Digit shape controls
    r"\ufeff"  # BOM / zero-width no-break space
    r"]"
)


def strip_control_chars(value: str) -> str:
    """Remove control, line-separator and format characters from *value*.

    Every character matched by ``_UNSAFE_CHAR_RE`` is deleted (not replaced
    with a placeholder), so the result may be shorter than the input. Visible
    Unicode such as accented letters, CJK and emoji code points is preserved.

    Args:
        value: The raw string to clean.

    Returns:
        *value* with all unsafe characters removed.
    """
    return _UNSAFE_CHAR_RE.sub("", value)

29 

30 

def sanitize_for_log(value: str, max_length: int = 50) -> str:
    """Return a log-safe version of *value*.

    * Control and format characters are stripped (via
      ``strip_control_chars``); valid Unicode is preserved.
    * The result is truncated to at most *max_length* characters. When
      truncation happens and *max_length* is greater than 3, the result ends
      with ``"..."`` and is exactly *max_length* characters long; for smaller
      limits the text is cut without an ellipsis.
    * A negative *max_length* is treated as 0, so the result is empty.

    Args:
        value: The raw string to make safe for logging.
        max_length: Maximum number of characters in the returned string.

    Returns:
        The cleaned, possibly truncated string.
    """
    cleaned = strip_control_chars(value)

    # Clamp the limit: a negative slice bound counts from the end of the
    # string and would return text longer than the requested maximum.
    limit = max(max_length, 0)
    if len(cleaned) <= limit:
        return cleaned

    # Only add the ellipsis when there is room for at least one real
    # character in front of it.
    if limit > 3:
        return cleaned[: limit - 3] + "..."
    return cleaned[:limit]