Coverage for src / local_deep_research / text_processing / text_cleaner.py: 73%
9 statements
« prev ^ index » next coverage.py v7.12.0, created at 2026-01-11 00:51 +0000
« prev ^ index » next coverage.py v7.12.0, created at 2026-01-11 00:51 +0000
1"""
2Text Cleaning Utilities
4Provides functions for cleaning text with encoding issues.
5"""
7from loguru import logger
def remove_surrogates(text: str) -> str:
    """
    Return a copy of ``text`` that can always be encoded as UTF-8.

    Text pulled out of PDFs (and other extraction pipelines) may carry
    surrogate code points, which a strict UTF-8 encoder rejects. The text is
    round-tripped through UTF-8 bytes so that those code points come back as
    replacement characters instead of raising later on.

    Args:
        text: Raw text that may contain surrogate characters

    Returns:
        Text that is safe for UTF-8 encoding. Falsy input (e.g. an empty
        string) is handed back unchanged.
    """
    if text:
        try:
            # "surrogatepass" lets the encoder emit bytes for surrogates
            # instead of failing; those bytes are not valid UTF-8, so the
            # "replace" decoder swaps them for replacement characters.
            utf8_bytes = text.encode("utf-8", errors="surrogatepass")
            return utf8_bytes.decode("utf-8", errors="replace")
        except Exception as exc:
            logger.warning(
                f"Error cleaning text with surrogatepass: {exc}, using fallback"
            )
            # Best-effort fallback: silently drop whatever cannot be
            # encoded or decoded.
            lossy_bytes = text.encode("utf-8", errors="ignore")
            return lossy_bytes.decode("utf-8", errors="ignore")
    return text