Coverage for src/local_deep_research/text_processing/text

"""
Text Cleaning Utilities

Provides functions for cleaning text with encoding issues.
"""

7from loguru import logger

def remove_surrogates(text: str) -> str:
    """
    Make text safe for UTF-8 encoding by neutralizing surrogate characters.

    PDF extraction and other text extraction methods can produce malformed
    Unicode containing surrogate code points, which a strict UTF-8 encode
    rejects. This function returns a string that can always be encoded.

    On the primary path surrogates are not deleted: they are substituted
    with U+FFFD replacement characters. Only the fallback path drops
    offending characters outright.

    Args:
        text: Raw text that may contain surrogate characters. Falsy values
            (for example an empty string) are returned unchanged.

    Returns:
        Cleaned text safe for UTF-8 encoding.
    """
    if not text:
        return text

    try:
        # "surrogatepass" lets the encoder emit bytes for surrogate code
        # points instead of raising; those byte sequences are not valid
        # UTF-8, so decoding with "replace" turns them into U+FFFD.
        return text.encode("utf-8", errors="surrogatepass").decode(
            "utf-8", errors="replace"
        )
    except Exception as e:
        logger.warning(
            f"Error cleaning text with surrogatepass: {e}, using fallback"
        )
        # Fallback: ignore any characters that can't be encoded
        return text.encode("utf-8", errors="ignore").decode(
            "utf-8", errors="ignore"
        )

Coverage for src / local_deep_research / text_processing / text_cleaner.py: 73%