Coverage for src / local_deep_research / text_processing / text_cleaner.py: 73%

9 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

1""" 

2Text Cleaning Utilities 

3 

4Provides functions for cleaning text with encoding issues. 

5""" 

6 

7from loguru import logger 

8 

9 

def remove_surrogates(text: str) -> str:
    """
    Make text safe to encode as UTF-8 by neutralising surrogate code points.

    Text pulled out of PDFs (and other extraction pipelines) may carry
    surrogate code points, which a strict UTF-8 encoder rejects. The text is
    round-tripped through bytes: surrogates are let through on the way out
    and then come back as U+FFFD replacement characters on the way in,
    because their byte form is not valid UTF-8. Text without surrogates is
    returned unchanged in content.

    Args:
        text: Raw text that may contain surrogate characters. Falsy values
            (empty string, None) are handed back untouched.

    Returns:
        Text that can be encoded as UTF-8 without raising.
    """
    if text:
        try:
            # "surrogatepass" lets the encoder emit bytes for surrogates
            # instead of raising; those bytes are invalid UTF-8, so the
            # "replace" decoder swaps them for replacement characters.
            raw = text.encode("utf-8", errors="surrogatepass")
            return raw.decode("utf-8", errors="replace")
        except Exception as e:
            logger.warning(
                f"Error cleaning text with surrogatepass: {e}, using fallback"
            )
            # Last resort: silently drop whatever cannot be encoded/decoded.
            lossy = text.encode("utf-8", errors="ignore")
            return lossy.decode("utf-8", errors="ignore")

    # Nothing to clean: return the falsy input exactly as received.
    return text