Coverage for src / local_deep_research / benchmarks / datasets / utils.py: 87%

63 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1""" 

2Utility functions for dataset handling. 

3 

4This module provides utility functions for common dataset operations like 

5decryption, encoding detection, etc. 

6""" 

7 

8import base64 

9import hashlib 

10from typing import Dict 

11 

12from loguru import logger 

13 

14 

def derive_key(password: str, length: int) -> bytes:
    """Stretch the SHA-256 digest of ``password`` to ``length`` bytes.

    The digest is repeated as many whole times as fit into ``length`` and
    then completed with a prefix of the digest, so the result is the digest
    cycled until ``length`` bytes have been produced.

    Args:
        password: Text to hash; encoded with the default ``str.encode()``.
        length: Number of key bytes requested.

    Returns:
        The derived key material.
    """
    digest = hashlib.sha256(password.encode()).digest()
    # divmod gives the same quotient/remainder pair as ``//`` and ``%``.
    whole_copies, leftover = divmod(length, len(digest))
    return digest * whole_copies + digest[:leftover]

21 

22 

23def decrypt(ciphertext_b64: str, password: str) -> str: 

24 """ 

25 Decrypt base64-encoded ciphertext with XOR. 

26 Uses multiple approaches to handle different encoding formats. 

27 """ 

28 # Skip decryption for non-encoded strings 

29 if not isinstance(ciphertext_b64, str) or len(ciphertext_b64) < 8: 

30 return ciphertext_b64 

31 

32 # Skip if the string doesn't look like base64 

33 if not all( 

34 c 

35 in "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=" # pragma: allowlist secret 

36 for c in ciphertext_b64 

37 ): 

38 return ciphertext_b64 

39 

40 # Attempt standard decryption 

41 try: 

42 encrypted = base64.b64decode(ciphertext_b64) 

43 key = derive_key(password, len(encrypted)) 

44 decrypted = bytes(a ^ b for a, b in zip(encrypted, key, strict=False)) 

45 

46 # Check if the result looks like valid text 

47 result = decrypted.decode("utf-8", errors="replace") 

48 

49 # Heuristic check - if the decrypted text is mostly ASCII and contains spaces 

50 if all(32 <= ord(c) < 127 for c in result[:50]) and " " in result[:50]: 

51 logger.debug( 

52 f"Successfully decrypted with standard method: {result[:50]}..." 

53 ) 

54 return result 

55 except Exception as e: 

56 logger.debug(f"Standard decryption failed: {e!s}") 

57 

58 # Alternative method - try using just the first part of the password 

59 try: 

60 if len(password) > 30: 

61 alt_password = password.split()[0] # Use first word 

62 encrypted = base64.b64decode(ciphertext_b64) 

63 key = derive_key(alt_password, len(encrypted)) 

64 decrypted = bytes( 

65 a ^ b for a, b in zip(encrypted, key, strict=False) 

66 ) 

67 

68 result = decrypted.decode("utf-8", errors="replace") 

69 if ( 69 ↛ 84line 69 didn't jump to line 84 because the condition on line 69 was always true

70 all(32 <= ord(c) < 127 for c in result[:50]) 

71 and " " in result[:50] 

72 ): 

73 logger.debug( 

74 f"Successfully decrypted with alternate method 1: {result[:50]}..." 

75 ) 

76 return result 

77 except Exception: 

78 logger.debug( 

79 "best-effort decryption, falls through to next method", 

80 exc_info=True, 

81 ) 

82 

83 # Alternative method 2 - try using the GUID part 

84 try: 

85 if "GUID" in password: 

86 guid_part = password.split("GUID")[1].strip() 

87 encrypted = base64.b64decode(ciphertext_b64) 

88 key = derive_key(guid_part, len(encrypted)) 

89 decrypted = bytes( 

90 a ^ b for a, b in zip(encrypted, key, strict=False) 

91 ) 

92 

93 result = decrypted.decode("utf-8", errors="replace") 

94 if ( 94 ↛ 109line 94 didn't jump to line 109 because the condition on line 94 was always true

95 all(32 <= ord(c) < 127 for c in result[:50]) 

96 and " " in result[:50] 

97 ): 

98 logger.debug( 

99 f"Successfully decrypted with GUID method: {result[:50]}..." 

100 ) 

101 return result 

102 except Exception: 

103 logger.debug( 

104 "best-effort decryption, falls through to next method", 

105 exc_info=True, 

106 ) 

107 

108 # Alternative method 3 - hardcoded key for BrowseComp 

109 try: 

110 hardcoded_key = "MHGGF2022!" # Known key for BrowseComp dataset 

111 encrypted = base64.b64decode(ciphertext_b64) 

112 key = derive_key(hardcoded_key, len(encrypted)) 

113 decrypted = bytes(a ^ b for a, b in zip(encrypted, key, strict=False)) 

114 

115 result = decrypted.decode("utf-8", errors="replace") 

116 if all(32 <= ord(c) < 127 for c in result[:50]) and " " in result[:50]: 

117 logger.debug( 

118 f"Successfully decrypted with hardcoded key: {result[:50]}..." 

119 ) 

120 return result 

121 except Exception: 

122 logger.debug( 

123 "best-effort decryption, falls through to failure handling", 

124 exc_info=True, 

125 ) 

126 

127 # If all attempts fail, return the original 

128 logger.debug( 

129 f"All decryption attempts failed for: {ciphertext_b64[:20]}..." 

130 ) 

131 return ciphertext_b64 

132 

133 

def get_known_answer_map() -> Dict[str, str]:
    """Return the catalog of encrypted answers with known plaintext.

    The entries are encrypted strings that couldn't be automatically
    decrypted, paired with their verified plaintext values. A new
    dictionary is built on every call.

    Returns:
        Dictionary mapping encrypted strings to their plaintext values.
    """
    known_answers: Dict[str, str] = {
        "dFoTn+K+bcdyWg==": "Tooth Rock",
        "ERFIwA==": "1945",
        # Add more mappings as they are discovered during benchmark runs
    }
    return known_answers