Coverage for src / local_deep_research / benchmarks / datasets / utils.py: 62%

63 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

1""" 

2Utility functions for dataset handling. 

3 

4This module provides utility functions for common dataset operations like 

5decryption, encoding detection, etc. 

6""" 

7 

8import base64 

9import hashlib 

10from typing import Dict 

11 

12from loguru import logger 

13 

14 

def derive_key(password: str, length: int) -> bytes:
    """Stretch ``password`` into a key of exactly ``length`` bytes.

    The key material is the SHA-256 digest of the encoded password. The
    digest is repeated end to end and cut off as soon as ``length`` bytes
    have been produced, so requests longer than one digest simply cycle
    through the same 32 bytes again.

    Args:
        password: Text the key is derived from.
        length: Number of key bytes to produce.

    Returns:
        The repeated/truncated digest bytes.
    """
    digest = hashlib.sha256(password.encode()).digest()
    # Whole copies of the digest, plus the partial copy that tops it up.
    full_repeats, leftover = divmod(length, len(digest))
    return digest * full_repeats + digest[:leftover]

21 

22 

# Characters allowed in standard base64 text (padding included). A frozenset
# gives O(1) membership tests instead of scanning the literal per character.
_BASE64_ALPHABET = frozenset(
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/="  # pragma: allowlist secret
)

# Last-resort password: known key for the BrowseComp dataset.
_BROWSECOMP_FALLBACK_KEY = "MHGGF2022!"


def _looks_like_plaintext(text: str) -> bool:
    """Return True if the first 50 chars are printable ASCII and include a space."""
    head = text[:50]
    return all(32 <= ord(ch) < 127 for ch in head) and " " in head


def _candidate_passwords(password: str) -> list[tuple[str, str]]:
    """Return ``(log_label, password)`` pairs in the order they should be tried."""
    candidates = [("standard method", password)]
    # The derived variants only make sense for real strings; anything else is
    # left to fail (and be logged) in the standard attempt.
    if isinstance(password, str):
        if len(password) > 30:
            words = password.split()
            # A long whitespace-only password has no first word to try.
            if words:
                candidates.append(("alternate method 1", words[0]))
        if "GUID" in password:
            guid_part = password.split("GUID")[1].strip()
            candidates.append(("GUID method", guid_part))
    candidates.append(("hardcoded key", _BROWSECOMP_FALLBACK_KEY))
    return candidates


def decrypt(ciphertext_b64: str, password: str) -> str:
    """Decrypt base64-encoded, XOR-encrypted text on a best-effort basis.

    The ciphertext is base64-decoded once and then XOR-ed with a key derived
    (via ``derive_key``) from each candidate password in turn:

    1. the password as given,
    2. its first whitespace-separated word, if the password is longer than
       30 characters,
    3. the text following the first ``"GUID"`` marker, if present,
    4. the hardcoded BrowseComp key.

    The first candidate whose output looks like plain text (first 50
    characters are printable ASCII and contain a space) wins. This function
    never raises for bad input; it falls back to returning the input.

    Args:
        ciphertext_b64: Base64 text to decrypt. Values that are not strings,
            are shorter than 8 characters, or contain characters outside the
            base64 alphabet are treated as already-plain and returned as-is.
        password: Password used to derive the XOR key.

    Returns:
        The decrypted text, or ``ciphertext_b64`` unchanged when it does not
        look encrypted or no candidate produced plausible plain text.
    """
    # Skip decryption for non-encoded strings.
    if not isinstance(ciphertext_b64, str) or len(ciphertext_b64) < 8:
        return ciphertext_b64

    # Skip if the string doesn't look like base64.
    if not all(ch in _BASE64_ALPHABET for ch in ciphertext_b64):
        return ciphertext_b64

    # Decode once: the bytes are identical for every candidate password, and
    # a decoding failure means no candidate can succeed.
    try:
        encrypted = base64.b64decode(ciphertext_b64)
    except ValueError as exc:  # binascii.Error is a ValueError subclass
        logger.debug(f"Standard decryption failed: {exc!s}")
        logger.debug(
            f"All decryption attempts failed for: {ciphertext_b64[:20]}..."
        )
        return ciphertext_b64

    for label, candidate in _candidate_passwords(password):
        try:
            key = derive_key(candidate, len(encrypted))
        except Exception as exc:
            # Best-effort by design: a candidate that cannot yield a key
            # (e.g. a non-string password) is logged and skipped, not raised.
            logger.debug(f"Decryption with {label} failed: {exc!s}")
            continue

        decrypted = bytes(a ^ b for a, b in zip(encrypted, key, strict=False))
        result = decrypted.decode("utf-8", errors="replace")

        # Heuristic check - accept only output that reads like plain text.
        if _looks_like_plaintext(result):
            logger.debug(
                f"Successfully decrypted with {label}: {result[:50]}..."
            )
            return result

    # If all attempts fail, return the original.
    logger.debug(
        f"All decryption attempts failed for: {ciphertext_b64[:20]}..."
    )
    return ciphertext_b64

123 

124 

def get_known_answer_map() -> Dict[str, str]:
    """Return the lookup table of manually verified answers.

    Some encrypted strings could not be decrypted automatically; this
    catalog pairs each such string with its verified plaintext value.
    A new dictionary is built on every call.

    Returns:
        Dictionary whose keys are the encrypted strings and whose values
        are the corresponding plaintext answers.
    """
    known_answers: Dict[str, str] = {
        "dFoTn+K+bcdyWg==": "Tooth Rock",
        "ERFIwA==": "1945",
    }
    # Add more mappings as they are discovered during benchmark runs
    return known_answers