Coverage for src/local_deep_research/benchmarks/datasets/utils.py: 62%

1"""

2Utility functions for dataset handling.

4This module provides utility functions for common dataset operations like

5decryption, encoding detection, etc.

6"""

8import base64

9import hashlib

10from typing import Dict

12from loguru import logger

def derive_key(password: str, length: int) -> bytes:
    """Stretch the SHA-256 digest of ``password`` to exactly ``length`` bytes.

    The digest of the encoded password is repeated as many whole times as
    fit into ``length`` and then topped up with a prefix of the digest.

    Args:
        password: Text whose encoded bytes are hashed.
        length: Number of key bytes requested.

    Returns:
        The repeated (and possibly truncated) digest.
    """
    digest = hashlib.sha256(password.encode()).digest()
    # Whole copies of the digest, plus how many bytes of one more copy.
    whole_copies, leftover = divmod(length, len(digest))
    return digest * whole_copies + digest[:leftover]

# Characters accepted by the "looks like base64" pre-check in decrypt().
_BASE64_ALPHABET = frozenset(
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/="  # pragma: allowlist secret
)

# Password tried last, after every password-derived candidate has failed.
_BROWSECOMP_FALLBACK_KEY = "MHGGF2022!"  # Known key for BrowseComp dataset


def _looks_like_plaintext(text: str) -> bool:
    """Return True if the first 50 characters are printable ASCII with a space."""
    head = text[:50]
    return " " in head and all(32 <= ord(c) < 127 for c in head)


def _candidate_passwords(password: str) -> list[tuple[str, str]]:
    """Return ``(label, password)`` pairs to try, in priority order."""
    candidates: list[tuple[str, str]] = []
    # A non-string password cannot be hashed; only the fallback key is tried.
    if isinstance(password, str):
        candidates.append(("standard method", password))
        if len(password) > 30:
            words = password.split()
            # A whitespace-only password has no first word to fall back on.
            if words:
                candidates.append(("alternate method 1", words[0]))
        if "GUID" in password:
            guid_part = password.split("GUID")[1].strip()
            candidates.append(("GUID method", guid_part))
    candidates.append(("hardcoded key", _BROWSECOMP_FALLBACK_KEY))
    return candidates


def decrypt(ciphertext_b64: str, password: str) -> str:
    """Decrypt base64-encoded, XOR-encrypted text, trying several keys.

    The payload is base64-decoded once and XORed with a key derived (via
    ``derive_key``) from each candidate password in turn: the full
    password, its first word (only when the password is longer than 30
    characters), the text following ``"GUID"`` (only when present), and
    finally a hardcoded fallback key. The first candidate whose output
    passes the plaintext heuristic wins.

    Args:
        ciphertext_b64: Base64 text to decrypt. Anything that is not a
            string, is shorter than 8 characters, or contains characters
            outside the base64 alphabet is returned unchanged.
        password: Password from which candidate keys are derived.

    Returns:
        The decrypted text, or ``ciphertext_b64`` unchanged when it is not
        decryptable input or no candidate produces plausible text.
    """
    # Skip decryption for non-encoded strings
    if not isinstance(ciphertext_b64, str) or len(ciphertext_b64) < 8:
        return ciphertext_b64

    # Skip if the string doesn't look like base64
    if not _BASE64_ALPHABET.issuperset(ciphertext_b64):
        return ciphertext_b64

    # Decode once: a malformed payload fails identically for every key.
    try:
        encrypted = base64.b64decode(ciphertext_b64)
    except ValueError as e:  # binascii.Error is a ValueError subclass
        logger.debug(f"Standard decryption failed: {e!s}")
        logger.debug(
            f"All decryption attempts failed for: {ciphertext_b64[:20]}..."
        )
        return ciphertext_b64

    for label, candidate in _candidate_passwords(password):
        try:
            key = derive_key(candidate, len(encrypted))
        except ValueError as e:
            # e.g. a password that cannot be encoded; move on to the next key.
            logger.debug(f"Decryption attempt with {label} failed: {e!s}")
            continue

        decrypted = bytes(a ^ b for a, b in zip(encrypted, key, strict=False))
        result = decrypted.decode("utf-8", errors="replace")

        # Heuristic check - printable ASCII that contains at least one space
        if _looks_like_plaintext(result):
            logger.debug(
                f"Successfully decrypted with {label}: {result[:50]}..."
            )
            return result

    # If all attempts fail, return the original
    logger.debug(
        f"All decryption attempts failed for: {ciphertext_b64[:20]}..."
    )
    return ciphertext_b64

123

124

def get_known_answer_map() -> Dict[str, str]:
    """Return the catalog of encrypted answers with verified plaintext.

    The catalog lists encrypted strings that couldn't be decrypted
    automatically, each paired with its verified plaintext value. A new
    dictionary is built on every call.

    Returns:
        Dictionary mapping encrypted strings to their plaintext values.
    """
    # Add more mappings as they are discovered during benchmark runs
    verified_pairs = (
        ("dFoTn+K+bcdyWg==", "Tooth Rock"),
        ("ERFIwA==", "1945"),
    )
    return dict(verified_pairs)

Coverage for src / local_deep_research / benchmarks / datasets / utils.py: 62%

63 statements