Coverage for src / local_deep_research / benchmarks / datasets / utils.py: 87%
63 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
1"""
Utility functions for dataset handling.

This module provides utility functions for common dataset operations like
decryption, encoding detection, etc.
6"""
8import base64
9import hashlib
10from typing import Dict
12from loguru import logger
def derive_key(password: str, length: int) -> bytes:
    """Derive a fixed-length key from the password using SHA256.

    The SHA-256 digest of the encoded password is repeated as many whole
    times as fit into ``length`` and then topped up with a prefix of the
    digest, so the result is exactly ``length`` bytes long.

    Args:
        password: Text that is hashed to produce the key material.
        length: Number of key bytes required; must not be negative.

    Returns:
        ``length`` bytes of key material (empty when ``length`` is 0).

    Raises:
        ValueError: If ``length`` is negative. Without this guard the
            floor-division/modulo arithmetic would silently return a key
            of ``length % 32`` bytes instead of failing.
    """
    if length < 0:
        raise ValueError(f"length must be non-negative, got {length}")

    digest = hashlib.sha256(password.encode()).digest()
    # Whole repetitions of the digest, plus however many bytes are left over.
    repeats, remainder = divmod(length, len(digest))
    return digest * repeats + digest[:remainder]
23def decrypt(ciphertext_b64: str, password: str) -> str:
24 """
25 Decrypt base64-encoded ciphertext with XOR.
26 Uses multiple approaches to handle different encoding formats.
27 """
28 # Skip decryption for non-encoded strings
29 if not isinstance(ciphertext_b64, str) or len(ciphertext_b64) < 8:
30 return ciphertext_b64
32 # Skip if the string doesn't look like base64
33 if not all(
34 c
35 in "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=" # pragma: allowlist secret
36 for c in ciphertext_b64
37 ):
38 return ciphertext_b64
40 # Attempt standard decryption
41 try:
42 encrypted = base64.b64decode(ciphertext_b64)
43 key = derive_key(password, len(encrypted))
44 decrypted = bytes(a ^ b for a, b in zip(encrypted, key, strict=False))
46 # Check if the result looks like valid text
47 result = decrypted.decode("utf-8", errors="replace")
49 # Heuristic check - if the decrypted text is mostly ASCII and contains spaces
50 if all(32 <= ord(c) < 127 for c in result[:50]) and " " in result[:50]:
51 logger.debug(
52 f"Successfully decrypted with standard method: {result[:50]}..."
53 )
54 return result
55 except Exception as e:
56 logger.debug(f"Standard decryption failed: {e!s}")
58 # Alternative method - try using just the first part of the password
59 try:
60 if len(password) > 30:
61 alt_password = password.split()[0] # Use first word
62 encrypted = base64.b64decode(ciphertext_b64)
63 key = derive_key(alt_password, len(encrypted))
64 decrypted = bytes(
65 a ^ b for a, b in zip(encrypted, key, strict=False)
66 )
68 result = decrypted.decode("utf-8", errors="replace")
69 if ( 69 ↛ 84line 69 didn't jump to line 84 because the condition on line 69 was always true
70 all(32 <= ord(c) < 127 for c in result[:50])
71 and " " in result[:50]
72 ):
73 logger.debug(
74 f"Successfully decrypted with alternate method 1: {result[:50]}..."
75 )
76 return result
77 except Exception:
78 logger.debug(
79 "best-effort decryption, falls through to next method",
80 exc_info=True,
81 )
83 # Alternative method 2 - try using the GUID part
84 try:
85 if "GUID" in password:
86 guid_part = password.split("GUID")[1].strip()
87 encrypted = base64.b64decode(ciphertext_b64)
88 key = derive_key(guid_part, len(encrypted))
89 decrypted = bytes(
90 a ^ b for a, b in zip(encrypted, key, strict=False)
91 )
93 result = decrypted.decode("utf-8", errors="replace")
94 if ( 94 ↛ 109line 94 didn't jump to line 109 because the condition on line 94 was always true
95 all(32 <= ord(c) < 127 for c in result[:50])
96 and " " in result[:50]
97 ):
98 logger.debug(
99 f"Successfully decrypted with GUID method: {result[:50]}..."
100 )
101 return result
102 except Exception:
103 logger.debug(
104 "best-effort decryption, falls through to next method",
105 exc_info=True,
106 )
108 # Alternative method 3 - hardcoded key for BrowseComp
109 try:
110 hardcoded_key = "MHGGF2022!" # Known key for BrowseComp dataset
111 encrypted = base64.b64decode(ciphertext_b64)
112 key = derive_key(hardcoded_key, len(encrypted))
113 decrypted = bytes(a ^ b for a, b in zip(encrypted, key, strict=False))
115 result = decrypted.decode("utf-8", errors="replace")
116 if all(32 <= ord(c) < 127 for c in result[:50]) and " " in result[:50]:
117 logger.debug(
118 f"Successfully decrypted with hardcoded key: {result[:50]}..."
119 )
120 return result
121 except Exception:
122 logger.debug(
123 "best-effort decryption, falls through to failure handling",
124 exc_info=True,
125 )
127 # If all attempts fail, return the original
128 logger.debug(
129 f"All decryption attempts failed for: {ciphertext_b64[:20]}..."
130 )
131 return ciphertext_b64
def get_known_answer_map() -> Dict[str, str]:
    """Get a mapping of known encrypted answers to their decrypted values.

    This function maintains a catalog of known encrypted strings that
    couldn't be automatically decrypted, along with their verified
    plaintext values.

    A new dictionary is built on every call, so callers may mutate the
    result without affecting later calls.

    Returns:
        Dictionary mapping encrypted strings to their plaintext values.
    """
    known_answers: Dict[str, str] = {
        "dFoTn+K+bcdyWg==": "Tooth Rock",
        "ERFIwA==": "1945",
        # Add more mappings as they are discovered during benchmark runs
    }
    return known_answers