Coverage for src / local_deep_research / benchmarks / datasets / utils.py: 62%
63 statements
« prev ^ index » next coverage.py v7.12.0, created at 2026-01-11 00:51 +0000
« prev ^ index » next coverage.py v7.12.0, created at 2026-01-11 00:51 +0000
1"""
2Utility functions for dataset handling.
4This module provides utility functions for common dataset operations like
5decryption, encoding detection, etc.
6"""
8import base64
9import hashlib
10from typing import Dict
12from loguru import logger
def derive_key(password: str, length: int) -> bytes:
    """Stretch the SHA-256 digest of ``password`` into a key of ``length`` bytes.

    The digest is repeated as many whole times as fit into ``length`` and
    then topped up with a prefix of the digest to cover the remainder.

    Args:
        password: Text that is encoded with the default codec and hashed.
        length: Number of key bytes requested.

    Returns:
        The repeated (and possibly truncated) digest bytes.
    """
    digest = hashlib.sha256(password.encode()).digest()
    # divmod gives the number of whole digest copies and the leftover bytes.
    whole_copies, leftover = divmod(length, len(digest))
    return digest * whole_copies + digest[:leftover]
def decrypt(ciphertext_b64: str, password: str) -> str:
    """
    Decrypt base64-encoded ciphertext with XOR.

    The ciphertext is base64-decoded once and then XOR-ed against keys
    derived (via ``derive_key``) from a series of candidate passwords, in
    this order:

    1. ``password`` itself ("standard method"),
    2. the first whitespace-separated word of ``password``, only when
       ``password`` is longer than 30 characters,
    3. the text following the first ``"GUID"`` marker in ``password``,
    4. a hardcoded key known for the BrowseComp dataset.

    The first candidate whose output passes the plaintext heuristic (see
    ``_looks_like_plaintext``) wins.

    Args:
        ciphertext_b64: The value to decrypt. Anything that is not a string,
            is shorter than 8 characters, or contains characters outside the
            base64 alphabet is returned unchanged.
        password: Password used to derive the XOR key. A non-string value
            only disables the password-based candidates; the hardcoded key
            is still tried.

    Returns:
        The decrypted text when a candidate yields plausible plaintext,
        otherwise ``ciphertext_b64`` exactly as it was passed in.
    """
    # Skip decryption for non-encoded strings
    if not isinstance(ciphertext_b64, str) or len(ciphertext_b64) < 8:
        return ciphertext_b64

    # Skip if the string doesn't look like base64
    base64_alphabet = frozenset(
        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/="  # pragma: allowlist secret
    )
    if not base64_alphabet.issuperset(ciphertext_b64):
        return ciphertext_b64

    # Decode once: the bytes are identical for every candidate password.
    try:
        encrypted = base64.b64decode(ciphertext_b64)
    except ValueError as e:
        # binascii.Error (e.g. incorrect padding) is a ValueError subclass.
        logger.debug(f"Standard decryption failed: {e!s}")
        logger.debug(
            f"All decryption attempts failed for: {ciphertext_b64[:20]}..."
        )
        return ciphertext_b64

    for label, candidate in _candidate_passwords(password):
        try:
            key = derive_key(candidate, len(encrypted))
        except ValueError as e:
            # e.g. UnicodeEncodeError for text that cannot be encoded;
            # stay best-effort and move on to the next candidate.
            logger.debug(f"Key derivation failed for {label}: {e!s}")
            continue

        decrypted = bytes(a ^ b for a, b in zip(encrypted, key))
        # errors="replace" means decoding itself can never raise.
        result = decrypted.decode("utf-8", errors="replace")
        if _looks_like_plaintext(result):
            logger.debug(
                f"Successfully decrypted with {label}: {result[:50]}..."
            )
            return result

    # If all attempts fail, return the original
    logger.debug(
        f"All decryption attempts failed for: {ciphertext_b64[:20]}..."
    )
    return ciphertext_b64


def _candidate_passwords(password):
    """Return ordered ``(label, password)`` pairs for ``decrypt`` to try."""
    candidates = []
    if isinstance(password, str):
        candidates.append(("standard method", password))

        # Alternative method - try using just the first part of the password
        if len(password) > 30:
            words = password.split()
            # A whitespace-only password has no first word; a single-word
            # password would only repeat the standard attempt.
            if words and words[0] != password:
                candidates.append(("alternate method 1", words[0]))

        # Alternative method 2 - try using the GUID part
        if "GUID" in password:
            candidates.append(
                ("GUID method", password.split("GUID")[1].strip())
            )

    # Alternative method 3 - hardcoded key for BrowseComp
    candidates.append(("hardcoded key", "MHGGF2022!"))
    return candidates


def _looks_like_plaintext(text):
    """Heuristic: the first 50 chars are printable ASCII and include a space."""
    head = text[:50]
    return all(32 <= ord(c) < 127 for c in head) and " " in head
def get_known_answer_map() -> Dict[str, str]:
    """Return the lookup table of manually verified decryptions.

    Some encrypted strings could not be decrypted automatically; this
    catalog pairs each such string with its verified plaintext value.
    A new dictionary is built on every call.

    Returns:
        Dictionary mapping encrypted strings to their plaintext values.
    """
    known_pairs = (
        ("dFoTn+K+bcdyWg==", "Tooth Rock"),
        ("ERFIwA==", "1945"),
        # Add more mappings as they are discovered during benchmark runs
    )
    return dict(known_pairs)