Coverage for src/local_deep_research/benchmarks/datasets/xbench_deepsearch.py: 14%

99 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

1""" 

2xbench-DeepSearch dataset implementation. 

3 

4This module provides a class for the xbench-DeepSearch benchmark dataset, 

5which evaluates deep research and search capabilities. 

6""" 

7 

8import base64 

9from typing import Any, Dict, List, Optional 

10from loguru import logger 

11from .base import BenchmarkDataset 

12 

13 

class XBenchDeepSearchDataset(BenchmarkDataset):
    """xbench-DeepSearch benchmark dataset for deep research evaluation.

    The upstream dataset ships its ``prompt`` and ``answer`` fields
    XOR-encrypted with a per-item "canary" key and then base64-encoded;
    this class transparently decrypts them while loading.
    """

    # Alphabet used to heuristically detect base64-encoded (encrypted) fields.
    _BASE64_ALPHABET = frozenset(
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        "abcdefghijklmnopqrstuvwxyz"
        "0123456789+/="
    )

    @staticmethod
    def xor_decrypt(data: bytes, key: str) -> bytes:
        """XOR decrypt ``data`` with ``key``.

        Args:
            data: Encrypted bytes.
            key: UTF-8 key string, cycled over the data.

        Returns:
            Decrypted bytes. An empty key returns ``data`` unchanged
            (previously an empty key raised ZeroDivisionError via ``i % 0``).
        """
        key_bytes = key.encode("utf-8")
        if not key_bytes:
            return data
        key_length = len(key_bytes)
        return bytes(
            byte ^ key_bytes[i % key_length] for i, byte in enumerate(data)
        )

    @classmethod
    def get_dataset_info(cls) -> Dict[str, str]:
        """Get basic information about the dataset."""
        return {
            "id": "xbench_deepsearch",
            "name": "xbench-DeepSearch",
            "description": "Deep research and search capability evaluation (100 questions)",
            "url": "https://huggingface.co/datasets/xbench/DeepSearch",
        }

    @classmethod
    def get_default_dataset_path(cls) -> str:
        """Get the default path for the dataset."""
        return "xbench/DeepSearch"  # Hugging Face dataset identifier

    @classmethod
    def _maybe_decrypt(cls, value: str, key: str) -> str:
        """Decrypt ``value`` if it looks base64-encoded, else return it as-is.

        Only the first 100 characters are inspected — the same heuristic the
        original inline code used to detect encrypted fields.
        """
        if value and all(c in cls._BASE64_ALPHABET for c in value[:100]):
            return cls.xor_decrypt(base64.b64decode(value), key).decode(
                "utf-8"
            )
        return value

    @classmethod
    def _decrypt_fields(cls, prompt: str, answer: str, canary: str) -> tuple:
        """Best-effort decryption of ``(prompt, answer)`` with ``canary``.

        On any failure the values decrypted so far are kept and a warning is
        logged; this helper never raises, matching the original inline
        try/except blocks.
        """
        try:
            prompt = cls._maybe_decrypt(prompt, canary)
            answer = cls._maybe_decrypt(answer, canary)
        except Exception as e:
            logger.warning(f"Failed to decrypt item: {e}")
        return prompt, answer

    @classmethod
    def _format_item(cls, item: Any, index: int) -> Dict[str, Any]:
        """Convert one raw record (HF dict or pandas row) to our schema.

        ``item`` only needs a dict-like ``.get`` — both plain dicts and
        ``pandas.Series`` rows satisfy that.
        """
        canary = item.get("canary", "")
        prompt, answer = cls._decrypt_fields(
            item.get("prompt", ""), item.get("answer", ""), canary
        )
        return {
            "id": item.get("id", f"xbench_{index}"),
            "problem": prompt,
            "answer": answer,
            "reference_steps": item.get("reference_steps", ""),
            "canary": canary,
        }

    @staticmethod
    def _sample(
        items: List[Dict[str, Any]],
        num_examples: Optional[int],
        seed: Optional[int],
    ) -> List[Dict[str, Any]]:
        """Randomly sample ``num_examples`` items; seed only when given."""
        import random

        if num_examples and num_examples < len(items):
            if seed is not None:
                random.seed(seed)
            return random.sample(items, num_examples)
        return items

    def load(
        self,
        dataset_path: Optional[str] = None,
        num_examples: Optional[int] = None,
        seed: int = 42,
    ) -> List[Dict[str, Any]]:
        """Override load to handle HuggingFace datasets directly.

        Args:
            dataset_path: Path to dataset (defaults to HuggingFace)
            num_examples: Optional number of examples to limit
            seed: Random seed for sampling (None leaves the RNG unseeded)

        Returns:
            List of processed dataset examples
        """
        data = self.load_data(dataset_path)
        data = self._sample(data, num_examples, seed)
        return [self.process_example(item) for item in data]

    def load_data(
        self,
        dataset_path: Optional[str] = None,
        num_examples: Optional[int] = None,
        seed: Optional[int] = None,
    ) -> List[Dict[str, Any]]:
        """Load the xbench-DeepSearch dataset from Hugging Face.

        Args:
            dataset_path: Path to dataset (defaults to Hugging Face)
            num_examples: Optional number of examples to limit
            seed: Random seed for sampling

        Returns:
            List of questions from xbench-DeepSearch
        """
        try:
            from datasets import load_dataset
        except ImportError:
            logger.exception(
                "datasets library not installed. Run: pip install datasets"
            )
            # Fallback to direct download
            return self._load_from_url(num_examples, seed)

        dataset_path = dataset_path or self.get_default_dataset_path()

        try:
            logger.info(
                f"Loading xbench-DeepSearch dataset from {dataset_path}"
            )

            # Load the dataset from Hugging Face (no authentication needed)
            dataset = load_dataset(dataset_path, split="train")

            # Format for our benchmark system and decrypt
            formatted_questions = [
                self._format_item(item, i) for i, item in enumerate(dataset)
            ]

            total = len(formatted_questions)
            formatted_questions = self._sample(
                formatted_questions, num_examples, seed
            )
            if len(formatted_questions) < total:
                logger.info(
                    f"Sampled {num_examples} questions from xbench-DeepSearch (total available: {len(dataset)})"
                )
            else:
                logger.info(
                    f"Loaded {total} questions from xbench-DeepSearch"
                )
            return formatted_questions

        except Exception as e:
            logger.warning(f"Failed to load via datasets library: {e}")
            logger.info("Falling back to direct download")
            return self._load_from_url(num_examples, seed)

    def _load_from_url(
        self,
        num_examples: Optional[int] = None,
        seed: Optional[int] = None,
    ) -> List[Dict[str, Any]]:
        """Load dataset directly from URL without the datasets library.

        Args:
            num_examples: Optional number of examples to limit
            seed: Random seed for sampling

        Returns:
            List of questions from xbench-DeepSearch, or [] on failure.
        """
        import pandas as pd

        try:
            # Direct URL to the parquet file on Hugging Face
            url = "https://huggingface.co/datasets/xbench/DeepSearch/resolve/main/data/train-00000-of-00001.parquet"

            logger.info(f"Downloading xbench-DeepSearch from {url}")
            df = pd.read_parquet(url)

            # Convert to list of dicts and decrypt
            questions = [
                self._format_item(row, i)
                for i, (_, row) in enumerate(df.iterrows())
            ]

            # Capture the total BEFORE sampling — the original logged
            # len(questions) after random.sample had replaced the list,
            # so "total available" always equaled num_examples.
            total = len(questions)
            questions = self._sample(questions, num_examples, seed)
            if len(questions) < total:
                logger.info(
                    f"Sampled {num_examples} questions via direct download (total available: {total})"
                )
            else:
                logger.info(
                    f"Loaded {total} questions via direct download"
                )
            return questions

        except Exception:
            logger.exception("Failed to load dataset")
            return []

    def process_example(self, example: Dict[str, Any]) -> Dict[str, Any]:
        """Process a single example from the dataset.

        xbench-DeepSearch questions are designed for deep research
        evaluation, so each example is tagged with search/iteration
        metadata and weighted evaluation criteria.
        """
        processed = dict(example)

        # Add evaluation metadata
        processed["requires_deep_search"] = True
        processed["expected_iterations"] = (
            4  # Deep search questions need multiple iterations
        )

        # Evaluation criteria for research questions (weights sum to 1.0)
        processed["evaluation_criteria"] = {
            "accuracy": 0.4,
            "completeness": 0.3,
            "reasoning": 0.2,
            "sources": 0.1,  # Credit for citing sources
        }

        return processed