Coverage for src/local_deep_research/benchmarks/datasets/xbench_deepsearch.py: 14%

99 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

1""" 

2xbench-DeepSearch dataset implementation. 

3 

4This module provides a class for the xbench-DeepSearch benchmark dataset, 

5which evaluates deep research and search capabilities. 

6""" 

7 

8import base64 

9from typing import Any, Dict, List, Optional 

10from loguru import logger 

11from .base import BenchmarkDataset 

12 

13 

class XBenchDeepSearchDataset(BenchmarkDataset):
    """xbench-DeepSearch benchmark dataset for deep research evaluation.

    The upstream dataset ships its ``prompt`` and ``answer`` fields
    XOR-encrypted with a per-item "canary" key and then base64-encoded;
    this class transparently decrypts them while loading.
    """

    # Alphabet used to heuristically detect base64-encoded (encrypted) fields.
    _BASE64_ALPHABET = frozenset(
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        "abcdefghijklmnopqrstuvwxyz"
        "0123456789+/="
    )

    @staticmethod
    def xor_decrypt(data: bytes, key: str) -> bytes:
        """XOR decrypt ``data`` with ``key``.

        Args:
            data: Encrypted bytes.
            key: UTF-8 key string, cycled over the data.

        Returns:
            Decrypted bytes. An empty key returns ``data`` unchanged
            (previously an empty key raised ZeroDivisionError via ``i % 0``).
        """
        key_bytes = key.encode("utf-8")
        if not key_bytes:
            return data
        key_length = len(key_bytes)
        return bytes(
            byte ^ key_bytes[i % key_length] for i, byte in enumerate(data)
        )

    @classmethod
    def get_dataset_info(cls) -> Dict[str, str]:
        """Get basic information about the dataset."""
        return {
            "id": "xbench_deepsearch",
            "name": "xbench-DeepSearch",
            "description": "Deep research and search capability evaluation (100 questions)",
            "url": "https://huggingface.co/datasets/xbench/DeepSearch",
        }

    @classmethod
    def get_default_dataset_path(cls) -> str:
        """Get the default path for the dataset."""
        return "xbench/DeepSearch"  # Hugging Face dataset identifier

    @classmethod
    def _maybe_decrypt(cls, value: str, key: str) -> str:
        """Decrypt ``value`` if it looks base64-encoded, else return it as-is.

        Only the first 100 characters are inspected — the same heuristic the
        original inline code used to detect encrypted fields.
        """
        if value and all(c in cls._BASE64_ALPHABET for c in value[:100]):
            return cls.xor_decrypt(base64.b64decode(value), key).decode(
                "utf-8"
            )
        return value

    @classmethod
    def _decrypt_fields(cls, prompt: str, answer: str, canary: str) -> tuple:
        """Best-effort decryption of ``(prompt, answer)`` with ``canary``.

        On any failure the values decrypted so far are kept and a warning is
        logged; this helper never raises, matching the original inline
        try/except blocks.
        """
        try:
            prompt = cls._maybe_decrypt(prompt, canary)
            answer = cls._maybe_decrypt(answer, canary)
        except Exception as e:
            logger.warning(f"Failed to decrypt item: {e}")
        return prompt, answer

    @classmethod
    def _format_item(cls, item: Any, index: int) -> Dict[str, Any]:
        """Convert one raw record (HF dict or pandas row) to our schema.

        ``item`` only needs a dict-like ``.get`` — both plain dicts and
        ``pandas.Series`` rows satisfy that.
        """
        canary = item.get("canary", "")
        prompt, answer = cls._decrypt_fields(
            item.get("prompt", ""), item.get("answer", ""), canary
        )
        return {
            "id": item.get("id", f"xbench_{index}"),
            "problem": prompt,
            "answer": answer,
            "reference_steps": item.get("reference_steps", ""),
            "canary": canary,
        }

    @staticmethod
    def _sample(
        items: List[Dict[str, Any]],
        num_examples: Optional[int],
        seed: Optional[int],
    ) -> List[Dict[str, Any]]:
        """Randomly sample ``num_examples`` items; seed only when given."""
        import random

        if num_examples and num_examples < len(items):
            if seed is not None:
                random.seed(seed)
            return random.sample(items, num_examples)
        return items

    def load(
        self,
        dataset_path: Optional[str] = None,
        num_examples: Optional[int] = None,
        seed: int = 42,
    ) -> List[Dict[str, Any]]:
        """Override load to handle HuggingFace datasets directly.

        Args:
            dataset_path: Path to dataset (defaults to HuggingFace)
            num_examples: Optional number of examples to limit
            seed: Random seed for sampling (None leaves the RNG unseeded)

        Returns:
            List of processed dataset examples
        """
        data = self.load_data(dataset_path)
        data = self._sample(data, num_examples, seed)
        return [self.process_example(item) for item in data]

    def load_data(
        self,
        dataset_path: Optional[str] = None,
        num_examples: Optional[int] = None,
        seed: Optional[int] = None,
    ) -> List[Dict[str, Any]]:
        """Load the xbench-DeepSearch dataset from Hugging Face.

        Args:
            dataset_path: Path to dataset (defaults to Hugging Face)
            num_examples: Optional number of examples to limit
            seed: Random seed for sampling

        Returns:
            List of questions from xbench-DeepSearch
        """
        try:
            from datasets import load_dataset
        except ImportError:
            logger.exception(
                "datasets library not installed. Run: pip install datasets"
            )
            # Fallback to direct download
            return self._load_from_url(num_examples, seed)

        dataset_path = dataset_path or self.get_default_dataset_path()

        try:
            logger.info(
                f"Loading xbench-DeepSearch dataset from {dataset_path}"
            )

            # Load the dataset from Hugging Face (no authentication needed)
            dataset = load_dataset(dataset_path, split="train")

            # Format for our benchmark system and decrypt
            formatted_questions = [
                self._format_item(item, i) for i, item in enumerate(dataset)
            ]

            total = len(formatted_questions)
            formatted_questions = self._sample(
                formatted_questions, num_examples, seed
            )
            if len(formatted_questions) < total:
                logger.info(
                    f"Sampled {num_examples} questions from xbench-DeepSearch (total available: {len(dataset)})"
                )
            else:
                logger.info(
                    f"Loaded {total} questions from xbench-DeepSearch"
                )
            return formatted_questions

        except Exception as e:
            logger.warning(f"Failed to load via datasets library: {e}")
            logger.info("Falling back to direct download")
            return self._load_from_url(num_examples, seed)

    def _load_from_url(
        self,
        num_examples: Optional[int] = None,
        seed: Optional[int] = None,
    ) -> List[Dict[str, Any]]:
        """Load dataset directly from URL without the datasets library.

        Args:
            num_examples: Optional number of examples to limit
            seed: Random seed for sampling

        Returns:
            List of questions from xbench-DeepSearch, or [] on failure.
        """
        import pandas as pd

        try:
            # Direct URL to the parquet file on Hugging Face
            url = "https://huggingface.co/datasets/xbench/DeepSearch/resolve/main/data/train-00000-of-00001.parquet"

            logger.info(f"Downloading xbench-DeepSearch from {url}")
            df = pd.read_parquet(url)

            # Convert to list of dicts and decrypt
            questions = [
                self._format_item(row, i)
                for i, (_, row) in enumerate(df.iterrows())
            ]

            # Capture the total BEFORE sampling — the original logged
            # len(questions) after random.sample had replaced the list,
            # so "total available" always equaled num_examples.
            total = len(questions)
            questions = self._sample(questions, num_examples, seed)
            if len(questions) < total:
                logger.info(
                    f"Sampled {num_examples} questions via direct download (total available: {total})"
                )
            else:
                logger.info(
                    f"Loaded {total} questions via direct download"
                )
            return questions

        except Exception:
            logger.exception("Failed to load dataset")
            return []

    def process_example(self, example: Dict[str, Any]) -> Dict[str, Any]:
        """Process a single example from the dataset.

        xbench-DeepSearch questions are designed for deep research
        evaluation, so each example is tagged with search/iteration
        metadata and weighted evaluation criteria.
        """
        processed = dict(example)

        # Add evaluation metadata
        processed["requires_deep_search"] = True
        processed["expected_iterations"] = (
            4  # Deep search questions need multiple iterations
        )

        # Evaluation criteria for research questions (weights sum to 1.0)
        processed["evaluation_criteria"] = {
            "accuracy": 0.4,
            "completeness": 0.3,
            "reasoning": 0.2,
            "sources": 0.1,  # Credit for citing sources
        }

        return processed