Coverage for src / local_deep_research / benchmarks / datasets / xbench_deepsearch.py: 14%

99 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-02-25 01:07 +0000

1""" 

2xbench-DeepSearch dataset implementation. 

3 

4This module provides a class for the xbench-DeepSearch benchmark dataset, 

5which evaluates deep research and search capabilities. 

6""" 

7 

import base64
import random
from typing import Any, Dict, List, Optional

from loguru import logger

from .base import BenchmarkDataset

12 

13 

class XBenchDeepSearchDataset(BenchmarkDataset):
    """xbench-DeepSearch benchmark dataset for deep research evaluation.

    The upstream dataset stores ``prompt`` and ``answer`` XOR-encrypted with a
    per-item "canary" key and base64-encoded; this class transparently
    decrypts those fields on load.
    """

    # Legal characters of a base64 payload; used as a cheap heuristic to
    # decide whether a field is still encrypted (see _looks_base64).
    _BASE64_CHARS = frozenset(
        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/="
    )

    @staticmethod
    def xor_decrypt(data: bytes, key: str) -> bytes:
        """XOR-decrypt *data* with the UTF-8 bytes of *key* (key repeats).

        Args:
            data: Ciphertext bytes.
            key: Decryption key; must be non-empty when *data* is non-empty.

        Returns:
            The decrypted bytes.
        """
        key_bytes = key.encode("utf-8")
        key_length = len(key_bytes)
        return bytes(
            byte ^ key_bytes[i % key_length] for i, byte in enumerate(data)
        )

    @classmethod
    def get_dataset_info(cls) -> Dict[str, str]:
        """Get basic information about the dataset."""
        return {
            "id": "xbench_deepsearch",
            "name": "xbench-DeepSearch",
            "description": "Deep research and search capability evaluation (100 questions)",
            "url": "https://huggingface.co/datasets/xbench/DeepSearch",
        }

    @classmethod
    def get_default_dataset_path(cls) -> str:
        """Get the default path for the dataset."""
        return "xbench/DeepSearch"  # Hugging Face dataset identifier

    @classmethod
    def _looks_base64(cls, text: str) -> bool:
        """Heuristic: non-empty and the first 100 chars are all base64 chars."""
        return bool(text) and all(c in cls._BASE64_CHARS for c in text[:100])

    @classmethod
    def _decrypt_field(cls, value: str, canary: str) -> str:
        """Decrypt one base64+XOR field; return it unchanged if not encrypted."""
        if not cls._looks_base64(value):
            return value
        return cls.xor_decrypt(base64.b64decode(value), canary).decode("utf-8")

    def _format_item(self, item: Any, index: int) -> Dict[str, Any]:
        """Normalize one raw record into our benchmark schema.

        Works for both ``datasets`` dict items and pandas rows, since both
        expose ``.get``. Decryption failures are logged and the raw field
        values are kept (best-effort, matching a warning-only policy).

        Args:
            item: Raw record with ``prompt``/``answer``/``canary`` fields.
            index: Fallback ordinal used when the record has no ``id``.

        Returns:
            Dict with ``id``, ``problem``, ``answer``, ``reference_steps``,
            ``canary`` keys.
        """
        canary = item.get("canary", "")
        prompt = item.get("prompt", "")
        answer = item.get("answer", "")
        try:
            prompt = self._decrypt_field(prompt, canary)
            answer = self._decrypt_field(answer, canary)
        except Exception as e:
            logger.warning(f"Failed to decrypt item: {e}")
        return {
            "id": item.get("id", f"xbench_{index}"),
            "problem": prompt,
            "answer": answer,
            "reference_steps": item.get("reference_steps", ""),
            "canary": canary,
        }

    @staticmethod
    def _sample(
        items: List[Dict[str, Any]],
        num_examples: Optional[int],
        seed: Optional[int],
    ) -> List[Dict[str, Any]]:
        """Reproducibly sample *num_examples* items (no-op if limit not hit).

        Uses a local ``random.Random`` so the process-global RNG state is
        never mutated; the sampled output for a given seed is identical to
        the previous ``random.seed``-based behavior.
        """
        if not num_examples or num_examples >= len(items):
            return items
        # Security: seeded random for reproducible benchmark sampling, not security-sensitive
        return random.Random(seed).sample(items, num_examples)

    def load(
        self,
        dataset_path: Optional[str] = None,
        num_examples: Optional[int] = None,
        seed: int = 42,
    ) -> List[Dict[str, Any]]:
        """Override load to handle HuggingFace datasets directly.

        Args:
            dataset_path: Path to dataset (defaults to HuggingFace)
            num_examples: Optional number of examples to limit
            seed: Random seed for sampling

        Returns:
            List of processed dataset examples
        """
        data = self.load_data(dataset_path)
        data = self._sample(data, num_examples, seed)
        return [self.process_example(item) for item in data]

    def load_data(
        self,
        dataset_path: Optional[str] = None,
        num_examples: Optional[int] = None,
        seed: Optional[int] = None,
    ) -> List[Dict[str, Any]]:
        """Load the xbench-DeepSearch dataset from Hugging Face.

        Args:
            dataset_path: Path to dataset (defaults to Hugging Face)
            num_examples: Optional number of examples to limit
            seed: Random seed for sampling

        Returns:
            List of questions from xbench-DeepSearch
        """
        try:
            from datasets import load_dataset
        except ImportError:
            logger.exception(
                "datasets library not installed. Run: pip install datasets"
            )
            # Fallback to direct download
            return self._load_from_url(num_examples, seed)

        dataset_path = dataset_path or self.get_default_dataset_path()

        try:
            logger.info(
                f"Loading xbench-DeepSearch dataset from {dataset_path}"
            )

            # Load the dataset from Hugging Face (no authentication needed)
            dataset = load_dataset(dataset_path, split="train")

            # Decrypt and map every record into our benchmark schema
            formatted_questions = [
                self._format_item(item, index)
                for index, item in enumerate(dataset)
            ]

            # Apply sampling if num_examples is specified
            if num_examples and num_examples < len(formatted_questions):
                formatted_questions = self._sample(
                    formatted_questions, num_examples, seed
                )
                logger.info(
                    f"Sampled {num_examples} questions from xbench-DeepSearch (total available: {len(dataset)})"
                )
            else:
                logger.info(
                    f"Loaded {len(formatted_questions)} questions from xbench-DeepSearch"
                )
            return formatted_questions

        except Exception as e:
            logger.warning(f"Failed to load via datasets library: {e}")
            logger.info("Falling back to direct download")
            return self._load_from_url(num_examples, seed)

    def _load_from_url(
        self, num_examples: Optional[int] = None, seed: Optional[int] = None
    ) -> List[Dict[str, Any]]:
        """Load dataset directly from URL without datasets library.

        Args:
            num_examples: Optional number of examples to limit
            seed: Random seed for sampling

        Returns:
            List of questions from xbench-DeepSearch (empty list on failure)
        """
        import pandas as pd

        try:
            # Direct URL to the parquet file on Hugging Face
            url = "https://huggingface.co/datasets/xbench/DeepSearch/resolve/main/data/train-00000-of-00001.parquet"

            logger.info(f"Downloading xbench-DeepSearch from {url}")
            df = pd.read_parquet(url)

            # Decrypt and map every row into our benchmark schema
            questions = [
                self._format_item(row, index)
                for index, (_, row) in enumerate(df.iterrows())
            ]

            # Apply sampling if num_examples is specified
            if num_examples and num_examples < len(questions):
                # Capture the true total BEFORE sampling; the previous code
                # logged len(questions) after sampling, reporting num_examples
                # as the total.
                total_available = len(questions)
                questions = self._sample(questions, num_examples, seed)
                logger.info(
                    f"Sampled {num_examples} questions via direct download (total available: {total_available})"
                )
            else:
                logger.info(
                    f"Loaded {len(questions)} questions via direct download"
                )
            return questions

        except Exception:
            logger.exception("Failed to load dataset")
            return []

    def process_example(self, example: Dict[str, Any]) -> Dict[str, Any]:
        """Process a single example from the dataset.

        xbench-DeepSearch questions are designed for deep research evaluation.
        Returns a shallow copy of *example* augmented with evaluation metadata.
        """
        processed = dict(example)

        # Add evaluation metadata
        processed["requires_deep_search"] = True
        processed["expected_iterations"] = (
            4  # Deep search questions need multiple iterations
        )

        # Evaluation criteria for research questions (weights sum to 1.0)
        processed["evaluation_criteria"] = {
            "accuracy": 0.4,
            "completeness": 0.3,
            "reasoning": 0.2,
            "sources": 0.1,  # Credit for citing sources
        }

        return processed