Coverage for src/local_deep_research/benchmarks/datasets/xbench_deepsearch.py: 14%
99 statements
« prev ^ index » next — coverage.py v7.12.0, created at 2026-01-11 00:51 +0000
1"""
2xbench-DeepSearch dataset implementation.
4This module provides a class for the xbench-DeepSearch benchmark dataset,
5which evaluates deep research and search capabilities.
6"""
8import base64
9from typing import Any, Dict, List, Optional
10from loguru import logger
11from .base import BenchmarkDataset
class XBenchDeepSearchDataset(BenchmarkDataset):
    """xbench-DeepSearch benchmark dataset for deep research evaluation.

    The upstream dataset XOR-encrypts each question/answer with a per-item
    "canary" key and base64-encodes the result (to avoid training-data
    contamination); records are decrypted transparently on load.
    """

    # Characters allowed in standard base64 output; used to decide whether a
    # field still looks like encrypted ciphertext.
    _BASE64_CHARS = frozenset(
        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/="
    )

    @staticmethod
    def xor_decrypt(data: bytes, key: str) -> bytes:
        """XOR-decrypt ``data`` with the repeating UTF-8 bytes of ``key``.

        Args:
            data: Ciphertext bytes.
            key: Decryption key; an empty key returns ``data`` unchanged
                (previously this raised ``ZeroDivisionError`` via ``i % 0``).

        Returns:
            The decrypted bytes.
        """
        key_bytes = key.encode("utf-8")
        if not key_bytes:
            return data
        key_length = len(key_bytes)
        return bytes(
            byte ^ key_bytes[i % key_length] for i, byte in enumerate(data)
        )

    @classmethod
    def get_dataset_info(cls) -> Dict[str, str]:
        """Get basic information about the dataset."""
        return {
            "id": "xbench_deepsearch",
            "name": "xbench-DeepSearch",
            "description": "Deep research and search capability evaluation (100 questions)",
            "url": "https://huggingface.co/datasets/xbench/DeepSearch",
        }

    @classmethod
    def get_default_dataset_path(cls) -> str:
        """Get the default path for the dataset."""
        return "xbench/DeepSearch"  # Hugging Face dataset identifier

    @classmethod
    def _maybe_decrypt(cls, value: str, key: str) -> str:
        """Decrypt ``value`` if it looks like base64-encoded ciphertext.

        A value whose first 100 characters are all valid base64 characters is
        treated as encrypted; anything else is returned unchanged. Decryption
        failures are logged and the original value is returned so that one
        bad record does not abort the whole load.
        """
        if not value or not all(c in cls._BASE64_CHARS for c in value[:100]):
            return value
        try:
            return cls.xor_decrypt(base64.b64decode(value), key).decode(
                "utf-8"
            )
        except Exception as e:
            logger.warning(f"Failed to decrypt item: {e}")
            return value

    def _format_item(self, item: Any, index: int) -> Dict[str, Any]:
        """Decrypt and normalize one raw record (dict or pandas row).

        Args:
            item: Mapping-like record exposing ``.get`` (HF dataset item or
                pandas Series).
            index: Position of the record, used for the fallback id.
        """
        canary = item.get("canary", "")
        return {
            "id": item.get("id", f"xbench_{index}"),
            "problem": self._maybe_decrypt(item.get("prompt", ""), canary),
            "answer": self._maybe_decrypt(item.get("answer", ""), canary),
            "reference_steps": item.get("reference_steps", ""),
            "canary": canary,
        }

    @staticmethod
    def _sample(
        questions: List[Dict[str, Any]],
        num_examples: Optional[int],
        seed: Optional[int],
        source: str,
    ) -> List[Dict[str, Any]]:
        """Optionally down-sample ``questions`` and log what happened.

        The total is captured *before* sampling so the "total available" log
        message reports the real dataset size (a previous version logged the
        post-sample length in the direct-download path).
        """
        total = len(questions)
        if num_examples and num_examples < total:
            import random

            # Use the provided seed for reproducibility; None leaves the RNG
            # state untouched for truly random sampling.
            if seed is not None:
                random.seed(seed)
            questions = random.sample(questions, num_examples)
            logger.info(
                f"Sampled {num_examples} questions {source} (total available: {total})"
            )
        else:
            logger.info(f"Loaded {total} questions {source}")
        return questions

    def load(
        self,
        dataset_path: Optional[str] = None,
        num_examples: Optional[int] = None,
        seed: int = 42,
    ) -> List[Dict[str, Any]]:
        """Override load to handle HuggingFace datasets directly.

        Args:
            dataset_path: Path to dataset (defaults to HuggingFace)
            num_examples: Optional number of examples to limit
            seed: Random seed for sampling

        Returns:
            List of processed dataset examples
        """
        import random

        # Load the data
        data = self.load_data(dataset_path)

        # Sample if requested
        if num_examples and len(data) > num_examples:
            random.seed(seed)
            data = random.sample(data, num_examples)

        # Process each example
        return [self.process_example(item) for item in data]

    def load_data(
        self,
        dataset_path: Optional[str] = None,
        num_examples: Optional[int] = None,
        seed: Optional[int] = None,
    ) -> List[Dict[str, Any]]:
        """Load the xbench-DeepSearch dataset from Hugging Face.

        Args:
            dataset_path: Path to dataset (defaults to Hugging Face)
            num_examples: Optional number of examples to limit
            seed: Random seed for sampling

        Returns:
            List of questions from xbench-DeepSearch
        """
        try:
            from datasets import load_dataset
        except ImportError:
            logger.exception(
                "datasets library not installed. Run: pip install datasets"
            )
            # Fallback to direct download
            return self._load_from_url(num_examples, seed)

        dataset_path = dataset_path or self.get_default_dataset_path()

        try:
            logger.info(
                f"Loading xbench-DeepSearch dataset from {dataset_path}"
            )

            # Load the dataset from Hugging Face (no authentication needed)
            dataset = load_dataset(dataset_path, split="train")

            # Decrypt and format for our benchmark system
            formatted_questions = [
                self._format_item(item, i) for i, item in enumerate(dataset)
            ]

            return self._sample(
                formatted_questions,
                num_examples,
                seed,
                "from xbench-DeepSearch",
            )

        except Exception as e:
            logger.warning(f"Failed to load via datasets library: {e}")
            logger.info("Falling back to direct download")
            return self._load_from_url(num_examples, seed)

    def _load_from_url(
        self,
        num_examples: Optional[int] = None,
        seed: Optional[int] = None,
    ) -> List[Dict[str, Any]]:
        """Load dataset directly from URL without datasets library.

        Args:
            num_examples: Optional number of examples to limit
            seed: Random seed for sampling

        Returns:
            List of questions from xbench-DeepSearch (empty list on failure)
        """
        import pandas as pd

        try:
            # Direct URL to the Parquet shard on Hugging Face
            url = "https://huggingface.co/datasets/xbench/DeepSearch/resolve/main/data/train-00000-of-00001.parquet"

            logger.info(f"Downloading xbench-DeepSearch from {url}")
            df = pd.read_parquet(url)

            # Decrypt and format each row
            questions = [
                self._format_item(row, i)
                for i, (_, row) in enumerate(df.iterrows())
            ]

            return self._sample(
                questions, num_examples, seed, "via direct download"
            )

        except Exception:
            logger.exception("Failed to load dataset")
            return []

    def process_example(self, example: Dict[str, Any]) -> Dict[str, Any]:
        """Process a single example from the dataset.

        xbench-DeepSearch questions are designed for deep research evaluation.
        """
        processed = dict(example)

        # Add evaluation metadata
        processed["requires_deep_search"] = True
        processed["expected_iterations"] = (
            4  # Deep search questions need multiple iterations
        )

        # Evaluation criteria for research questions
        processed["evaluation_criteria"] = {
            "accuracy": 0.4,
            "completeness": 0.3,
            "reasoning": 0.2,
            "sources": 0.1,  # Credit for citing sources
        }

        return processed