Coverage for src / local_deep_research / benchmarks / datasets / xbench_deepsearch.py: 14%
99 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-25 01:07 +0000
1"""
2xbench-DeepSearch dataset implementation.
4This module provides a class for the xbench-DeepSearch benchmark dataset,
5which evaluates deep research and search capabilities.
6"""
8import base64
9from typing import Any, Dict, List, Optional
10from loguru import logger
11from .base import BenchmarkDataset
class XBenchDeepSearchDataset(BenchmarkDataset):
    """xbench-DeepSearch benchmark dataset for deep research evaluation.

    Records published on Hugging Face store ``prompt`` and ``answer``
    XOR-encrypted with a per-item "canary" key and then base64-encoded;
    this class transparently decrypts them while loading.
    """

    # Alphabet used to sniff whether a field still looks base64-encoded
    # (i.e. encrypted) before attempting decryption.
    _BASE64_CHARS = frozenset(
        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/="
    )

    @staticmethod
    def xor_decrypt(data: bytes, key: str) -> bytes:
        """XOR decrypt data with a key.

        Args:
            data: Encrypted bytes.
            key: Repeating XOR key (the dataset's "canary" string).

        Returns:
            Decrypted bytes. An empty key returns ``data`` unchanged
            (avoids a ZeroDivisionError from ``i % 0``).
        """
        key_bytes = key.encode("utf-8")
        key_length = len(key_bytes)
        if key_length == 0:
            return data
        return bytes(
            data[i] ^ key_bytes[i % key_length] for i in range(len(data))
        )

    @classmethod
    def _maybe_decrypt(cls, value: str, canary: str) -> str:
        """Decrypt a dataset field if it looks base64-encoded.

        Plain-text fields are passed through unchanged, as is anything
        that fails to decode/decrypt (a warning is logged instead of
        raising, matching the original best-effort behavior).
        """
        try:
            if not value or not all(
                c in cls._BASE64_CHARS for c in value[:100]
            ):
                return value
            return cls.xor_decrypt(base64.b64decode(value), canary).decode(
                "utf-8"
            )
        except Exception as e:
            logger.warning(f"Failed to decrypt item: {e}")
            return value

    @classmethod
    def _format_item(cls, item: Any, index: int) -> Dict[str, Any]:
        """Map one raw record (dict or pandas Series) to the benchmark schema.

        Args:
            item: Dict-like record exposing ``.get``.
            index: Position of the record, used for a fallback id.
        """
        canary = item.get("canary", "")
        return {
            "id": item.get("id", f"xbench_{index}"),
            "problem": cls._maybe_decrypt(item.get("prompt", ""), canary),
            "answer": cls._maybe_decrypt(item.get("answer", ""), canary),
            "reference_steps": item.get("reference_steps", ""),
            "canary": canary,
        }

    @classmethod
    def get_dataset_info(cls) -> Dict[str, str]:
        """Get basic information about the dataset."""
        return {
            "id": "xbench_deepsearch",
            "name": "xbench-DeepSearch",
            "description": "Deep research and search capability evaluation (100 questions)",
            "url": "https://huggingface.co/datasets/xbench/DeepSearch",
        }

    @classmethod
    def get_default_dataset_path(cls) -> str:
        """Get the default path for the dataset."""
        return "xbench/DeepSearch"  # Hugging Face dataset identifier

    def load(
        self,
        dataset_path: Optional[str] = None,
        num_examples: Optional[int] = None,
        seed: int = 42,
    ) -> List[Dict[str, Any]]:
        """Override load to handle HuggingFace datasets directly.

        Args:
            dataset_path: Path to dataset (defaults to HuggingFace)
            num_examples: Optional number of examples to limit
            seed: Random seed for sampling

        Returns:
            List of processed dataset examples
        """
        import random

        # Load the data
        data = self.load_data(dataset_path)

        # Sample if requested.
        # Security: seeded random for reproducible benchmark sampling,
        # not security-sensitive. A local Random avoids mutating the
        # global RNG state.
        if num_examples and len(data) > num_examples:
            data = random.Random(seed).sample(data, num_examples)

        # Process each example
        return [self.process_example(item) for item in data]

    def load_data(
        self,
        dataset_path: Optional[str] = None,
        num_examples: Optional[int] = None,
        seed: Optional[int] = None,
    ) -> List[Dict[str, Any]]:
        """Load the xbench-DeepSearch dataset from Hugging Face.

        Args:
            dataset_path: Path to dataset (defaults to Hugging Face)
            num_examples: Optional number of examples to limit
            seed: Random seed for sampling

        Returns:
            List of questions from xbench-DeepSearch
        """
        try:
            from datasets import load_dataset
        except ImportError:
            logger.exception(
                "datasets library not installed. Run: pip install datasets"
            )
            # Fallback to direct download
            return self._load_from_url(num_examples, seed)

        dataset_path = dataset_path or self.get_default_dataset_path()

        try:
            logger.info(
                f"Loading xbench-DeepSearch dataset from {dataset_path}"
            )

            # Load the dataset from Hugging Face (no authentication needed)
            dataset = load_dataset(dataset_path, split="train")

            # Format for our benchmark system and decrypt.
            formatted_questions = [
                self._format_item(item, idx)
                for idx, item in enumerate(dataset)
            ]

            # Apply sampling if num_examples is specified.
            # Security: seeded random for reproducible benchmark sampling,
            # not security-sensitive.
            total = len(formatted_questions)
            if num_examples and num_examples < total:
                import random

                formatted_questions = random.Random(seed).sample(
                    formatted_questions, num_examples
                )
                logger.info(
                    f"Sampled {num_examples} questions from xbench-DeepSearch (total available: {total})"
                )
            else:
                logger.info(
                    f"Loaded {total} questions from xbench-DeepSearch"
                )
            return formatted_questions

        except Exception as e:
            logger.warning(f"Failed to load via datasets library: {e}")
            logger.info("Falling back to direct download")
            return self._load_from_url(num_examples, seed)

    def _load_from_url(
        self,
        num_examples: Optional[int] = None,
        seed: Optional[int] = None,
    ) -> List[Dict[str, Any]]:
        """Load dataset directly from URL without datasets library.

        Args:
            num_examples: Optional number of examples to limit
            seed: Random seed for sampling

        Returns:
            List of questions from xbench-DeepSearch
        """
        import pandas as pd

        try:
            # Direct URL to the parquet file on Hugging Face
            url = "https://huggingface.co/datasets/xbench/DeepSearch/resolve/main/data/train-00000-of-00001.parquet"

            logger.info(f"Downloading xbench-DeepSearch from {url}")
            df = pd.read_parquet(url)

            # Convert to list of dicts and decrypt.
            questions = [
                self._format_item(row, idx)
                for idx, (_, row) in enumerate(df.iterrows())
            ]

            # Apply sampling if num_examples is specified.
            # Security: seeded random for reproducible benchmark sampling,
            # not security-sensitive. Capture the total BEFORE sampling so
            # the log reports the true available count.
            total = len(questions)
            if num_examples and num_examples < total:
                import random

                questions = random.Random(seed).sample(
                    questions, num_examples
                )
                logger.info(
                    f"Sampled {num_examples} questions via direct download (total available: {total})"
                )
            else:
                logger.info(
                    f"Loaded {total} questions via direct download"
                )
            return questions

        except Exception:
            logger.exception("Failed to load dataset")
            return []

    def process_example(self, example: Dict[str, Any]) -> Dict[str, Any]:
        """Process a single example from the dataset.

        xbench-DeepSearch questions are designed for deep research evaluation.
        """
        processed = dict(example)

        # Add evaluation metadata
        processed["requires_deep_search"] = True
        processed["expected_iterations"] = (
            4  # Deep search questions need multiple iterations
        )

        # Evaluation criteria for research questions
        processed["evaluation_criteria"] = {
            "accuracy": 0.4,
            "completeness": 0.3,
            "reasoning": 0.2,
            "sources": 0.1,  # Credit for citing sources
        }

        return processed