Coverage for src / local_deep_research / web_search_engines / engines / search_engine_zenodo.py: 97%
169 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
1"""Zenodo search engine for open research data and publications."""
3import html
4import re
5from typing import Any, Dict, List, Optional
7import requests
8from langchain_core.language_models import BaseLLM
9from loguru import logger
11from ...security.safe_requests import safe_get
12from ..rate_limiting import RateLimitError
13from ..search_engine_base import BaseSearchEngine
class ZenodoSearchEngine(BaseSearchEngine):
    """
    Zenodo search engine for open research data and publications.

    Provides access to millions of research outputs including datasets,
    software, publications, and more. No authentication required for search.

    The two-phase flow implemented here is:

    1. ``_get_previews`` queries ``/api/records`` and turns each hit into a
       lightweight preview dict (keeping the raw record under ``"_raw"``).
    2. ``_get_full_content`` expands the selected previews with the full
       description, files, references and a plain-text ``"content"`` summary.
    """

    # Engine classification flags.
    # NOTE(review): presumably consumed by BaseSearchEngine / the engine
    # registry, which are not visible in this file - confirm there.
    is_public = True
    is_generic = False
    is_scientific = True
    is_code = False
    is_lexical = True
    needs_llm_relevance_filter = True

    # Matches any HTML tag. Compiled once so it is not rebuilt per record.
    _HTML_TAG_RE = re.compile(r"<[^>]+>")

    def __init__(
        self,
        max_results: int = 10,
        resource_type: Optional[str] = None,
        access_right: Optional[str] = None,
        communities: Optional[str] = None,
        sort: str = "bestmatch",
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """
        Initialize the Zenodo search engine.

        Args:
            max_results: Maximum number of search results
            resource_type: Filter by type (dataset, software, publication, etc.)
            access_right: Filter by access (open, closed, embargoed, restricted)
            communities: Filter by Zenodo community
            sort: Sort order (bestmatch, mostrecent, -mostrecent)
            llm: Language model for relevance filtering
            max_filtered_results: Maximum results after filtering
            settings_snapshot: Settings snapshot for thread context
            **kwargs: Forwarded unchanged to ``BaseSearchEngine.__init__``
        """
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            settings_snapshot=settings_snapshot,
            **kwargs,
        )

        self.resource_type = resource_type
        self.access_right = access_right
        self.communities = communities
        self.sort = sort

        self.base_url = "https://zenodo.org"
        self.search_url = f"{self.base_url}/api/records"

        # User-Agent header for API requests
        self.headers = {
            "User-Agent": "LocalDeepResearch/1.0 (https://github.com/LearningCircuit/local-deep-research)"
        }

    @classmethod
    def _strip_html(cls, text: str) -> str:
        """Remove HTML tags from ``text``, then decode HTML entities."""
        return html.unescape(cls._HTML_TAG_RE.sub("", text))

    def _build_query_params(self, query: str) -> Dict[str, Any]:
        """
        Build query parameters for the API request.

        ``q``, ``size`` and ``sort`` are always sent; the optional filters
        (``type``, ``access_right``, ``communities``) are only added when the
        corresponding attribute is truthy.
        """
        params: Dict[str, Any] = {
            "q": query,
            "size": self.max_results,
            "sort": self.sort,
        }

        optional_filters = {
            "type": self.resource_type,
            "access_right": self.access_right,
            "communities": self.communities,
        }
        params.update(
            {key: value for key, value in optional_filters.items() if value}
        )
        return params

    def _parse_creators(self, creators: List[Dict]) -> List[str]:
        """
        Parse creator/author information.

        Only the first five creators are considered; entries without a
        non-empty ``name`` are skipped. ``None`` is treated as an empty list.
        """
        return [
            creator["name"]
            for creator in (creators or [])[:5]
            if creator.get("name")
        ]

    def _get_resource_type_label(self, resource_type: Dict) -> str:
        """
        Get human-readable resource type label.

        Prefers ``title``, then ``type``, and falls back to ``"Unknown"``
        when the mapping is empty/``None`` or neither key has a value.
        """
        if not resource_type:
            return "Unknown"
        return (
            resource_type.get("title") or resource_type.get("type") or "Unknown"
        )

    @staticmethod
    def _build_snippet(
        creators: List[str],
        type_label: str,
        access: Optional[str],
        license_id: str,
        pub_date: str,
        description: str,
    ) -> str:
        """Join the available preview facts into one ``". "``-separated line."""
        snippet_parts = []
        if creators:
            snippet_parts.append(f"By {', '.join(creators[:2])}")
        if type_label:
            type_str = f"Type: {type_label}"
            # Add access status and license inline
            access_license = []
            if access:
                access_license.append(access.replace("_", " ").title())
            if license_id:
                access_license.append(license_id.upper())
            if access_license:
                type_str += f" ({', '.join(access_license)})"
            snippet_parts.append(type_str)
        if pub_date:
            snippet_parts.append(f"Published: {pub_date}")
        if description:
            snippet_parts.append(description[:200])
        return ". ".join(snippet_parts)

    def _record_to_preview(self, record: Dict[str, Any]) -> Dict[str, Any]:
        """
        Convert one raw API hit into a preview dictionary.

        Every ``x.get(key) or default`` below exists because a key that is
        present with an explicit JSON ``null`` bypasses ``dict.get``'s
        default; without the ``or`` such a record would raise and be dropped.
        """
        record_id = record.get("id")
        metadata = record.get("metadata") or {}
        links = record.get("links") or {}

        creators = self._parse_creators(metadata.get("creators") or [])

        # Strip HTML tags and decode entities, then cap the stored text.
        description = metadata.get("description", "")
        if description:
            description = self._strip_html(description)[:500]

        pub_date = metadata.get("publication_date", "")
        type_label = self._get_resource_type_label(
            metadata.get("resource_type", {})
        )
        access = metadata.get("access_right", "open")

        license_info = metadata.get("license") or {}
        license_id = license_info.get("id") or ""

        return {
            "id": str(record_id),
            "title": metadata.get("title", "Untitled"),
            # Fall back to a URL built from the record id when the API does
            # not supply an HTML link.
            "link": links.get(
                "self_html", f"{self.base_url}/records/{record_id}"
            ),
            "snippet": self._build_snippet(
                creators, type_label, access, license_id, pub_date, description
            ),
            "authors": creators,
            "doi": metadata.get("doi", ""),
            "doi_url": links.get("doi", ""),
            "publication_date": pub_date,
            "resource_type": type_label,
            "access_right": access,
            "keywords": (metadata.get("keywords") or [])[:10],
            "license": license_id,
            "description": description,
            "source": "Zenodo",
            # Kept so _get_full_content can expand the record without a
            # second request; removed again before results are returned.
            "_raw": record,
        }

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information for Zenodo records.

        Args:
            query: The search query

        Returns:
            List of preview dictionaries. Empty when the request fails with a
            ``requests.RequestException`` or ``ValueError``. Records that
            cannot be parsed are logged and skipped.

        Raises:
            Whatever ``self._raise_if_rate_limit`` raises when it is handed
            the response status code or the caught exception (presumably
            ``RateLimitError`` - defined in the base class, confirm there).
        """
        logger.info(f"Getting Zenodo previews for query: {query}")

        # Apply rate limiting
        self._last_wait_time = self.rate_tracker.apply_rate_limit(
            self.engine_type
        )

        try:
            response = safe_get(
                self.search_url,
                params=self._build_query_params(query),
                headers=self.headers,
                timeout=30,
            )

            self._raise_if_rate_limit(response.status_code)

            response.raise_for_status()
            data = response.json()

            hits = data.get("hits") or {}
            results = hits.get("hits") or []
            total = hits.get("total", 0)
            logger.info(
                f"Found {total} Zenodo results, returning {len(results)}"
            )

            previews = []
            for record in results[: self.max_results]:
                # One malformed record must not discard the whole page.
                try:
                    previews.append(self._record_to_preview(record))
                except Exception:
                    logger.exception("Error parsing Zenodo record")

            return previews

        except (requests.RequestException, ValueError) as e:
            logger.exception("Zenodo API request failed")
            self._raise_if_rate_limit(e)
            return []

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant Zenodo records.

        No network request is made here: each item is expanded from the raw
        record stored under ``"_raw"`` by ``_get_previews``.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content. Each is a shallow
            copy of the input item with ``"content"`` added and ``"_raw"``
            removed; items that carried a raw record additionally get the
            untruncated description, all keywords, related identifiers,
            references and up to ten file entries.
        """
        logger.info(
            f"Getting full content for {len(relevant_items)} Zenodo records"
        )

        results = []
        for item in relevant_items:
            result = item.copy()

            raw = item.get("_raw", {})
            if raw:
                metadata = raw.get("metadata") or {}

                # Get full description (strip HTML tags and decode entities)
                desc = metadata.get("description", "")
                if desc:
                    result["description"] = self._strip_html(desc)

                # Get all keywords
                result["keywords"] = metadata.get("keywords") or []

                # Get related identifiers
                result["related_identifiers"] = (
                    metadata.get("related_identifiers") or []
                )

                # Get files info
                files = raw.get("files") or []
                result["files"] = [
                    {
                        "filename": f.get("key", ""),
                        "size": f.get("size", 0),
                        "checksum": f.get("checksum", ""),
                    }
                    for f in files[:10]
                ]

                # Get references
                result["references"] = metadata.get("references") or []

            # Build content summary
            content_parts = []
            if result.get("authors"):
                content_parts.append(f"Authors: {', '.join(result['authors'])}")
            if result.get("resource_type"):
                content_parts.append(f"Type: {result['resource_type']}")
            if result.get("publication_date"):
                content_parts.append(f"Published: {result['publication_date']}")
            if result.get("doi"):
                content_parts.append(f"DOI: {result['doi']}")
            if result.get("keywords"):
                content_parts.append(
                    f"Keywords: {', '.join(str(k) for k in result['keywords'][:5])}"
                )
            if result.get("license"):
                content_parts.append(f"License: {result['license']}")
            if result.get("description"):
                content_parts.append(
                    f"\nDescription: {result['description'][:1000]}"
                )

            result["content"] = "\n".join(content_parts)

            # Clean up internal fields
            result.pop("_raw", None)

            results.append(result)

        return results

    def get_record(self, record_id: int) -> Optional[Dict[str, Any]]:
        """
        Get a specific record by Zenodo ID.

        Args:
            record_id: The Zenodo record ID

        Returns:
            Record dictionary or None (any error other than a rate limit is
            logged and reported as ``None``)

        Raises:
            RateLimitError: Re-raised unchanged so callers can back off.
        """
        try:
            url = f"{self.search_url}/{record_id}"
            response = safe_get(url, headers=self.headers, timeout=30)
            self._raise_if_rate_limit(response.status_code)
            response.raise_for_status()
            return response.json()  # type: ignore[no-any-return]
        except RateLimitError:
            raise
        except Exception:
            logger.exception(f"Error fetching Zenodo record {record_id}")
            return None

    def _run_with_resource_type(
        self, query: str, resource_type: str
    ) -> List[Dict[str, Any]]:
        """
        Run ``query`` with ``self.resource_type`` temporarily overridden.

        The previous value is restored in ``finally`` so a failing search
        does not leave the engine stuck on the override.
        """
        original_type = self.resource_type
        try:
            self.resource_type = resource_type
            return self.run(query)
        finally:
            self.resource_type = original_type

    def search_datasets(self, query: str) -> List[Dict[str, Any]]:
        """
        Search specifically for datasets.

        Args:
            query: The search query

        Returns:
            List of matching datasets
        """
        return self._run_with_resource_type(query, "dataset")

    def search_software(self, query: str) -> List[Dict[str, Any]]:
        """
        Search specifically for software.

        Args:
            query: The search query

        Returns:
            List of matching software records
        """
        return self._run_with_resource_type(query, "software")