Coverage for src/local_deep_research/web_search_engines/engines/search_engine_zenodo.py: 97%
170 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
1"""Zenodo search engine for open research data and publications."""
3import html
4import re
5from typing import Any, Dict, List, Optional
7import requests
8from langchain_core.language_models import BaseLLM
9from loguru import logger
11from ...constants import USER_AGENT
12from ...security.safe_requests import safe_get
13from ..rate_limiting import RateLimitError
14from ..search_engine_base import BaseSearchEngine
class ZenodoSearchEngine(BaseSearchEngine):
    """
    Zenodo search engine for open research data and publications.

    Provides access to millions of research outputs including datasets,
    software, publications, and more. No authentication required for search.

    The engine queries ``https://zenodo.org/api/records`` and works in two
    phases: ``_get_previews`` turns each hit into a compact preview dict, and
    ``_get_full_content`` expands selected previews using the raw record that
    was stashed under the ``"_raw"`` key.

    NOTE(review): ``rate_tracker``, ``engine_type``, ``_raise_if_rate_limit``
    and ``run`` are used here but not defined in this class; they are
    presumably provided by ``BaseSearchEngine`` -- confirm against the base.
    """

    # Engine classification flags. Their consumers live outside this file.
    is_public = True
    is_generic = False
    is_scientific = True
    is_code = False
    is_lexical = True
    needs_llm_relevance_filter = True

    # Compiled once for the class instead of on every description clean-up.
    _HTML_TAG_RE = re.compile(r"<[^>]+>")

    def __init__(
        self,
        max_results: int = 10,
        resource_type: Optional[str] = None,
        access_right: Optional[str] = None,
        communities: Optional[str] = None,
        sort: str = "bestmatch",
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """
        Initialize the Zenodo search engine.

        Args:
            max_results: Maximum number of search results
            resource_type: Filter by type (dataset, software, publication, etc.)
            access_right: Filter by access (open, closed, embargoed, restricted)
            communities: Filter by Zenodo community
            sort: Sort order (bestmatch, mostrecent, -mostrecent)
            llm: Language model for relevance filtering
            max_filtered_results: Maximum results after filtering
            settings_snapshot: Settings snapshot for thread context
            **kwargs: Forwarded unchanged to the base class initializer
        """
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            settings_snapshot=settings_snapshot,
            **kwargs,
        )

        self.resource_type = resource_type
        self.access_right = access_right
        self.communities = communities
        self.sort = sort

        self.base_url = "https://zenodo.org"
        self.search_url = f"{self.base_url}/api/records"

        # User-Agent header for API requests
        self.headers = {"User-Agent": USER_AGENT}

    def _build_query_params(self, query: str) -> Dict[str, Any]:
        """
        Build query parameters for the API request.

        Args:
            query: The search query

        Returns:
            Dict with ``q``, ``size`` and ``sort`` plus whichever optional
            filters (``type``, ``access_right``, ``communities``) are set
            to a truthy value on this instance.
        """
        params: Dict[str, Any] = {
            "q": query,
            "size": self.max_results,
            "sort": self.sort,
        }

        if self.resource_type:
            params["type"] = self.resource_type

        if self.access_right:
            params["access_right"] = self.access_right

        if self.communities:
            params["communities"] = self.communities

        return params

    @classmethod
    def _strip_html(cls, text: Any) -> str:
        """
        Remove HTML tags from ``text`` and decode HTML entities.

        Args:
            text: Raw description value; may be missing or not a string

        Returns:
            The cleaned text, or ``""`` when ``text`` is falsy or not a
            string.
        """
        if not text or not isinstance(text, str):
            return ""
        # Tags are removed first so that entities such as "&lt;b&gt;" decode
        # to literal text instead of being stripped as markup.
        return html.unescape(cls._HTML_TAG_RE.sub("", text))

    def _parse_creators(self, creators: Optional[List[Dict]]) -> List[str]:
        """
        Parse creator/author information.

        Args:
            creators: The ``creators`` list of a record's metadata; ``None``
                or a non-list value is treated as empty

        Returns:
            Non-empty ``name`` values taken from the first five entries.
        """
        if not isinstance(creators, list):
            return []

        names = []
        # The cap is applied before filtering, so unnamed entries among the
        # first five are not back-filled from later ones.
        for creator in creators[:5]:
            if not isinstance(creator, dict):
                continue
            name = creator.get("name", "")
            if name:
                names.append(name)
        return names

    def _get_resource_type_label(self, resource_type: Dict) -> str:
        """
        Get human-readable resource type label.

        Args:
            resource_type: The ``resource_type`` mapping of a record

        Returns:
            The ``title`` if truthy, otherwise the ``type`` if truthy,
            otherwise ``"Unknown"``.
        """
        if not resource_type:
            return "Unknown"
        return (
            resource_type.get("title") or resource_type.get("type") or "Unknown"
        )

    @staticmethod
    def _build_snippet(
        creators: List[str],
        type_label: str,
        access: Any,
        license_id: str,
        pub_date: str,
        description: str,
    ) -> str:
        """Join the short, human-readable summary parts with ``". "``."""
        snippet_parts = []
        if creators:
            snippet_parts.append(f"By {', '.join(creators[:2])}")
        if type_label:
            type_str = f"Type: {type_label}"
            # Access status and license are shown inline after the type.
            access_license = []
            if access:
                access_license.append(access.replace("_", " ").title())
            if license_id:
                access_license.append(license_id.upper())
            if access_license:
                type_str += f" ({', '.join(access_license)})"
            snippet_parts.append(type_str)
        if pub_date:
            snippet_parts.append(f"Published: {pub_date}")
        if description:
            snippet_parts.append(description[:200])
        return ". ".join(snippet_parts)

    def _record_to_preview(self, record: Dict[str, Any]) -> Dict[str, Any]:
        """
        Convert one raw API hit into a preview dictionary.

        Fields that are present but ``null`` in the JSON are treated like
        missing fields, so one sparse record does not raise here.

        Args:
            record: A single entry of the response's ``hits.hits`` list

        Returns:
            Preview dictionary; the untouched record is kept under ``_raw``
            for ``_get_full_content``.
        """
        record_id = record.get("id")
        metadata = record.get("metadata") or {}

        title = metadata.get("title") or "Untitled"
        creators = self._parse_creators(metadata.get("creators"))

        # Previews keep at most 500 characters of the cleaned description.
        description = self._strip_html(metadata.get("description"))[:500]

        doi = metadata.get("doi") or ""
        pub_date = metadata.get("publication_date") or ""
        type_label = self._get_resource_type_label(
            metadata.get("resource_type") or {}
        )
        access = metadata.get("access_right", "open")
        keywords = (metadata.get("keywords") or [])[:10]

        # The license is normally a mapping with an "id"; a bare string is
        # tolerated and used as the id directly.
        license_info = metadata.get("license")
        if isinstance(license_info, dict):
            license_id = license_info.get("id") or ""
        elif isinstance(license_info, str):
            license_id = license_info
        else:
            license_id = ""

        links = record.get("links") or {}
        record_url = (
            links.get("self_html") or f"{self.base_url}/records/{record_id}"
        )
        doi_url = links.get("doi") or ""

        snippet = self._build_snippet(
            creators, type_label, access, license_id, pub_date, description
        )

        return {
            "id": str(record_id),
            "title": title,
            "link": record_url,
            "snippet": snippet,
            "authors": creators,
            "doi": doi,
            "doi_url": doi_url,
            "publication_date": pub_date,
            "resource_type": type_label,
            "access_right": access,
            "keywords": keywords,
            "license": license_id,
            "description": description,
            "source": "Zenodo",
            "_raw": record,
        }

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information for Zenodo records.

        Args:
            query: The search query

        Returns:
            List of preview dictionaries. Records that fail to parse are
            logged and skipped. An empty list is returned when the request
            fails or the response body is not valid JSON, unless
            ``_raise_if_rate_limit`` raises instead.
        """
        logger.info(f"Getting Zenodo previews for query: {query}")

        # Apply rate limiting
        self._last_wait_time = self.rate_tracker.apply_rate_limit(
            self.engine_type
        )

        try:
            params = self._build_query_params(query)
            response = safe_get(
                self.search_url,
                params=params,
                headers=self.headers,
                timeout=30,
            )

            self._raise_if_rate_limit(response.status_code)

            response.raise_for_status()
            data = response.json()

            # Guard against an unexpected payload shape (non-dict body or
            # null "hits") instead of failing with AttributeError.
            hits = data.get("hits") if isinstance(data, dict) else None
            if not isinstance(hits, dict):
                hits = {}
            results = hits.get("hits") or []
            if not isinstance(results, list):
                results = []
            total = hits.get("total", 0)
            logger.info(
                f"Found {total} Zenodo results, returning {len(results)}"
            )

            previews = []
            for record in results[: self.max_results]:
                try:
                    previews.append(self._record_to_preview(record))
                except Exception:
                    # Best effort: one malformed record must not discard
                    # the rest of the page.
                    logger.exception("Error parsing Zenodo record")
                    continue

            return previews

        except (requests.RequestException, ValueError) as e:
            logger.exception("Zenodo API request failed")
            self._raise_if_rate_limit(e)
            return []

    @staticmethod
    def _build_content_summary(result: Dict[str, Any]) -> str:
        """Render the populated fields of ``result`` as newline-joined text."""
        content_parts = []
        if result.get("authors"):
            content_parts.append(f"Authors: {', '.join(result['authors'])}")
        if result.get("resource_type"):
            content_parts.append(f"Type: {result['resource_type']}")
        if result.get("publication_date"):
            content_parts.append(f"Published: {result['publication_date']}")
        if result.get("doi"):
            content_parts.append(f"DOI: {result['doi']}")
        if result.get("keywords"):
            content_parts.append(
                f"Keywords: {', '.join(str(k) for k in result['keywords'][:5])}"
            )
        if result.get("license"):
            content_parts.append(f"License: {result['license']}")
        if result.get("description"):
            content_parts.append(
                f"\nDescription: {result['description'][:1000]}"
            )
        return "\n".join(content_parts)

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant Zenodo records.

        No network request is made: the extra fields are read from the raw
        record stored under ``_raw`` in each preview.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content. Each is a shallow
            copy of its preview with a ``content`` summary added and the
            internal ``_raw`` key removed.
        """
        logger.info(
            f"Getting full content for {len(relevant_items)} Zenodo records"
        )

        results = []
        for item in relevant_items:
            result = item.copy()

            raw = item.get("_raw") or {}
            if raw:
                metadata = raw.get("metadata") or {}

                # Full description (HTML stripped, entities decoded); the
                # preview's truncated text is kept when nothing remains.
                desc = self._strip_html(metadata.get("description"))
                if desc:
                    result["description"] = desc

                # Null values are normalized to empty lists.
                result["keywords"] = metadata.get("keywords") or []
                result["related_identifiers"] = (
                    metadata.get("related_identifiers") or []
                )

                # File info for the first ten entries.
                files = raw.get("files") or []
                if not isinstance(files, list):
                    files = []
                result["files"] = [
                    {
                        "filename": f.get("key", ""),
                        "size": f.get("size", 0),
                        "checksum": f.get("checksum", ""),
                    }
                    for f in files[:10]
                    if isinstance(f, dict)
                ]

                result["references"] = metadata.get("references") or []

            result["content"] = self._build_content_summary(result)

            # Clean up internal fields
            result.pop("_raw", None)

            results.append(result)

        return results

    def get_record(self, record_id: int) -> Optional[Dict[str, Any]]:
        """
        Get a specific record by Zenodo ID.

        Args:
            record_id: The Zenodo record ID

        Returns:
            Record dictionary or None if the request or JSON decoding fails

        Raises:
            RateLimitError: Re-raised unchanged when raised inside the call.
        """
        try:
            url = f"{self.search_url}/{record_id}"
            response = safe_get(url, headers=self.headers, timeout=30)
            self._raise_if_rate_limit(response.status_code)
            response.raise_for_status()
            return response.json()  # type: ignore[no-any-return]
        except RateLimitError:
            raise
        except Exception:
            logger.exception(f"Error fetching Zenodo record {record_id}")
            return None

    def _search_with_type(
        self, query: str, resource_type: str
    ) -> List[Dict[str, Any]]:
        """
        Run a search with ``self.resource_type`` temporarily overridden.

        The previous value is restored in a ``finally`` block, so it is put
        back even when ``run`` raises. The override mutates instance state
        for the duration of the call.
        """
        original_type = self.resource_type
        try:
            self.resource_type = resource_type
            return self.run(query)
        finally:
            self.resource_type = original_type

    def search_datasets(self, query: str) -> List[Dict[str, Any]]:
        """
        Search specifically for datasets.

        Args:
            query: The search query

        Returns:
            List of matching datasets
        """
        return self._search_with_type(query, "dataset")

    def search_software(self, query: str) -> List[Dict[str, Any]]:
        """
        Search specifically for software.

        Args:
            query: The search query

        Returns:
            List of matching software records
        """
        return self._search_with_type(query, "software")