Coverage for src/local_deep_research/web_search_engines/engines/search_engine_zenodo.py: 97%

170 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-03 23:15 +0000

1"""Zenodo search engine for open research data and publications.""" 

2 

3import html 

4import re 

5from typing import Any, Dict, List, Optional 

6 

7import requests 

8from langchain_core.language_models import BaseLLM 

9from loguru import logger 

10 

11from ...constants import USER_AGENT 

12from ...security.safe_requests import safe_get 

13from ..rate_limiting import RateLimitError 

14from ..search_engine_base import BaseSearchEngine 

15 

16 

17class ZenodoSearchEngine(BaseSearchEngine): 

18 """ 

19 Zenodo search engine for open research data and publications. 

20 

21 Provides access to millions of research outputs including datasets, 

22 software, publications, and more. No authentication required for search. 

23 """ 

24 

25 is_public = True 

26 is_generic = False 

27 is_scientific = True 

28 is_code = False 

29 is_lexical = True 

30 needs_llm_relevance_filter = True 

31 

def __init__(
    self,
    max_results: int = 10,
    resource_type: Optional[str] = None,
    access_right: Optional[str] = None,
    communities: Optional[str] = None,
    sort: str = "bestmatch",
    llm: Optional[BaseLLM] = None,
    max_filtered_results: Optional[int] = None,
    settings_snapshot: Optional[Dict[str, Any]] = None,
    **kwargs,
):
    """
    Set up a Zenodo search engine instance.

    Args:
        max_results: Upper bound on the number of search results
        resource_type: Optional record type filter (dataset, software, publication, etc.)
        access_right: Optional access filter (open, closed, embargoed, restricted)
        communities: Optional Zenodo community filter
        sort: Sort order (bestmatch, mostrecent, -mostrecent)
        llm: Language model handed to the base class for relevance filtering
        max_filtered_results: Cap on results kept after filtering
        settings_snapshot: Settings snapshot for thread context
        **kwargs: Forwarded unchanged to the base class constructor
    """
    # Generic engine options are owned by the base class.
    super().__init__(
        max_results=max_results,
        max_filtered_results=max_filtered_results,
        llm=llm,
        settings_snapshot=settings_snapshot,
        **kwargs,
    )

    # Zenodo endpoints
    self.base_url = "https://zenodo.org"
    self.search_url = f"{self.base_url}/api/records"

    # Every API request identifies itself with the project-wide User-Agent
    self.headers = {"User-Agent": USER_AGENT}

    # Optional search filters; a falsy value means "do not filter"
    self.sort = sort
    self.resource_type = resource_type
    self.access_right = access_right
    self.communities = communities

75 

76 def _build_query_params(self, query: str) -> Dict[str, Any]: 

77 """Build query parameters for the API request.""" 

78 params = { 

79 "q": query, 

80 "size": self.max_results, 

81 "sort": self.sort, 

82 } 

83 

84 if self.resource_type: 

85 params["type"] = self.resource_type 

86 

87 if self.access_right: 

88 params["access_right"] = self.access_right 

89 

90 if self.communities: 

91 params["communities"] = self.communities 

92 

93 return params 

94 

95 def _parse_creators(self, creators: List[Dict]) -> List[str]: 

96 """Parse creator/author information.""" 

97 result = [] 

98 for creator in creators[:5]: 

99 name = creator.get("name", "") 

100 if name: 100 ↛ 98line 100 didn't jump to line 98 because the condition on line 100 was always true

101 result.append(name) 

102 return result 

103 

104 def _get_resource_type_label(self, resource_type: Dict) -> str: 

105 """Get human-readable resource type label.""" 

106 if not resource_type: 

107 return "Unknown" 

108 return ( 

109 resource_type.get("title") or resource_type.get("type") or "Unknown" 

110 ) 

111 

def _get_previews(self, query: str) -> List[Dict[str, Any]]:
    """
    Get preview information for Zenodo records.

    Applies the engine's rate limiter, queries the records endpoint and
    converts each returned hit into a preview dictionary. A hit that cannot
    be converted is logged and skipped; the remaining hits are still
    returned.

    Args:
        query: The search query

    Returns:
        List of preview dictionaries; empty when the request fails with a
        ``requests.RequestException`` or ``ValueError`` and
        ``_raise_if_rate_limit`` does not raise for that failure

    Raises:
        Whatever ``self._raise_if_rate_limit`` raises (presumably
        ``RateLimitError``; the helper lives in the base class - confirm
        there).
    """
    logger.info(f"Getting Zenodo previews for query: {query}")

    # Apply rate limiting
    self._last_wait_time = self.rate_tracker.apply_rate_limit(
        self.engine_type
    )

    try:
        params = self._build_query_params(query)
        response = safe_get(
            self.search_url,
            params=params,
            headers=self.headers,
            timeout=30,
        )

        self._raise_if_rate_limit(response.status_code)

        response.raise_for_status()
        data = response.json()

        # ``or`` instead of a .get() default: the default is ignored when
        # the key is present with an explicit null value.
        hits = data.get("hits") or {}
        results = hits.get("hits") or []
        total = hits.get("total", 0)
        logger.info(
            f"Found {total} Zenodo results, returning {len(results)}"
        )

        previews = []
        for record in results[: self.max_results]:
            try:
                previews.append(self._record_to_preview(record))
            except Exception:
                # Best effort: one malformed record must not discard the
                # rest of the result page.
                logger.exception("Error parsing Zenodo record")
                continue

        return previews

    except (requests.RequestException, ValueError) as e:
        logger.exception("Zenodo API request failed")
        self._raise_if_rate_limit(e)
        return []


def _record_to_preview(self, record: Dict[str, Any]) -> Dict[str, Any]:
    """Convert one hit from the search response into a preview dictionary."""
    record_id = record.get("id")
    # Null-valued containers are normalised to empty ones so a single null
    # field does not cost us the whole record.
    metadata = record.get("metadata") or {}
    links = record.get("links") or {}

    title = metadata.get("title") or "Untitled"

    # Get creators
    creators = self._parse_creators(metadata.get("creators") or [])

    # Get description/abstract; strip HTML tags and decode entities, then
    # cap the stored text at 500 characters
    description = metadata.get("description") or ""
    if description:
        description = html.unescape(re.sub(r"<[^>]+>", "", description))
        description = description[:500]

    doi = metadata.get("doi", "")
    pub_date = metadata.get("publication_date", "")
    type_label = self._get_resource_type_label(
        metadata.get("resource_type", {})
    )
    access = metadata.get("access_right", "open")
    keywords = (metadata.get("keywords") or [])[:10]

    license_info = metadata.get("license", {})
    license_id = license_info.get("id", "") if license_info else ""

    # Prefer the link supplied by the API; otherwise build the record page
    # URL from the id
    record_url = (
        links.get("self_html") or f"{self.base_url}/records/{record_id}"
    )
    doi_url = links.get("doi", "")

    # Build snippet: authors, type (access, license), date, short abstract
    snippet_parts = []
    if creators:
        snippet_parts.append(f"By {', '.join(creators[:2])}")
    if type_label:
        type_str = f"Type: {type_label}"
        # Add access status and license inline
        access_license = []
        if access:
            access_license.append(access.replace("_", " ").title())
        if license_id:
            access_license.append(license_id.upper())
        if access_license:
            type_str += f" ({', '.join(access_license)})"
        snippet_parts.append(type_str)
    if pub_date:
        snippet_parts.append(f"Published: {pub_date}")
    if description:
        snippet_parts.append(description[:200])
    snippet = ". ".join(snippet_parts)

    return {
        "id": str(record_id),
        "title": title,
        "link": record_url,
        "snippet": snippet,
        "authors": creators,
        "doi": doi,
        "doi_url": doi_url,
        "publication_date": pub_date,
        "resource_type": type_label,
        "access_right": access,
        "keywords": keywords,
        "license": license_id,
        "description": description,
        "source": "Zenodo",
        # Kept so _get_full_content can read the untruncated fields later
        "_raw": record,
    }

254 

def _get_full_content(
    self, relevant_items: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
    """
    Get full content for the relevant Zenodo records.

    No request is made here: each preview is enriched from the raw API
    record stored under its ``_raw`` key, and that internal key is removed
    from the returned dictionaries. The input previews are not modified
    (each one is shallow-copied first).

    Args:
        relevant_items: List of relevant preview dictionaries

    Returns:
        List of result dictionaries with full content; every result gets a
        ``content`` text summary, and results that carried a raw record
        also get the untruncated ``description``, all ``keywords``,
        ``related_identifiers``, ``references`` and up to ten ``files``
        entries
    """
    logger.info(
        f"Getting full content for {len(relevant_items)} Zenodo records"
    )

    results = []
    for item in relevant_items:
        result = item.copy()

        # pop() reads the raw record and drops the internal field from the
        # result in one step.
        raw = result.pop("_raw", None)
        if raw:
            # ``or`` instead of .get() defaults: a key present with a null
            # value would otherwise yield None and either crash the batch
            # (metadata) or replace a list field with None.
            metadata = raw.get("metadata") or {}

            # Get full description (strip HTML tags and decode entities)
            desc = metadata.get("description") or ""
            if desc:
                result["description"] = html.unescape(
                    re.sub(r"<[^>]+>", "", desc)
                )

            # Get all keywords
            result["keywords"] = metadata.get("keywords") or []

            # Get related identifiers
            result["related_identifiers"] = (
                metadata.get("related_identifiers") or []
            )

            # Get files info
            files = raw.get("files") or []
            result["files"] = [
                {
                    "filename": f.get("key", ""),
                    "size": f.get("size", 0),
                    "checksum": f.get("checksum", ""),
                }
                for f in files[:10]
            ]

            # Get references
            result["references"] = metadata.get("references") or []

        # Build content summary from whichever fields are populated
        content_parts = []
        if result.get("authors"):
            content_parts.append(f"Authors: {', '.join(result['authors'])}")
        if result.get("resource_type"):
            content_parts.append(f"Type: {result['resource_type']}")
        if result.get("publication_date"):
            content_parts.append(f"Published: {result['publication_date']}")
        if result.get("doi"):
            content_parts.append(f"DOI: {result['doi']}")
        if result.get("keywords"):
            content_parts.append(
                f"Keywords: {', '.join(str(k) for k in result['keywords'][:5])}"
            )
        if result.get("license"):
            content_parts.append(f"License: {result['license']}")
        if result.get("description"):
            content_parts.append(
                f"\nDescription: {result['description'][:1000]}"
            )

        result["content"] = "\n".join(content_parts)

        results.append(result)

    return results

341 

def get_record(self, record_id: int) -> Optional[Dict[str, Any]]:
    """
    Fetch a single Zenodo record by its ID.

    Args:
        record_id: The Zenodo record ID

    Returns:
        The decoded JSON body of the record, or None when the request or
        decoding fails for any reason other than a rate limit

    Raises:
        RateLimitError: Re-raised untouched so callers can react to it
    """
    try:
        response = safe_get(
            f"{self.search_url}/{record_id}",
            headers=self.headers,
            timeout=30,
        )
        self._raise_if_rate_limit(response.status_code)
        response.raise_for_status()
        record = response.json()
    except RateLimitError:
        # Rate limiting is not a lookup failure; let it propagate.
        raise
    except Exception:
        logger.exception(f"Error fetching Zenodo record {record_id}")
        return None
    return record  # type: ignore[no-any-return]

363 

def search_datasets(self, query: str) -> List[Dict[str, Any]]:
    """
    Run a search restricted to dataset records.

    The engine's ``resource_type`` filter is set to ``"dataset"`` for the
    duration of the call and put back to its previous value afterwards,
    including when the search raises.

    Args:
        query: The search query

    Returns:
        List of matching datasets
    """
    saved_type = self.resource_type
    try:
        self.resource_type = "dataset"
        matches = self.run(query)
    finally:
        # Always restore the caller's filter
        self.resource_type = saved_type
    return matches

380 

def search_software(self, query: str) -> List[Dict[str, Any]]:
    """
    Run a search restricted to software records.

    The engine's ``resource_type`` filter is set to ``"software"`` for the
    duration of the call and put back to its previous value afterwards,
    including when the search raises.

    Args:
        query: The search query

    Returns:
        List of matching software records
    """
    saved_type = self.resource_type
    try:
        self.resource_type = "software"
        matches = self.run(query)
    finally:
        # Always restore the caller's filter
        self.resource_type = saved_type
    return matches