Coverage for src / local_deep_research / web_search_engines / engines / search_engine_zenodo.py: 97%

169 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1"""Zenodo search engine for open research data and publications.""" 

2 

3import html 

4import re 

5from typing import Any, Dict, List, Optional 

6 

7import requests 

8from langchain_core.language_models import BaseLLM 

9from loguru import logger 

10 

11from ...security.safe_requests import safe_get 

12from ..rate_limiting import RateLimitError 

13from ..search_engine_base import BaseSearchEngine 

14 

15 

class ZenodoSearchEngine(BaseSearchEngine):
    """
    Zenodo search engine for open research data and publications.

    Provides access to millions of research outputs including datasets,
    software, publications, and more. No authentication required for search.

    The engine queries ``<base_url>/api/records`` and converts each hit into
    a flat preview dictionary (``_get_previews``); ``_get_full_content`` later
    enriches those previews from the raw record kept under ``"_raw"``.
    """

    # Engine classification flags. They are only declared here; the code that
    # interprets them lives outside this file.
    is_public = True
    is_generic = False
    is_scientific = True
    is_code = False
    is_lexical = True
    needs_llm_relevance_filter = True

    # Matches one HTML tag. Compiled once because it is applied to every
    # record description.
    _HTML_TAG_RE = re.compile(r"<[^>]+>")

    # Timeout (seconds) passed to every HTTP request made by this engine.
    _REQUEST_TIMEOUT = 30

    def __init__(
        self,
        max_results: int = 10,
        resource_type: Optional[str] = None,
        access_right: Optional[str] = None,
        communities: Optional[str] = None,
        sort: str = "bestmatch",
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """
        Initialize the Zenodo search engine.

        Args:
            max_results: Maximum number of search results
            resource_type: Filter by type (dataset, software, publication, etc.)
            access_right: Filter by access (open, closed, embargoed, restricted)
            communities: Filter by Zenodo community
            sort: Sort order (bestmatch, mostrecent, -mostrecent)
            llm: Language model for relevance filtering
            max_filtered_results: Maximum results after filtering
            settings_snapshot: Settings snapshot for thread context
            **kwargs: Forwarded unchanged to ``BaseSearchEngine.__init__``
        """
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            settings_snapshot=settings_snapshot,
            **kwargs,
        )

        self.resource_type = resource_type
        self.access_right = access_right
        self.communities = communities
        self.sort = sort

        self.base_url = "https://zenodo.org"
        self.search_url = f"{self.base_url}/api/records"

        # User-Agent header for API requests
        self.headers = {
            "User-Agent": "LocalDeepResearch/1.0 (https://github.com/LearningCircuit/local-deep-research)"
        }

    @classmethod
    def _strip_html(cls, text: Optional[str]) -> str:
        """Return ``text`` with HTML tags removed and entities decoded.

        Tags are removed *before* entities are decoded so that escaped markup
        in the content (e.g. ``&lt;b&gt;``) survives as literal text instead
        of being stripped as a tag. Falsy input yields an empty string.
        """
        if not text:
            return ""
        return html.unescape(cls._HTML_TAG_RE.sub("", text))

    def _build_query_params(self, query: str) -> Dict[str, Any]:
        """Build query parameters for the API request.

        ``q``, ``size`` and ``sort`` are always sent; the ``type``,
        ``access_right`` and ``communities`` filters are added only when the
        corresponding instance attribute is set to a truthy value.
        """
        params: Dict[str, Any] = {
            "q": query,
            # NOTE(review): max_results is not assigned in this class;
            # presumably set by BaseSearchEngine.__init__ - confirm.
            "size": self.max_results,
            "sort": self.sort,
        }

        if self.resource_type:
            params["type"] = self.resource_type

        if self.access_right:
            params["access_right"] = self.access_right

        if self.communities:
            params["communities"] = self.communities

        return params

    def _parse_creators(self, creators: List[Dict]) -> List[str]:
        """Parse creator/author information.

        Only the first five entries are inspected; entries that are not
        dictionaries or that have no non-empty ``name`` are skipped, so the
        result may hold fewer than five names. ``None`` is treated as an
        empty list (a JSON ``null`` arrives here as ``None``).
        """
        result = []
        for creator in (creators or [])[:5]:
            if not isinstance(creator, dict):
                continue
            name = creator.get("name", "")
            if name:
                result.append(name)
        return result

    def _get_resource_type_label(self, resource_type: Dict) -> str:
        """Get human-readable resource type label.

        Prefers the ``title`` field, falls back to ``type``, and returns
        ``"Unknown"`` when neither is usable or the value is not a dict.
        """
        if not resource_type or not isinstance(resource_type, dict):
            return "Unknown"
        return (
            resource_type.get("title") or resource_type.get("type") or "Unknown"
        )

    def _build_snippet(
        self,
        creators: List[str],
        type_label: str,
        access: Optional[str],
        license_id: str,
        pub_date: str,
        description: str,
    ) -> str:
        """Assemble the one-line snippet shown for a record preview.

        Parts (authors, type with access/license, publication date, first
        200 characters of the description) are included only when present
        and are joined with ``". "``.
        """
        snippet_parts = []
        if creators:
            snippet_parts.append(f"By {', '.join(creators[:2])}")
        if type_label:
            type_str = f"Type: {type_label}"
            # Add access status and license inline
            access_license = []
            if access:
                access_license.append(access.replace("_", " ").title())
            if license_id:
                access_license.append(license_id.upper())
            if access_license:
                type_str += f" ({', '.join(access_license)})"
            snippet_parts.append(type_str)
        if pub_date:
            snippet_parts.append(f"Published: {pub_date}")
        if description:
            snippet_parts.append(description[:200])
        return ". ".join(snippet_parts)

    def _record_to_preview(self, record: Dict[str, Any]) -> Dict[str, Any]:
        """Convert one raw API hit into a preview dictionary.

        ``x.get(key) or default`` is used instead of ``x.get(key, default)``
        throughout: the latter returns ``None`` when the key is present with
        a JSON ``null``, which would make the following slice / ``.get``
        raise and cause the whole record to be dropped.
        """
        record_id = record.get("id")
        metadata = record.get("metadata") or {}

        title = metadata.get("title") or "Untitled"

        # Get creators
        creators = self._parse_creators(metadata.get("creators") or [])

        # Get description/abstract, stripped of HTML and capped for preview
        description = self._strip_html(metadata.get("description") or "")[:500]

        # Get DOI
        doi = metadata.get("doi") or ""

        # Get publication date
        pub_date = metadata.get("publication_date") or ""

        # Get resource type
        type_label = self._get_resource_type_label(
            metadata.get("resource_type") or {}
        )

        # Get access right (defaults to "open" only when the key is absent)
        access = metadata.get("access_right", "open")

        # Get keywords
        keywords = (metadata.get("keywords") or [])[:10]

        # Get license; accept either a {"id": ...} mapping or a bare string
        license_info = metadata.get("license")
        if isinstance(license_info, dict):
            license_id = license_info.get("id") or ""
        elif isinstance(license_info, str):
            license_id = license_info
        else:
            license_id = ""

        # Get links
        links = record.get("links") or {}
        record_url = links.get(
            "self_html", f"{self.base_url}/records/{record_id}"
        )
        doi_url = links.get("doi", "")

        snippet = self._build_snippet(
            creators, type_label, access, license_id, pub_date, description
        )

        return {
            "id": str(record_id),
            "title": title,
            "link": record_url,
            "snippet": snippet,
            "authors": creators,
            "doi": doi,
            "doi_url": doi_url,
            "publication_date": pub_date,
            "resource_type": type_label,
            "access_right": access,
            "keywords": keywords,
            "license": license_id,
            "description": description,
            "source": "Zenodo",
            # Kept so _get_full_content can enrich the item without a
            # second request; removed again before results are returned.
            "_raw": record,
        }

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information for Zenodo records.

        Args:
            query: The search query

        Returns:
            List of preview dictionaries. An empty list is returned when the
            request fails, the body is not valid JSON, or the payload does
            not have the expected shape. Records that fail to parse are
            logged and skipped individually.
        """
        logger.info(f"Getting Zenodo previews for query: {query}")

        # Apply rate limiting
        self._last_wait_time = self.rate_tracker.apply_rate_limit(
            self.engine_type
        )

        try:
            params = self._build_query_params(query)
            response = safe_get(
                self.search_url,
                params=params,
                headers=self.headers,
                timeout=self._REQUEST_TIMEOUT,
            )

            # NOTE(review): helper is defined outside this class; presumably
            # raises RateLimitError for throttling status codes - confirm.
            self._raise_if_rate_limit(response.status_code)

            response.raise_for_status()
            data = response.json()

            # A non-object payload or a null "hits" would otherwise raise
            # AttributeError, which the handler below does not catch.
            if not isinstance(data, dict):
                logger.warning("Zenodo API returned an unexpected payload")
                return []

            hits = data.get("hits") or {}
            results = hits.get("hits") or []
            total = hits.get("total", 0)
            logger.info(
                f"Found {total} Zenodo results, returning {len(results)}"
            )

            previews = []
            for record in results[: self.max_results]:
                try:
                    previews.append(self._record_to_preview(record))
                except Exception:
                    # Best effort: one malformed record must not discard
                    # the rest of the result page.
                    logger.exception("Error parsing Zenodo record")
                    continue

            return previews

        except (requests.RequestException, ValueError) as e:
            logger.exception("Zenodo API request failed")
            self._raise_if_rate_limit(e)
            return []

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant Zenodo records.

        No network request is made: each item is enriched from the raw
        record stored under ``"_raw"`` by ``_get_previews``. Input items are
        shallow-copied, and the copies have ``"_raw"`` removed.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content
        """
        logger.info(
            f"Getting full content for {len(relevant_items)} Zenodo records"
        )

        results = []
        for item in relevant_items:
            result = item.copy()

            raw = item.get("_raw") or {}
            if raw:
                metadata = raw.get("metadata") or {}

                # Get full description (strip HTML tags and decode entities)
                desc = metadata.get("description") or ""
                if desc:
                    result["description"] = self._strip_html(desc)

                # Get all keywords
                result["keywords"] = metadata.get("keywords") or []

                # Get related identifiers
                result["related_identifiers"] = (
                    metadata.get("related_identifiers") or []
                )

                # Get files info. Anything other than a list of dicts is
                # ignored so an unexpected shape cannot abort the batch.
                files = raw.get("files") or []
                if not isinstance(files, list):
                    files = []
                result["files"] = [
                    {
                        "filename": f.get("key", ""),
                        "size": f.get("size", 0),
                        "checksum": f.get("checksum", ""),
                    }
                    for f in files[:10]
                    if isinstance(f, dict)
                ]

                # Get references
                result["references"] = metadata.get("references") or []

            # Build content summary
            content_parts = []
            if result.get("authors"):
                content_parts.append(
                    f"Authors: {', '.join(result['authors'])}"
                )
            if result.get("resource_type"):
                content_parts.append(f"Type: {result['resource_type']}")
            if result.get("publication_date"):
                content_parts.append(
                    f"Published: {result['publication_date']}"
                )
            if result.get("doi"):
                content_parts.append(f"DOI: {result['doi']}")
            if result.get("keywords"):
                content_parts.append(
                    f"Keywords: {', '.join(str(k) for k in result['keywords'][:5])}"
                )
            if result.get("license"):
                content_parts.append(f"License: {result['license']}")
            if result.get("description"):
                content_parts.append(
                    f"\nDescription: {result['description'][:1000]}"
                )

            result["content"] = "\n".join(content_parts)

            # Clean up internal fields
            result.pop("_raw", None)

            results.append(result)

        return results

    def get_record(self, record_id: int) -> Optional[Dict[str, Any]]:
        """
        Get a specific record by Zenodo ID.

        Args:
            record_id: The Zenodo record ID

        Returns:
            Record dictionary or None if the request or JSON decoding fails

        Raises:
            RateLimitError: Re-raised unchanged so callers can back off.
        """
        try:
            url = f"{self.search_url}/{record_id}"
            response = safe_get(
                url, headers=self.headers, timeout=self._REQUEST_TIMEOUT
            )
            self._raise_if_rate_limit(response.status_code)
            response.raise_for_status()
            return response.json()  # type: ignore[no-any-return]
        except RateLimitError:
            raise
        except Exception:
            logger.exception(f"Error fetching Zenodo record {record_id}")
            return None

    def _run_with_resource_type(
        self, resource_type: str, query: str
    ) -> List[Dict[str, Any]]:
        """Run ``query`` with ``self.resource_type`` temporarily overridden.

        The previous filter is restored even if the search raises. The
        override is applied by mutating the instance, so concurrent searches
        on the same instance would observe it while the call is in flight.
        """
        original_type = self.resource_type
        self.resource_type = resource_type
        try:
            return self.run(query)
        finally:
            self.resource_type = original_type

    def search_datasets(self, query: str) -> List[Dict[str, Any]]:
        """
        Search specifically for datasets.

        Args:
            query: The search query

        Returns:
            List of matching datasets
        """
        return self._run_with_resource_type("dataset", query)

    def search_software(self, query: str) -> List[Dict[str, Any]]:
        """
        Search specifically for software.

        Args:
            query: The search query

        Returns:
            List of matching software records
        """
        return self._run_with_resource_type("software", query)