Coverage for src/local_deep_research/web_search_engines/engines/search_engine_zenodo.py

1"""Zenodo search engine for open research data and publications."""

3import html

4import re

5from typing import Any, Dict, List, Optional

7import requests

8from langchain_core.language_models import BaseLLM

9from loguru import logger

11from ...constants import USER_AGENT

12from ...security.safe_requests import safe_get

13from ..rate_limiting import RateLimitError

14from ..search_engine_base import BaseSearchEngine

class ZenodoSearchEngine(BaseSearchEngine):
    """
    Zenodo search engine for open research data and publications.

    Provides access to millions of research outputs including datasets,
    software, publications, and more. No authentication required for search.
    """

    is_public = True
    is_generic = False
    is_scientific = True
    is_code = False
    is_lexical = True
    needs_llm_relevance_filter = True

    # Matches one HTML/XML tag. Compiled once because it is applied to the
    # description of every record.
    _HTML_TAG_RE = re.compile(r"<[^>]+>")

    def __init__(
        self,
        max_results: int = 10,
        resource_type: Optional[str] = None,
        access_right: Optional[str] = None,
        communities: Optional[str] = None,
        sort: str = "bestmatch",
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """
        Initialize the Zenodo search engine.

        Args:
            max_results: Maximum number of search results
            resource_type: Filter by type (dataset, software, publication, etc.)
            access_right: Filter by access (open, closed, embargoed, restricted)
            communities: Filter by Zenodo community
            sort: Sort order (bestmatch, mostrecent, -mostrecent)
            llm: Language model for relevance filtering
            max_filtered_results: Maximum results after filtering
            settings_snapshot: Settings snapshot for thread context
            **kwargs: Forwarded unchanged to the base class constructor
        """
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            settings_snapshot=settings_snapshot,
            **kwargs,
        )

        self.resource_type = resource_type
        self.access_right = access_right
        self.communities = communities
        self.sort = sort

        self.base_url = "https://zenodo.org"
        self.search_url = f"{self.base_url}/api/records"

        # User-Agent header for API requests
        self.headers = {"User-Agent": USER_AGENT}

    def _build_query_params(self, query: str) -> Dict[str, Any]:
        """
        Build query parameters for the API request.

        Args:
            query: The search query

        Returns:
            Dictionary with ``q``, ``size`` and ``sort``, plus ``type``,
            ``access_right`` and ``communities`` for each filter that is set
        """
        params: Dict[str, Any] = {
            "q": query,
            "size": self.max_results,
            "sort": self.sort,
        }

        # Optional filters are only sent when they have a truthy value.
        if self.resource_type:
            params["type"] = self.resource_type

        if self.access_right:
            params["access_right"] = self.access_right

        if self.communities:
            params["communities"] = self.communities

        return params

    def _strip_html(self, text: Optional[str]) -> str:
        """
        Remove HTML tags from ``text`` and decode HTML entities.

        Args:
            text: Raw (possibly HTML) text, or None

        Returns:
            Plain text; an empty string when ``text`` is empty or None
        """
        if not text:
            return ""
        # Tags are removed before unescaping so that escaped markup such as
        # "&lt;b&gt;" survives as literal text instead of being stripped.
        return html.unescape(self._HTML_TAG_RE.sub("", text))

    def _parse_creators(self, creators: Optional[List[Dict]]) -> List[str]:
        """
        Parse creator/author information.

        Args:
            creators: Creator dictionaries from the record metadata (or None)

        Returns:
            Non-empty ``name`` values of the first five creators
        """
        names = []
        # ``or []`` tolerates an explicit null in the API response.
        for creator in (creators or [])[:5]:
            name = creator.get("name", "")
            if name:
                names.append(name)
        return names

    def _get_resource_type_label(self, resource_type: Dict) -> str:
        """
        Get human-readable resource type label.

        Args:
            resource_type: The ``resource_type`` metadata dictionary

        Returns:
            The ``title`` if present, else the ``type``, else "Unknown"
        """
        if not resource_type:
            return "Unknown"
        return (
            resource_type.get("title") or resource_type.get("type") or "Unknown"
        )

    def _build_snippet(
        self,
        creators: List[str],
        type_label: str,
        access: Optional[str],
        license_id: str,
        pub_date: str,
        description: str,
    ) -> str:
        """
        Build the one-line snippet shown for a preview.

        Args:
            creators: Parsed creator names (only the first two are shown)
            type_label: Human-readable resource type
            access: Access right value, e.g. "open"
            license_id: License identifier, may be empty
            pub_date: Publication date string, may be empty
            description: Plain-text description (first 200 characters shown)

        Returns:
            The non-empty parts joined with ". "
        """
        parts = []
        if creators:
            parts.append(f"By {', '.join(creators[:2])}")
        if type_label:
            # Access status and license are shown inline after the type.
            qualifiers = []
            if access:
                qualifiers.append(access.replace("_", " ").title())
            if license_id:
                qualifiers.append(license_id.upper())
            type_str = f"Type: {type_label}"
            if qualifiers:
                type_str += f" ({', '.join(qualifiers)})"
            parts.append(type_str)
        if pub_date:
            parts.append(f"Published: {pub_date}")
        if description:
            parts.append(description[:200])
        return ". ".join(parts)

    def _record_to_preview(self, record: Dict[str, Any]) -> Dict[str, Any]:
        """
        Convert one raw Zenodo record into a preview dictionary.

        Fields that are missing or explicitly null in the response fall back
        to empty defaults so that a sparse record is still returned.

        Args:
            record: One entry of ``hits.hits`` from the search response

        Returns:
            Preview dictionary; the untouched record is kept under ``_raw``
        """
        record_id = record.get("id")
        metadata = record.get("metadata") or {}

        title = metadata.get("title") or "Untitled"
        creators = self._parse_creators(metadata.get("creators"))

        # Plain-text description, capped at 500 characters for the preview.
        description = self._strip_html(metadata.get("description"))[:500]

        doi = metadata.get("doi", "")
        pub_date = metadata.get("publication_date", "")
        type_label = self._get_resource_type_label(
            metadata.get("resource_type", {})
        )
        access = metadata.get("access_right", "open")
        keywords = (metadata.get("keywords") or [])[:10]

        license_info = metadata.get("license") or {}
        license_id = license_info.get("id") or ""

        links = record.get("links") or {}
        # Fall back to a URL built from the record id when the response
        # carries no "self_html" link.
        record_url = links.get(
            "self_html", f"{self.base_url}/records/{record_id}"
        )
        doi_url = links.get("doi", "")

        snippet = self._build_snippet(
            creators, type_label, access, license_id, pub_date, description
        )

        return {
            "id": str(record_id),
            "title": title,
            "link": record_url,
            "snippet": snippet,
            "authors": creators,
            "doi": doi,
            "doi_url": doi_url,
            "publication_date": pub_date,
            "resource_type": type_label,
            "access_right": access,
            "keywords": keywords,
            "license": license_id,
            "description": description,
            "source": "Zenodo",
            "_raw": record,
        }

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information for Zenodo records.

        Request failures (``requests.RequestException``) and JSON decoding
        errors (``ValueError``) are logged and yield an empty list, unless
        ``self._raise_if_rate_limit`` raises for them (presumably a
        ``RateLimitError``), in which case that exception propagates.

        Args:
            query: The search query

        Returns:
            List of preview dictionaries
        """
        logger.info(f"Getting Zenodo previews for query: {query}")

        # Apply rate limiting
        self._last_wait_time = self.rate_tracker.apply_rate_limit(
            self.engine_type
        )

        try:
            params = self._build_query_params(query)
            response = safe_get(
                self.search_url,
                params=params,
                headers=self.headers,
                timeout=30,
            )

            self._raise_if_rate_limit(response.status_code)

            response.raise_for_status()
            data = response.json()

            hits = data.get("hits") or {}
            results = hits.get("hits") or []
            total = hits.get("total", 0)
            logger.info(
                f"Found {total} Zenodo results, returning {len(results)}"
            )

            previews = []
            for record in results[: self.max_results]:
                # Best effort: one malformed record must not discard the
                # rest of the result page.
                try:
                    previews.append(self._record_to_preview(record))
                except Exception:
                    logger.exception("Error parsing Zenodo record")

            return previews

        except (requests.RequestException, ValueError) as e:
            logger.exception("Zenodo API request failed")
            self._raise_if_rate_limit(e)
            return []

    def _build_content_summary(self, result: Dict[str, Any]) -> str:
        """
        Build the multi-line text summary stored under ``content``.

        Args:
            result: A result dictionary with the preview and detail fields

        Returns:
            One line per available field; the description (first 1000
            characters) comes last, preceded by a blank line
        """
        parts = []
        if result.get("authors"):
            parts.append(f"Authors: {', '.join(result['authors'])}")
        if result.get("resource_type"):
            parts.append(f"Type: {result['resource_type']}")
        if result.get("publication_date"):
            parts.append(f"Published: {result['publication_date']}")
        if result.get("doi"):
            parts.append(f"DOI: {result['doi']}")
        if result.get("keywords"):
            parts.append(
                f"Keywords: {', '.join(str(k) for k in result['keywords'][:5])}"
            )
        if result.get("license"):
            parts.append(f"License: {result['license']}")
        if result.get("description"):
            parts.append(f"\nDescription: {result['description'][:1000]}")
        return "\n".join(parts)

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant Zenodo records.

        No additional request is made: the details are read from the raw
        record stored under ``_raw`` in each preview.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content (``_raw`` removed)
        """
        logger.info(
            f"Getting full content for {len(relevant_items)} Zenodo records"
        )

        results = []
        for item in relevant_items:
            # Shallow copy so the caller's preview dictionaries stay intact.
            result = item.copy()

            raw = item.get("_raw") or {}
            if raw:
                metadata = raw.get("metadata") or {}

                # Replace the truncated preview description with the full
                # plain-text one when the record has a description.
                desc = self._strip_html(metadata.get("description"))
                if desc:
                    result["description"] = desc

                result["keywords"] = metadata.get("keywords") or []
                result["related_identifiers"] = (
                    metadata.get("related_identifiers") or []
                )

                # Only the first ten files are listed.
                files = raw.get("files") or []
                result["files"] = [
                    {
                        "filename": f.get("key", ""),
                        "size": f.get("size", 0),
                        "checksum": f.get("checksum", ""),
                    }
                    for f in files[:10]
                ]

                result["references"] = metadata.get("references") or []

            result["content"] = self._build_content_summary(result)

            # Clean up internal fields
            result.pop("_raw", None)

            results.append(result)

        return results

    def get_record(self, record_id: int) -> Optional[Dict[str, Any]]:
        """
        Get a specific record by Zenodo ID.

        Args:
            record_id: The Zenodo record ID

        Returns:
            Record dictionary, or None if the request failed

        Raises:
            RateLimitError: Re-raised unchanged; every other exception is
                logged and turned into a None result
        """
        try:
            url = f"{self.search_url}/{record_id}"
            response = safe_get(url, headers=self.headers, timeout=30)
            self._raise_if_rate_limit(response.status_code)
            response.raise_for_status()
            return response.json()  # type: ignore[no-any-return]
        except RateLimitError:
            raise
        except Exception:
            logger.exception(f"Error fetching Zenodo record {record_id}")
            return None

    def _run_with_resource_type(
        self, query: str, resource_type: str
    ) -> List[Dict[str, Any]]:
        """
        Run a search with ``self.resource_type`` temporarily overridden.

        The override is stored on the instance for the duration of the call
        and the previous value is restored afterwards, even on error.

        Args:
            query: The search query
            resource_type: Resource type to filter by for this call only

        Returns:
            Whatever ``self.run(query)`` returns
        """
        original_type = self.resource_type
        try:
            self.resource_type = resource_type
            return self.run(query)
        finally:
            self.resource_type = original_type

    def search_datasets(self, query: str) -> List[Dict[str, Any]]:
        """
        Search specifically for datasets.

        Args:
            query: The search query

        Returns:
            List of matching datasets
        """
        return self._run_with_resource_type(query, "dataset")

    def search_software(self, query: str) -> List[Dict[str, Any]]:
        """
        Search specifically for software.

        Args:
            query: The search query

        Returns:
            List of matching software records
        """
        return self._run_with_resource_type(query, "software")

Coverage for src/local_deep_research/web_search_engines/engines/search_engine_zenodo.py: 97%

170 statements