Coverage for src/local_deep_research/web_search_engines/engines/search_engine_nasa_ads.py

1"""NASA Astrophysics Data System (ADS) search engine implementation."""

3from typing import Any, Dict, List, Optional

5from langchain_core.language_models import BaseLLM

6from loguru import logger

8from ...constants import SNIPPET_LENGTH_LONG

9from ...advanced_search_system.filters.journal_reputation_filter import (

10 JournalReputationFilter,

11)

12from ...security.safe_requests import safe_get

13from ..rate_limiting import RateLimitError

14from ..search_engine_base import BaseSearchEngine

class NasaAdsSearchEngine(BaseSearchEngine):
    """NASA ADS search engine for physics, astronomy, and astrophysics papers.

    Queries the ``/search/query`` endpoint under ``self.api_base`` and turns
    each returned document into a preview dictionary. The abstract is part of
    the search response, so no second request is made for "full content".
    """

    # Mark as public search engine
    is_public = True
    # Scientific/astronomy/astrophysics search engine
    is_scientific = True

    def __init__(
        self,
        max_results: int = 25,
        api_key: Optional[str] = None,
        sort_by: str = "relevance",
        min_citations: int = 0,
        from_publication_date: Optional[str] = None,
        include_arxiv: bool = True,
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """
        Initialize the NASA ADS search engine.

        Args:
            max_results: Maximum number of search results
            api_key: NASA ADS API key; when omitted it is looked up in
                ``settings_snapshot`` (if one is given)
            sort_by: Sort order ('relevance', 'citation_count', 'date')
            min_citations: Minimum citation count filter
            from_publication_date: Filter papers from this date (YYYY-MM-DD)
            include_arxiv: Include ArXiv preprints in results
            llm: Language model for relevance filtering
            max_filtered_results: Maximum number of results to keep after filtering
            settings_snapshot: Settings snapshot for configuration
            **kwargs: Additional parameters to pass to parent class
        """
        # Initialize journal reputation filter if needed
        content_filters = []
        journal_filter = JournalReputationFilter.create_default(
            model=llm,
            engine_name="nasa_ads",
            settings_snapshot=settings_snapshot,
        )
        if journal_filter is not None:
            content_filters.append(journal_filter)

        # Initialize the BaseSearchEngine
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            content_filters=content_filters,
            settings_snapshot=settings_snapshot,
            **kwargs,
        )

        self.sort_by = sort_by
        self.min_citations = min_citations
        self.include_arxiv = include_arxiv

        # Settings may deliver the literal strings "False"/"false"/"" to mean
        # "not set"; normalise all of those to None.
        self.from_publication_date = (
            from_publication_date
            if from_publication_date
            and from_publication_date not in ["False", "false", ""]
            else None
        )

        # Get API key from settings if not provided
        if not api_key and settings_snapshot:
            from ...config.search_config import get_setting_from_snapshot

            try:
                api_key = get_setting_from_snapshot(
                    "search.engine.web.nasa_ads.api_key",
                    settings_snapshot=settings_snapshot,
                )
            except Exception as exc:
                # Best effort: a failed lookup leaves the engine without a
                # key, but the failure is recorded instead of being silently
                # dropped. Only the exception type is logged.
                logger.warning(
                    "Could not read NASA ADS API key from settings snapshot "
                    f"({type(exc).__name__})"
                )

        # Handle "False" string for api_key
        self.api_key = (
            api_key
            if api_key and api_key not in ["False", "false", ""]
            else None
        )

        # API configuration
        self.api_base = "https://api.adsabs.harvard.edu/v1"
        self.headers = {
            "User-Agent": "Local-Deep-Research-Agent",
            "Accept": "application/json",
        }

        if self.api_key:
            self.headers["Authorization"] = f"Bearer {self.api_key}"
            logger.info("Using NASA ADS with API key")
        else:
            logger.error(
                "NASA ADS requires an API key to function. Get a free key at: https://ui.adsabs.harvard.edu/user/settings/token"
            )

    def _build_query(self, query: str) -> str:
        """Return ``query`` with the configured ADS filter clauses appended.

        Clauses are added for the publication year (from
        ``self.from_publication_date``), the minimum citation count and the
        arXiv exclusion; the query is returned unchanged when none apply.
        """
        filters = []

        if self.from_publication_date:
            # Only the year component of YYYY-MM-DD is used for the filter.
            try:
                year = self.from_publication_date.split("-")[0]
                if year.isdigit():  # Only add if it's a valid year
                    filters.append(f"year:{year}-9999")
            except Exception:
                # Skip invalid date values (e.g. a non-string) rather than
                # failing the whole search.
                logger.debug(
                    "Ignoring unusable from_publication_date for NASA ADS"
                )

        if self.min_citations > 0:
            filters.append(f"citation_count:[{self.min_citations} TO *]")

        if not self.include_arxiv:
            filters.append('-bibstem:"arXiv"')

        if filters:
            return f"{query} {' '.join(filters)}"
        return query

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information for NASA ADS search results.

        Args:
            query: The search query (natural language supported)

        Returns:
            List of preview dictionaries; empty on any non-200 response
            other than 429 and on unexpected errors.

        Raises:
            RateLimitError: When the API answers with HTTP 429.
        """
        logger.info(f"Searching NASA ADS for: {query}")

        # Build request parameters
        params = {
            "q": self._build_query(query),
            "fl": "id,bibcode,title,author,year,pubdate,abstract,citation_count,bibstem,doi,identifier,pub,keyword,aff",
            # Requests are capped at 200 rows each
            "rows": min(self.max_results, 200),
            "start": 0,
        }

        # Add sorting; unknown sort keys fall back to relevance
        sort_map = {
            "relevance": "score desc",
            "citation_count": "citation_count desc",
            "date": "date desc",
        }
        params["sort"] = sort_map.get(self.sort_by, "score desc")

        try:
            # Apply rate limiting (simple like PubMed)
            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )
            logger.debug(
                f"Applied rate limit wait: {self._last_wait_time:.2f}s"
            )

            # Make the API request
            logger.info(
                f"Making NASA ADS API request with query: {params['q'][:100]}..."
            )
            response = safe_get(
                f"{self.api_base}/search/query",
                params=params,
                headers=self.headers,
                timeout=30,
            )

            # Log rate limit headers if available
            if "X-RateLimit-Remaining" in response.headers:
                remaining = response.headers.get("X-RateLimit-Remaining")
                limit = response.headers.get("X-RateLimit-Limit", "unknown")
                logger.debug(
                    f"NASA ADS rate limit: {remaining}/{limit} requests remaining"
                )

            if response.status_code == 200:
                data = response.json()
                docs = data.get("response", {}).get("docs", [])
                num_found = data.get("response", {}).get("numFound", 0)

                logger.info(
                    f"NASA ADS returned {len(docs)} results (total available: {num_found:,})"
                )

                # Format results as previews, dropping documents that
                # could not be formatted (they come back as None)
                previews = []
                for doc in docs:
                    preview = self._format_doc_preview(doc)
                    if preview:
                        previews.append(preview)

                logger.info(f"Successfully formatted {len(previews)} previews")
                return previews

            elif response.status_code == 429:
                # Rate limited
                logger.warning("NASA ADS rate limit reached")
                raise RateLimitError("NASA ADS rate limit exceeded")

            elif response.status_code == 401:
                logger.error("NASA ADS API key is invalid or missing")
                return []

            else:
                logger.error(
                    f"NASA ADS API error: {response.status_code} - {response.text[:200]}"
                )
                return []

        except RateLimitError:
            # Re-raise rate limit errors for base class retry handling
            raise
        except Exception:
            logger.exception("Error searching NASA ADS")
            return []

    def _format_doc_preview(
        self, doc: Dict[str, Any]
    ) -> Optional[Dict[str, Any]]:
        """
        Format a NASA ADS document as a preview dictionary.

        Args:
            doc: NASA ADS document object

        Returns:
            Formatted preview dictionary or None if formatting fails
        """
        try:
            # Extract basic information
            bibcode = doc.get("bibcode", "")

            # The title is read as a list and its first entry used; a bare
            # string is accepted as well (same tolerance as bibstem and doi
            # below) so it is not reduced to its first character.
            raw_title = doc.get("title", [])
            if isinstance(raw_title, list):
                title = raw_title[0] if raw_title else "No title"
            else:
                title = raw_title or "No title"

            # Get abstract or create snippet
            abstract = doc.get("abstract", "")
            snippet = (
                abstract[:SNIPPET_LENGTH_LONG]
                if abstract
                else f"Academic paper: {title}"
            )

            # Get publication info
            year = doc.get("year", "unknown")
            pubdate = doc.get("pubdate", "unknown")

            # Get journal/source: prefer the full publication name, then
            # the bibstem abbreviation
            pub = doc.get("pub")
            bibstem = doc.get("bibstem")
            if pub:
                journal = pub
            elif bibstem:
                journal = bibstem[0] if isinstance(bibstem, list) else bibstem
            else:
                journal = "unknown"

            # Get authors (first five, marked as truncated beyond that)
            authors = doc.get("author", [])
            authors_str = ", ".join(authors[:5])
            if len(authors) > 5:
                authors_str += " et al."

            # Get metrics
            citation_count = doc.get("citation_count", 0)

            # Get URL - prefer DOI, fallback to ADS URL
            dois = doc.get("doi")
            if dois:
                doi = dois[0] if isinstance(dois, list) else dois
                url = f"https://doi.org/{doi}"
            else:
                url = f"https://ui.adsabs.harvard.edu/abs/{bibcode}"

            # Check if it's ArXiv
            is_arxiv = "arXiv" in str(doc.get("bibstem", []))

            # Get keywords
            keywords = doc.get("keyword", [])

            return {
                "id": bibcode,
                "title": title,
                "link": url,
                "snippet": snippet,
                "authors": authors_str,
                "year": year,
                "date": pubdate,
                "journal": journal,
                "citations": citation_count,
                "abstract": abstract,
                "is_arxiv": is_arxiv,
                "keywords": keywords[:5] if keywords else [],
                "type": "academic_paper",
            }

        except Exception:
            # The handler itself must not raise when ``doc`` is not a dict.
            doc_id = (
                doc.get("bibcode", "unknown")
                if isinstance(doc, dict)
                else "unknown"
            )
            logger.exception(
                f"Error formatting NASA ADS document: {doc_id}"
            )
            return None

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for relevant items (NASA ADS provides most content in preview).

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content
        """
        # NASA ADS returns comprehensive data in the initial search,
        # so we don't need a separate full content fetch
        results = []
        for item in relevant_items:
            snippet = item.get("snippet", "")
            result = {
                "title": item.get("title", ""),
                "link": item.get("link", ""),
                "snippet": snippet,
                # Previews always carry an "abstract" key, possibly empty,
                # so fall back on falsiness rather than on key absence.
                "content": item.get("abstract") or snippet,
                "metadata": {
                    "authors": item.get("authors", ""),
                    "year": item.get("year", ""),
                    "journal": item.get("journal", ""),
                    "citations": item.get("citations", 0),
                    "is_arxiv": item.get("is_arxiv", False),
                    "keywords": item.get("keywords", []),
                },
            }
            results.append(result)

        return results

Coverage for src / local_deep_research / web_search_engines / engines / search_engine_nasa_ads.py: 88%

131 statements