Coverage for src / local_deep_research / web_search_engines / engines / search_engine_nasa_ads.py: 89%

132 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1"""NASA Astrophysics Data System (ADS) search engine implementation.""" 

2 

3from typing import Any, Dict, List, Optional 

4 

5from langchain_core.language_models import BaseLLM 

6from loguru import logger 

7 

8from ...constants import SNIPPET_LENGTH_LONG 

9from ...advanced_search_system.filters.journal_reputation_filter import ( 

10 JournalReputationFilter, 

11) 

12from ...security.safe_requests import safe_get 

13from ..rate_limiting import RateLimitError 

14from ..search_engine_base import BaseSearchEngine 

15 

16 

class NasaAdsSearchEngine(BaseSearchEngine):
    """NASA ADS search engine for physics, astronomy, and astrophysics papers."""

    # Mark as public search engine (freely accessible; only an API key is needed)
    is_public = True
    # Scientific/astronomy/astrophysics search engine
    is_scientific = True
    # ADS queries are keyword/lexical rather than semantic
    is_lexical = True
    # Ask the base class to run LLM relevance filtering over raw results
    needs_llm_relevance_filter = True

    def __init__(
        self,
        max_results: int = 25,
        api_key: Optional[str] = None,
        sort_by: str = "relevance",
        min_citations: int = 0,
        from_publication_date: Optional[str] = None,
        include_arxiv: bool = True,
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """
        Initialize the NASA ADS search engine.

        Args:
            max_results: Maximum number of search results
            api_key: NASA ADS API key (required for higher rate limits)
            sort_by: Sort order ('relevance', 'citation_count', 'date')
            min_citations: Minimum citation count filter
            from_publication_date: Filter papers from this date (YYYY-MM-DD)
            include_arxiv: Include ArXiv preprints in results
            llm: Language model for relevance filtering
            max_filtered_results: Maximum number of results to keep after filtering
            settings_snapshot: Settings snapshot for configuration
            **kwargs: Additional parameters to pass to parent class
        """
        # Initialize journal reputation filter if needed; the factory returns
        # None when the filter is disabled or unconfigured, in which case no
        # content filter is attached.
        content_filters = []
        journal_filter = JournalReputationFilter.create_default(
            model=llm,  # type: ignore[arg-type]
            engine_name="nasa_ads",
            settings_snapshot=settings_snapshot,
        )
        if journal_filter is not None:
            content_filters.append(journal_filter)

        # Initialize the BaseSearchEngine
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            content_filters=content_filters,  # type: ignore[arg-type]
            settings_snapshot=settings_snapshot,
            **kwargs,
        )

        self.sort_by = sort_by
        self.min_citations = min_citations
        self.include_arxiv = include_arxiv
        # Handle from_publication_date: settings UIs may pass "False"/"false"/""
        # for an unset value — treat those sentinels as "no date filter".
        self.from_publication_date = (
            from_publication_date
            if from_publication_date
            and from_publication_date not in ["False", "false", ""]
            else None
        )

        # Get API key from settings if not provided
        if not api_key and settings_snapshot:
            from ...config.search_config import get_setting_from_snapshot

            try:
                api_key = get_setting_from_snapshot(
                    "search.engine.web.nasa_ads.api_key",
                    settings_snapshot=settings_snapshot,
                )
            except Exception:
                # Best-effort lookup: a missing/unreadable setting simply
                # means we proceed without a key.
                logger.debug(
                    "Failed to read nasa_ads.api_key from settings snapshot",
                    exc_info=True,
                )

        # Handle "False" string for api_key (same sentinel convention as the
        # date filter above)
        self.api_key = (
            api_key
            if api_key and api_key not in ["False", "false", ""]
            else None
        )

        # API configuration
        self.api_base = "https://api.adsabs.harvard.edu/v1"
        self.headers = {
            "User-Agent": "Local-Deep-Research-Agent",
            "Accept": "application/json",
        }

        if self.api_key:
            self.headers["Authorization"] = f"Bearer {self.api_key}"
            logger.info("Using NASA ADS with API key")
        else:
            # Without a key every request will be rejected by ADS; log loudly
            # but leave the engine constructed so searches fail gracefully.
            logger.error(
                "NASA ADS requires an API key to function. Get a free key at: https://ui.adsabs.harvard.edu/user/settings/token"
            )

122 

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information for NASA ADS search results.

        Args:
            query: The search query (natural language supported)

        Returns:
            List of preview dictionaries (empty on API errors)

        Raises:
            RateLimitError: When ADS responds with HTTP 429, so the base
                class retry logic can back off and retry.
        """
        logger.info(f"Searching NASA ADS for: {query}")

        # Build the search query - NASA ADS has good natural language support
        # We can use the query directly or enhance it slightly
        search_query = query

        # Build filters (appended to the query string in ADS query syntax)
        filters = []
        if self.from_publication_date:
            # Convert YYYY-MM-DD to ADS format: only the year is used,
            # as an open-ended range "year:YYYY-9999"
            try:
                year = self.from_publication_date.split("-")[0]
                if year.isdigit():  # Only add if it's a valid year
                    filters.append(f"year:{year}-9999")
            except Exception:
                logger.debug(
                    "best-effort date parsing, invalid formats skipped",
                    exc_info=True,
                )

        if self.min_citations > 0:
            filters.append(f"citation_count:[{self.min_citations} TO *]")

        if not self.include_arxiv:
            # Negated bibstem clause excludes arXiv preprints
            filters.append('-bibstem:"arXiv"')

        # Combine query with filters
        if filters:
            full_query = f"{search_query} {' '.join(filters)}"
        else:
            full_query = search_query

        # Build request parameters; "fl" lists the fields to return so
        # previews contain everything _format_doc_preview needs
        params = {
            "q": full_query,
            "fl": "id,bibcode,title,author,year,pubdate,abstract,citation_count,bibstem,doi,identifier,pub,keyword,aff",
            "rows": min(
                self.max_results, 200
            ),  # NASA ADS allows up to 200 per request
            "start": 0,
        }

        # Add sorting (unknown sort_by values fall back to relevance)
        sort_map = {
            "relevance": "score desc",
            "citation_count": "citation_count desc",
            "date": "date desc",
        }
        params["sort"] = sort_map.get(self.sort_by, "score desc")

        try:
            # Apply rate limiting (simple like PubMed) before the request
            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )
            logger.debug(
                f"Applied rate limit wait: {self._last_wait_time:.2f}s"
            )

            # Make the API request
            logger.info(
                f"Making NASA ADS API request with query: {str(params['q'])[:100]}..."
            )
            response = safe_get(
                f"{self.api_base}/search/query",
                params=params,
                headers=self.headers,
                timeout=30,
            )

            # Log rate limit headers if available
            if "X-RateLimit-Remaining" in response.headers:
                remaining = response.headers.get("X-RateLimit-Remaining")
                limit = response.headers.get("X-RateLimit-Limit", "unknown")
                logger.debug(
                    f"NASA ADS rate limit: {remaining}/{limit} requests remaining"
                )

            if response.status_code == 200:
                data = response.json()
                docs = data.get("response", {}).get("docs", [])
                num_found = data.get("response", {}).get("numFound", 0)

                logger.info(
                    f"NASA ADS returned {len(docs)} results (total available: {num_found:,})"
                )

                # Format results as previews; docs that fail formatting are
                # skipped (_format_doc_preview returns None for them)
                previews = []
                for doc in docs:
                    preview = self._format_doc_preview(doc)
                    if preview:
                        previews.append(preview)

                logger.info(f"Successfully formatted {len(previews)} previews")
                return previews

            if response.status_code == 429:
                # Rate limited
                logger.warning("NASA ADS rate limit reached")
                raise RateLimitError("NASA ADS rate limit exceeded")  # noqa: TRY301 — re-raised by except RateLimitError for base class retry

            if response.status_code == 401:
                logger.error("NASA ADS API key is invalid or missing")
                return []

            logger.error(
                f"NASA ADS API error: {response.status_code} - {response.text[:200]}"
            )
            return []

        except RateLimitError:
            # Re-raise rate limit errors for base class retry handling
            raise
        except Exception:
            logger.exception("Error searching NASA ADS")
            return []

250 

251 def _format_doc_preview( 

252 self, doc: Dict[str, Any] 

253 ) -> Optional[Dict[str, Any]]: 

254 """ 

255 Format a NASA ADS document as a preview dictionary. 

256 

257 Args: 

258 doc: NASA ADS document object 

259 

260 Returns: 

261 Formatted preview dictionary or None if formatting fails 

262 """ 

263 try: 

264 # Extract basic information 

265 bibcode = doc.get("bibcode", "") 

266 # Get title from list if available 

267 title_list = doc.get("title", []) 

268 title = title_list[0] if title_list else "No title" 

269 

270 # Get abstract or create snippet 

271 abstract = doc.get("abstract", "") 

272 snippet = ( 

273 abstract[:SNIPPET_LENGTH_LONG] 

274 if abstract 

275 else f"Academic paper: {title}" 

276 ) 

277 

278 # Get publication info 

279 year = doc.get("year", "unknown") 

280 pubdate = doc.get("pubdate", "unknown") 

281 

282 # Get journal/source 

283 journal = "unknown" 

284 if doc.get("pub"): 

285 journal = str(doc.get("pub")) 

286 elif doc.get("bibstem"): 

287 bibstem = doc.get("bibstem", []) 

288 if bibstem: 288 ↛ 294line 288 didn't jump to line 294 because the condition on line 288 was always true

289 journal = ( 

290 bibstem[0] if isinstance(bibstem, list) else bibstem 

291 ) 

292 

293 # Get authors 

294 authors = doc.get("author", []) 

295 authors_str = ", ".join(authors[:5]) 

296 if len(authors) > 5: 

297 authors_str += " et al." 

298 

299 # Get metrics 

300 citation_count = doc.get("citation_count", 0) 

301 

302 # Get URL - prefer DOI, fallback to ADS URL 

303 url = None 

304 if doc.get("doi"): 

305 dois = doc.get("doi", []) 

306 if dois: 306 ↛ 310line 306 didn't jump to line 310 because the condition on line 306 was always true

307 doi = dois[0] if isinstance(dois, list) else dois 

308 url = f"https://doi.org/{doi}" 

309 

310 if not url: 

311 url = f"https://ui.adsabs.harvard.edu/abs/{bibcode}" 

312 

313 # Check if it's ArXiv 

314 is_arxiv = "arXiv" in str(doc.get("bibstem", [])) 

315 

316 # Get keywords 

317 keywords = doc.get("keyword", []) 

318 

319 return { 

320 "id": bibcode, 

321 "title": title, 

322 "link": url, 

323 "snippet": snippet, 

324 "authors": authors_str, 

325 "year": year, 

326 "date": pubdate, 

327 "journal": journal, 

328 "citations": citation_count, 

329 "abstract": abstract, 

330 "is_arxiv": is_arxiv, 

331 "keywords": keywords[:5] if keywords else [], 

332 "type": "academic_paper", 

333 } 

334 

335 except Exception: 

336 logger.exception( 

337 f"Error formatting NASA ADS document: {doc.get('bibcode', 'unknown')}" 

338 ) 

339 return None 

340 

341 def _get_full_content( 

342 self, relevant_items: List[Dict[str, Any]] 

343 ) -> List[Dict[str, Any]]: 

344 """ 

345 Get full content for relevant items (NASA ADS provides most content in preview). 

346 

347 Args: 

348 relevant_items: List of relevant preview dictionaries 

349 

350 Returns: 

351 List of result dictionaries with full content 

352 """ 

353 # NASA ADS returns comprehensive data in the initial search, 

354 # so we don't need a separate full content fetch 

355 results = [] 

356 for item in relevant_items: 

357 result = { 

358 "title": item.get("title", ""), 

359 "link": item.get("link", ""), 

360 "snippet": item.get("snippet", ""), 

361 "content": item.get("abstract", item.get("snippet", "")), 

362 "metadata": { 

363 "authors": item.get("authors", ""), 

364 "year": item.get("year", ""), 

365 "journal": item.get("journal", ""), 

366 "citations": item.get("citations", 0), 

367 "is_arxiv": item.get("is_arxiv", False), 

368 "keywords": item.get("keywords", []), 

369 }, 

370 } 

371 results.append(result) 

372 

373 return results