Coverage for src/local_deep_research/web_search_engines/engines/search_engine_nasa_ads.py: 88%

131 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-02-25 01:07 +0000

1"""NASA Astrophysics Data System (ADS) search engine implementation.""" 

2 

3from typing import Any, Dict, List, Optional 

4 

5from langchain_core.language_models import BaseLLM 

6from loguru import logger 

7 

8from ...constants import SNIPPET_LENGTH_LONG 

9from ...advanced_search_system.filters.journal_reputation_filter import ( 

10 JournalReputationFilter, 

11) 

12from ...security.safe_requests import safe_get 

13from ..rate_limiting import RateLimitError 

14from ..search_engine_base import BaseSearchEngine 

15 

16 

class NasaAdsSearchEngine(BaseSearchEngine):
    """Search engine backed by the NASA Astrophysics Data System (ADS).

    Covers physics, astronomy, and astrophysics papers.
    """

    # Publicly reachable engine (queries a public web API).
    is_public = True
    # Flags this engine as a scientific-literature source.
    is_scientific = True

25 def __init__( 

26 self, 

27 max_results: int = 25, 

28 api_key: Optional[str] = None, 

29 sort_by: str = "relevance", 

30 min_citations: int = 0, 

31 from_publication_date: Optional[str] = None, 

32 include_arxiv: bool = True, 

33 llm: Optional[BaseLLM] = None, 

34 max_filtered_results: Optional[int] = None, 

35 settings_snapshot: Optional[Dict[str, Any]] = None, 

36 **kwargs, 

37 ): 

38 """ 

39 Initialize the NASA ADS search engine. 

40 

41 Args: 

42 max_results: Maximum number of search results 

43 api_key: NASA ADS API key (required for higher rate limits) 

44 sort_by: Sort order ('relevance', 'citation_count', 'date') 

45 min_citations: Minimum citation count filter 

46 from_publication_date: Filter papers from this date (YYYY-MM-DD) 

47 include_arxiv: Include ArXiv preprints in results 

48 llm: Language model for relevance filtering 

49 max_filtered_results: Maximum number of results to keep after filtering 

50 settings_snapshot: Settings snapshot for configuration 

51 **kwargs: Additional parameters to pass to parent class 

52 """ 

53 # Initialize journal reputation filter if needed 

54 content_filters = [] 

55 journal_filter = JournalReputationFilter.create_default( 

56 model=llm, 

57 engine_name="nasa_ads", 

58 settings_snapshot=settings_snapshot, 

59 ) 

60 if journal_filter is not None: 60 ↛ 61line 60 didn't jump to line 61 because the condition on line 60 was never true

61 content_filters.append(journal_filter) 

62 

63 # Initialize the BaseSearchEngine 

64 super().__init__( 

65 llm=llm, 

66 max_filtered_results=max_filtered_results, 

67 max_results=max_results, 

68 content_filters=content_filters, 

69 settings_snapshot=settings_snapshot, 

70 **kwargs, 

71 ) 

72 

73 self.sort_by = sort_by 

74 self.min_citations = min_citations 

75 self.include_arxiv = include_arxiv 

76 # Handle from_publication_date 

77 self.from_publication_date = ( 

78 from_publication_date 

79 if from_publication_date 

80 and from_publication_date not in ["False", "false", ""] 

81 else None 

82 ) 

83 

84 # Get API key from settings if not provided 

85 if not api_key and settings_snapshot: 85 ↛ 86line 85 didn't jump to line 86 because the condition on line 85 was never true

86 from ...config.search_config import get_setting_from_snapshot 

87 

88 try: 

89 api_key = get_setting_from_snapshot( 

90 "search.engine.web.nasa_ads.api_key", 

91 settings_snapshot=settings_snapshot, 

92 ) 

93 except Exception: 

94 pass 

95 

96 # Handle "False" string for api_key 

97 self.api_key = ( 

98 api_key 

99 if api_key and api_key not in ["False", "false", ""] 

100 else None 

101 ) 

102 

103 # API configuration 

104 self.api_base = "https://api.adsabs.harvard.edu/v1" 

105 self.headers = { 

106 "User-Agent": "Local-Deep-Research-Agent", 

107 "Accept": "application/json", 

108 } 

109 

110 if self.api_key: 

111 self.headers["Authorization"] = f"Bearer {self.api_key}" 

112 logger.info("Using NASA ADS with API key") 

113 else: 

114 logger.error( 

115 "NASA ADS requires an API key to function. Get a free key at: https://ui.adsabs.harvard.edu/user/settings/token" 

116 ) 

117 

118 def _get_previews(self, query: str) -> List[Dict[str, Any]]: 

119 """ 

120 Get preview information for NASA ADS search results. 

121 

122 Args: 

123 query: The search query (natural language supported) 

124 

125 Returns: 

126 List of preview dictionaries 

127 """ 

128 logger.info(f"Searching NASA ADS for: {query}") 

129 

130 # Build the search query - NASA ADS has good natural language support 

131 # We can use the query directly or enhance it slightly 

132 search_query = query 

133 

134 # Build filters 

135 filters = [] 

136 if self.from_publication_date: 

137 # Convert YYYY-MM-DD to ADS format 

138 try: 

139 year = self.from_publication_date.split("-")[0] 

140 if year.isdigit(): # Only add if it's a valid year 140 ↛ 145line 140 didn't jump to line 145 because the condition on line 140 was always true

141 filters.append(f"year:{year}-9999") 

142 except Exception: 

143 pass # Skip invalid date formats 

144 

145 if self.min_citations > 0: 

146 filters.append(f"citation_count:[{self.min_citations} TO *]") 

147 

148 if not self.include_arxiv: 

149 filters.append('-bibstem:"arXiv"') 

150 

151 # Combine query with filters 

152 if filters: 

153 full_query = f"{search_query} {' '.join(filters)}" 

154 else: 

155 full_query = search_query 

156 

157 # Build request parameters 

158 params = { 

159 "q": full_query, 

160 "fl": "id,bibcode,title,author,year,pubdate,abstract,citation_count,bibstem,doi,identifier,pub,keyword,aff", 

161 "rows": min( 

162 self.max_results, 200 

163 ), # NASA ADS allows up to 200 per request 

164 "start": 0, 

165 } 

166 

167 # Add sorting 

168 sort_map = { 

169 "relevance": "score desc", 

170 "citation_count": "citation_count desc", 

171 "date": "date desc", 

172 } 

173 params["sort"] = sort_map.get(self.sort_by, "score desc") 

174 

175 try: 

176 # Apply rate limiting (simple like PubMed) 

177 self._last_wait_time = self.rate_tracker.apply_rate_limit( 

178 self.engine_type 

179 ) 

180 logger.debug( 

181 f"Applied rate limit wait: {self._last_wait_time:.2f}s" 

182 ) 

183 

184 # Make the API request 

185 logger.info( 

186 f"Making NASA ADS API request with query: {params['q'][:100]}..." 

187 ) 

188 response = safe_get( 

189 f"{self.api_base}/search/query", 

190 params=params, 

191 headers=self.headers, 

192 timeout=30, 

193 ) 

194 

195 # Log rate limit headers if available 

196 if "X-RateLimit-Remaining" in response.headers: 196 ↛ 197line 196 didn't jump to line 197 because the condition on line 196 was never true

197 remaining = response.headers.get("X-RateLimit-Remaining") 

198 limit = response.headers.get("X-RateLimit-Limit", "unknown") 

199 logger.debug( 

200 f"NASA ADS rate limit: {remaining}/{limit} requests remaining" 

201 ) 

202 

203 if response.status_code == 200: 

204 data = response.json() 

205 docs = data.get("response", {}).get("docs", []) 

206 num_found = data.get("response", {}).get("numFound", 0) 

207 

208 logger.info( 

209 f"NASA ADS returned {len(docs)} results (total available: {num_found:,})" 

210 ) 

211 

212 # Format results as previews 

213 previews = [] 

214 for doc in docs: 

215 preview = self._format_doc_preview(doc) 

216 if preview: 216 ↛ 214line 216 didn't jump to line 214 because the condition on line 216 was always true

217 previews.append(preview) 

218 

219 logger.info(f"Successfully formatted {len(previews)} previews") 

220 return previews 

221 

222 elif response.status_code == 429: 

223 # Rate limited 

224 logger.warning("NASA ADS rate limit reached") 

225 raise RateLimitError("NASA ADS rate limit exceeded") 

226 

227 elif response.status_code == 401: 

228 logger.error("NASA ADS API key is invalid or missing") 

229 return [] 

230 

231 else: 

232 logger.error( 

233 f"NASA ADS API error: {response.status_code} - {response.text[:200]}" 

234 ) 

235 return [] 

236 

237 except RateLimitError: 

238 # Re-raise rate limit errors for base class retry handling 

239 raise 

240 except Exception: 

241 logger.exception("Error searching NASA ADS") 

242 return [] 

243 

244 def _format_doc_preview( 

245 self, doc: Dict[str, Any] 

246 ) -> Optional[Dict[str, Any]]: 

247 """ 

248 Format a NASA ADS document as a preview dictionary. 

249 

250 Args: 

251 doc: NASA ADS document object 

252 

253 Returns: 

254 Formatted preview dictionary or None if formatting fails 

255 """ 

256 try: 

257 # Extract basic information 

258 bibcode = doc.get("bibcode", "") 

259 # Get title from list if available 

260 title_list = doc.get("title", []) 

261 title = title_list[0] if title_list else "No title" 

262 

263 # Get abstract or create snippet 

264 abstract = doc.get("abstract", "") 

265 snippet = ( 

266 abstract[:SNIPPET_LENGTH_LONG] 

267 if abstract 

268 else f"Academic paper: {title}" 

269 ) 

270 

271 # Get publication info 

272 year = doc.get("year", "unknown") 

273 pubdate = doc.get("pubdate", "unknown") 

274 

275 # Get journal/source 

276 journal = "unknown" 

277 if doc.get("pub"): 

278 journal = doc.get("pub") 

279 elif doc.get("bibstem"): 

280 bibstem = doc.get("bibstem", []) 

281 if bibstem: 281 ↛ 287line 281 didn't jump to line 287 because the condition on line 281 was always true

282 journal = ( 

283 bibstem[0] if isinstance(bibstem, list) else bibstem 

284 ) 

285 

286 # Get authors 

287 authors = doc.get("author", []) 

288 authors_str = ", ".join(authors[:5]) 

289 if len(authors) > 5: 

290 authors_str += " et al." 

291 

292 # Get metrics 

293 citation_count = doc.get("citation_count", 0) 

294 

295 # Get URL - prefer DOI, fallback to ADS URL 

296 url = None 

297 if doc.get("doi"): 

298 dois = doc.get("doi", []) 

299 if dois: 299 ↛ 303line 299 didn't jump to line 303 because the condition on line 299 was always true

300 doi = dois[0] if isinstance(dois, list) else dois 

301 url = f"https://doi.org/{doi}" 

302 

303 if not url: 

304 url = f"https://ui.adsabs.harvard.edu/abs/{bibcode}" 

305 

306 # Check if it's ArXiv 

307 is_arxiv = "arXiv" in str(doc.get("bibstem", [])) 

308 

309 # Get keywords 

310 keywords = doc.get("keyword", []) 

311 

312 preview = { 

313 "id": bibcode, 

314 "title": title, 

315 "link": url, 

316 "snippet": snippet, 

317 "authors": authors_str, 

318 "year": year, 

319 "date": pubdate, 

320 "journal": journal, 

321 "citations": citation_count, 

322 "abstract": abstract, 

323 "is_arxiv": is_arxiv, 

324 "keywords": keywords[:5] if keywords else [], 

325 "type": "academic_paper", 

326 } 

327 

328 return preview 

329 

330 except Exception: 

331 logger.exception( 

332 f"Error formatting NASA ADS document: {doc.get('bibcode', 'unknown')}" 

333 ) 

334 return None 

335 

336 def _get_full_content( 

337 self, relevant_items: List[Dict[str, Any]] 

338 ) -> List[Dict[str, Any]]: 

339 """ 

340 Get full content for relevant items (NASA ADS provides most content in preview). 

341 

342 Args: 

343 relevant_items: List of relevant preview dictionaries 

344 

345 Returns: 

346 List of result dictionaries with full content 

347 """ 

348 # NASA ADS returns comprehensive data in the initial search, 

349 # so we don't need a separate full content fetch 

350 results = [] 

351 for item in relevant_items: 

352 result = { 

353 "title": item.get("title", ""), 

354 "link": item.get("link", ""), 

355 "snippet": item.get("snippet", ""), 

356 "content": item.get("abstract", item.get("snippet", "")), 

357 "metadata": { 

358 "authors": item.get("authors", ""), 

359 "year": item.get("year", ""), 

360 "journal": item.get("journal", ""), 

361 "citations": item.get("citations", 0), 

362 "is_arxiv": item.get("is_arxiv", False), 

363 "keywords": item.get("keywords", []), 

364 }, 

365 } 

366 results.append(result) 

367 

368 return results