Coverage for src/local_deep_research/web_search_engines/engines/search_engine_nasa_ads.py: 84%

130 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

1"""NASA Astrophysics Data System (ADS) search engine implementation.""" 

2 

3from typing import Any, Dict, List, Optional 

4 

5from langchain_core.language_models import BaseLLM 

6from loguru import logger 

7 

8from ...advanced_search_system.filters.journal_reputation_filter import ( 

9 JournalReputationFilter, 

10) 

11from ...security.safe_requests import safe_get 

12from ..rate_limiting import RateLimitError 

13from ..search_engine_base import BaseSearchEngine 

14 

15 

class NasaAdsSearchEngine(BaseSearchEngine):
    """NASA ADS search engine for physics, astronomy, and astrophysics papers.

    Queries the NASA Astrophysics Data System ``search/query`` API.  An API
    key is mandatory (unauthenticated requests are rejected with HTTP 401);
    it can be passed directly or resolved from the settings snapshot.
    """

    # Mark as public search engine
    is_public = True
    # Scientific/astronomy/astrophysics search engine
    is_scientific = True

    def __init__(
        self,
        max_results: int = 25,
        api_key: Optional[str] = None,
        sort_by: str = "relevance",
        min_citations: int = 0,
        from_publication_date: Optional[str] = None,
        include_arxiv: bool = True,
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """
        Initialize the NASA ADS search engine.

        Args:
            max_results: Maximum number of search results
            api_key: NASA ADS API key (required for higher rate limits)
            sort_by: Sort order ('relevance', 'citation_count', 'date')
            min_citations: Minimum citation count filter
            from_publication_date: Filter papers from this date (YYYY-MM-DD)
            include_arxiv: Include ArXiv preprints in results
            llm: Language model for relevance filtering
            max_filtered_results: Maximum number of results to keep after filtering
            settings_snapshot: Settings snapshot for configuration
            **kwargs: Additional parameters to pass to parent class
        """
        # Initialize journal reputation filter if needed.  create_default
        # returns None when the filter is not applicable/configured.
        content_filters = []
        journal_filter = JournalReputationFilter.create_default(
            model=llm,
            engine_name="nasa_ads",
            settings_snapshot=settings_snapshot,
        )
        if journal_filter is not None:
            content_filters.append(journal_filter)

        # Initialize the BaseSearchEngine
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            content_filters=content_filters,
            settings_snapshot=settings_snapshot,
            **kwargs,
        )

        self.sort_by = sort_by
        self.min_citations = min_citations
        self.include_arxiv = include_arxiv
        # The settings UI may persist unset values as "False"/"false"/"";
        # normalize those sentinels to None.
        self.from_publication_date = (
            from_publication_date
            if from_publication_date
            and from_publication_date not in ["False", "false", ""]
            else None
        )

        # Get API key from settings if not provided
        if not api_key and settings_snapshot:
            from ...config.search_config import get_setting_from_snapshot

            try:
                api_key = get_setting_from_snapshot(
                    "search.engine.web.nasa_ads.api_key",
                    settings_snapshot=settings_snapshot,
                )
            except Exception:
                # Best-effort lookup: a missing setting just means "no key",
                # but leave a trace instead of swallowing silently.
                logger.debug(
                    "NASA ADS API key not found in settings snapshot"
                )

        # Handle "False" string for api_key (same sentinel normalization)
        self.api_key = (
            api_key
            if api_key and api_key not in ["False", "false", ""]
            else None
        )

        # API configuration
        self.api_base = "https://api.adsabs.harvard.edu/v1"
        self.headers = {
            "User-Agent": "Local-Deep-Research-Agent",
            "Accept": "application/json",
        }

        if self.api_key:
            self.headers["Authorization"] = f"Bearer {self.api_key}"
            logger.info("Using NASA ADS with API key")
        else:
            logger.error(
                "NASA ADS requires an API key to function. Get a free key at: https://ui.adsabs.harvard.edu/user/settings/token"
            )

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information for NASA ADS search results.

        Args:
            query: The search query (natural language supported)

        Returns:
            List of preview dictionaries (empty on error or missing API key)

        Raises:
            RateLimitError: When the API responds with HTTP 429, so the base
                class retry machinery can back off and retry.
        """
        logger.info(f"Searching NASA ADS for: {query}")

        # Without a key every request fails with 401; skip the network call
        # entirely (same end result as the 401 branch below: empty list).
        if not self.api_key:
            logger.error(
                "NASA ADS API key is missing; skipping search request"
            )
            return []

        # Build the search query - NASA ADS has good natural language support
        # We can use the query directly or enhance it slightly
        search_query = query

        # Build filters
        filters = []
        if self.from_publication_date:
            # Convert YYYY-MM-DD to ADS format (open-ended year range)
            try:
                year = self.from_publication_date.split("-")[0]
                if year.isdigit():  # Only add if it's a valid year
                    filters.append(f"year:{year}-9999")
            except Exception:
                pass  # Skip invalid date formats

        if self.min_citations > 0:
            filters.append(f"citation_count:[{self.min_citations} TO *]")

        if not self.include_arxiv:
            filters.append('-bibstem:"arXiv"')

        # Combine query with filters
        if filters:
            full_query = f"{search_query} {' '.join(filters)}"
        else:
            full_query = search_query

        # Build request parameters
        params = {
            "q": full_query,
            "fl": "id,bibcode,title,author,year,pubdate,abstract,citation_count,bibstem,doi,identifier,pub,keyword,aff",
            "rows": min(
                self.max_results, 200
            ),  # NASA ADS allows up to 200 per request
            "start": 0,
        }

        # Add sorting (fall back to relevance for unknown sort_by values)
        sort_map = {
            "relevance": "score desc",
            "citation_count": "citation_count desc",
            "date": "date desc",
        }
        params["sort"] = sort_map.get(self.sort_by, "score desc")

        try:
            # Apply rate limiting (simple like PubMed)
            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )
            logger.debug(
                f"Applied rate limit wait: {self._last_wait_time:.2f}s"
            )

            # Make the API request
            logger.info(
                f"Making NASA ADS API request with query: {params['q'][:100]}..."
            )
            response = safe_get(
                f"{self.api_base}/search/query",
                params=params,
                headers=self.headers,
                timeout=30,
            )

            # Log rate limit headers if available
            if "X-RateLimit-Remaining" in response.headers:
                remaining = response.headers.get("X-RateLimit-Remaining")
                limit = response.headers.get("X-RateLimit-Limit", "unknown")
                logger.debug(
                    f"NASA ADS rate limit: {remaining}/{limit} requests remaining"
                )

            if response.status_code == 200:
                data = response.json()
                docs = data.get("response", {}).get("docs", [])
                num_found = data.get("response", {}).get("numFound", 0)

                logger.info(
                    f"NASA ADS returned {len(docs)} results (total available: {num_found:,})"
                )

                # Format results as previews
                previews = []
                for doc in docs:
                    preview = self._format_doc_preview(doc)
                    if preview:
                        previews.append(preview)

                logger.info(f"Successfully formatted {len(previews)} previews")
                return previews

            elif response.status_code == 429:
                # Rate limited
                logger.warning("NASA ADS rate limit reached")
                raise RateLimitError("NASA ADS rate limit exceeded")

            elif response.status_code == 401:
                logger.error("NASA ADS API key is invalid or missing")
                return []

            else:
                logger.error(
                    f"NASA ADS API error: {response.status_code} - {response.text[:200]}"
                )
                return []

        except RateLimitError:
            # Re-raise rate limit errors for base class retry handling
            raise
        except Exception:
            logger.exception("Error searching NASA ADS")
            return []

    def _format_doc_preview(
        self, doc: Dict[str, Any]
    ) -> Optional[Dict[str, Any]]:
        """
        Format a NASA ADS document as a preview dictionary.

        Args:
            doc: NASA ADS document object

        Returns:
            Formatted preview dictionary or None if formatting fails
        """
        try:
            # Extract basic information
            bibcode = doc.get("bibcode", "")
            # ADS returns title as a list; take the first entry
            title_list = doc.get("title", [])
            title = title_list[0] if title_list else "No title"

            # Get abstract or create snippet
            abstract = doc.get("abstract", "")
            snippet = abstract[:500] if abstract else f"Academic paper: {title}"

            # Get publication info
            year = doc.get("year", "unknown")
            pubdate = doc.get("pubdate", "unknown")

            # Get journal/source: prefer full publication name, fall back to
            # the abbreviated bibstem (which may be a list or a string)
            journal = "unknown"
            if doc.get("pub"):
                journal = doc.get("pub")
            elif doc.get("bibstem"):
                bibstem = doc.get("bibstem", [])
                if bibstem:
                    journal = (
                        bibstem[0] if isinstance(bibstem, list) else bibstem
                    )

            # Get authors (first five, with "et al." marker beyond that)
            authors = doc.get("author", [])
            authors_str = ", ".join(authors[:5])
            if len(authors) > 5:
                authors_str += " et al."

            # Get metrics
            citation_count = doc.get("citation_count", 0)

            # Get URL - prefer DOI, fallback to ADS URL
            url = None
            if doc.get("doi"):
                dois = doc.get("doi", [])
                if dois:
                    doi = dois[0] if isinstance(dois, list) else dois
                    url = f"https://doi.org/{doi}"

            if not url:
                url = f"https://ui.adsabs.harvard.edu/abs/{bibcode}"

            # Check if it's ArXiv
            is_arxiv = "arXiv" in str(doc.get("bibstem", []))

            # Get keywords
            keywords = doc.get("keyword", [])

            preview = {
                "id": bibcode,
                "title": title,
                "link": url,
                "snippet": snippet,
                "authors": authors_str,
                "year": year,
                "date": pubdate,
                "journal": journal,
                "citations": citation_count,
                "abstract": abstract,
                "is_arxiv": is_arxiv,
                "keywords": keywords[:5] if keywords else [],
                "type": "academic_paper",
            }

            return preview

        except Exception:
            logger.exception(
                f"Error formatting NASA ADS document: {doc.get('bibcode', 'unknown')}"
            )
            return None

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for relevant items (NASA ADS provides most content in preview).

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content
        """
        # NASA ADS returns comprehensive data in the initial search,
        # so we don't need a separate full content fetch
        results = []
        for item in relevant_items:
            result = {
                "title": item.get("title", ""),
                "link": item.get("link", ""),
                "snippet": item.get("snippet", ""),
                "content": item.get("abstract", item.get("snippet", "")),
                "metadata": {
                    "authors": item.get("authors", ""),
                    "year": item.get("year", ""),
                    "journal": item.get("journal", ""),
                    "citations": item.get("citations", 0),
                    "is_arxiv": item.get("is_arxiv", False),
                    "keywords": item.get("keywords", []),
                },
            }
            results.append(result)

        return results