Coverage for src/local_deep_research/web_search_engines/engines/search_engine_nasa_ads.py: 89%

146 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-03 23:15 +0000

1"""NASA Astrophysics Data System (ADS) search engine implementation.""" 

2 

3from typing import Any, Dict, List, Optional 

4 

5from langchain_core.language_models import BaseLLM 

6from loguru import logger 

7 

8from ...constants import SNIPPET_LENGTH_LONG, USER_AGENT 

9from ...advanced_search_system.filters.journal_reputation_filter import ( 

10 JournalReputationFilter, 

11) 

12from ...security.safe_requests import safe_get 

13from ..rate_limiting import RateLimitError 

14from ..search_engine_base import BaseSearchEngine 

15 

16 

class NasaAdsSearchEngine(BaseSearchEngine):
    """NASA ADS search engine for physics, astronomy, and astrophysics papers.

    Sends the query to the ADS ``/search/query`` endpoint and converts each
    returned document into a preview dictionary. A journal-reputation filter
    is handed to the base class as a preview filter when one can be created.
    """

    # Mark as public search engine
    is_public = True
    # Scientific/astronomy/astrophysics search engine
    is_scientific = True
    is_lexical = True
    needs_llm_relevance_filter = True

    # Fields requested from ADS for every document (the "fl" parameter).
    _FIELDS = (
        "id,bibcode,title,author,year,pubdate,abstract,citation_count,"
        "bibstem,doi,identifier,pub,keyword,aff"
    )
    # Upper bound this engine applies to the "rows" request parameter.
    _MAX_ROWS = 200
    # Maps the public ``sort_by`` option to the ADS "sort" parameter value.
    _SORT_MAP = {
        "relevance": "score desc",
        "citation_count": "citation_count desc",
        "date": "date desc",
    }

    def __init__(
        self,
        max_results: int = 25,
        api_key: Optional[str] = None,
        sort_by: str = "relevance",
        min_citations: int = 0,
        from_publication_date: Optional[str] = None,
        include_arxiv: bool = True,
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """
        Initialize the NASA ADS search engine.

        Args:
            max_results: Maximum number of search results
            api_key: NASA ADS API key; when not given it is looked up in
                ``settings_snapshot`` under
                ``search.engine.web.nasa_ads.api_key``
            sort_by: Sort order ('relevance', 'citation_count', 'date');
                any other value falls back to relevance
            min_citations: Minimum citation count filter
            from_publication_date: Filter papers from this date (YYYY-MM-DD);
                only the year part is used
            include_arxiv: Include ArXiv preprints in results
            llm: Language model for relevance filtering
            max_filtered_results: Maximum number of results to keep after filtering
            settings_snapshot: Settings snapshot for configuration
            **kwargs: Additional parameters to pass to parent class
        """
        # Journal filter runs before LLM relevance (Tiers 1-3 are instant)
        preview_filters = []
        journal_filter = JournalReputationFilter.create_default(
            model=llm,  # type: ignore[arg-type]
            engine_name="nasa_ads",
            settings_snapshot=settings_snapshot,
        )
        if journal_filter is not None:
            preview_filters.append(journal_filter)

        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            preview_filters=preview_filters,  # type: ignore[arg-type]
            settings_snapshot=settings_snapshot,
            **kwargs,
        )

        self.sort_by = sort_by
        self.min_citations = min_citations
        self.include_arxiv = include_arxiv
        # Settings may deliver the literal string "False" for an unset value.
        self.from_publication_date = self._clean_setting(from_publication_date)

        # Get API key from settings if not provided
        if not api_key and settings_snapshot:
            from ...config.search_config import get_setting_from_snapshot

            try:
                api_key = get_setting_from_snapshot(
                    "search.engine.web.nasa_ads.api_key",
                    settings_snapshot=settings_snapshot,
                )
            except Exception:
                # Best effort: a missing setting must not break construction.
                # loguru ignores ``exc_info=True``; ``opt(exception=True)``
                # is what actually attaches the traceback.
                logger.opt(exception=True).debug(
                    "Failed to read nasa_ads.api_key from settings snapshot"
                )

        # Handle "False" string for api_key
        self.api_key = self._clean_setting(api_key)

        # API configuration
        self.api_base = "https://api.adsabs.harvard.edu/v1"
        self.headers = {
            "User-Agent": USER_AGENT,
            "Accept": "application/json",
        }

        if self.api_key:
            self.headers["Authorization"] = f"Bearer {self.api_key}"
            logger.info("Using NASA ADS with API key")
        else:
            logger.error(
                "NASA ADS requires an API key to function. Get a free key at: https://ui.adsabs.harvard.edu/user/settings/token"
            )

    @staticmethod
    def _clean_setting(value: Optional[str]) -> Optional[str]:
        """Return ``value``, or None when it is empty or a "False" placeholder."""
        if value and value not in ("False", "false", ""):
            return value
        return None

    @staticmethod
    def _as_list(value: Any) -> List[Any]:
        """Normalize an ADS field to a list.

        Falsy values (missing key, ``None``, ``""``, ``[]``) become ``[]``,
        lists/tuples are copied, and any other scalar is wrapped in a
        one-element list so it is never indexed or sliced character-wise.
        """
        if not value:
            return []
        if isinstance(value, (list, tuple)):
            return list(value)
        return [value]

    def _build_query(self, query: str) -> str:
        """Append the configured year/citation/arXiv filters to ``query``."""
        filters = []

        if self.from_publication_date:
            # Convert YYYY-MM-DD to ADS format (only the year is used)
            try:
                year = self.from_publication_date.split("-")[0]
                if year.isdigit():  # Only add if it's a valid year
                    filters.append(f"year:{year}-9999")
            except Exception:
                logger.opt(exception=True).debug(
                    "best-effort date parsing, invalid formats skipped"
                )

        if self.min_citations > 0:
            filters.append(f"citation_count:[{self.min_citations} TO *]")

        if not self.include_arxiv:
            filters.append('-bibstem:"arXiv"')

        if filters:
            return f"{query} {' '.join(filters)}"
        return query

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information for NASA ADS search results.

        Args:
            query: The search query (natural language supported)

        Returns:
            List of preview dictionaries; empty on any non-200 response
            other than 429, or on an unexpected error

        Raises:
            RateLimitError: When the API answers with HTTP 429, so the base
                class retry handling can take over
        """
        logger.info(f"Searching NASA ADS for: {query}")

        # Build request parameters
        params = {
            "q": self._build_query(query),
            "fl": self._FIELDS,
            "rows": min(self.max_results, self._MAX_ROWS),
            "start": 0,
            "sort": self._SORT_MAP.get(self.sort_by, "score desc"),
        }

        try:
            # Apply rate limiting (simple like PubMed)
            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )
            logger.debug(
                f"Applied rate limit wait: {self._last_wait_time:.2f}s"
            )

            # Make the API request
            logger.info(
                f"Making NASA ADS API request with query: {str(params['q'])[:100]}..."
            )
            response = safe_get(
                f"{self.api_base}/search/query",
                params=params,
                headers=self.headers,
                timeout=30,
            )

            # Log rate limit headers if available
            if "X-RateLimit-Remaining" in response.headers:
                remaining = response.headers.get("X-RateLimit-Remaining")
                limit = response.headers.get("X-RateLimit-Limit", "unknown")
                logger.debug(
                    f"NASA ADS rate limit: {remaining}/{limit} requests remaining"
                )

            if response.status_code == 200:
                data = response.json()
                docs = data.get("response", {}).get("docs", [])
                num_found = data.get("response", {}).get("numFound", 0)

                logger.info(
                    f"NASA ADS returned {len(docs)} results (total available: {num_found:,})"
                )

                # Format results as previews, skipping documents that
                # could not be formatted (those come back as None)
                previews = []
                for doc in docs:
                    preview = self._format_doc_preview(doc)
                    if preview:
                        previews.append(preview)

                logger.info(f"Successfully formatted {len(previews)} previews")
                return previews

            if response.status_code == 429:
                # Rate limited
                logger.warning("NASA ADS rate limit reached")
                raise RateLimitError("NASA ADS rate limit exceeded")  # noqa: TRY301 — re-raised by except RateLimitError for base class retry

            if response.status_code == 401:
                logger.error("NASA ADS API key is invalid or missing")
                return []

            logger.error(
                f"NASA ADS API error: {response.status_code} - {response.text[:200]}"
            )
            return []

        except RateLimitError:
            # Re-raise rate limit errors for base class retry handling
            raise
        except Exception:
            logger.exception("Error searching NASA ADS")
            return []

    def _format_doc_preview(
        self, doc: Dict[str, Any]
    ) -> Optional[Dict[str, Any]]:
        """
        Format a NASA ADS document as a preview dictionary.

        Multi-valued fields (title, author, bibstem, doi, keyword) are
        accepted either as lists or as single scalar values.

        Args:
            doc: NASA ADS document object

        Returns:
            Formatted preview dictionary or None if formatting fails
        """
        try:
            # Helpers are reached through the class, not ``self``, because
            # this method does not otherwise depend on instance state.
            as_list = NasaAdsSearchEngine._as_list

            # Extract basic information
            bibcode = doc.get("bibcode", "")
            titles = as_list(doc.get("title"))
            title = titles[0] if titles else "No title"

            # Get abstract or create snippet
            abstract = doc.get("abstract") or ""
            snippet = (
                abstract[:SNIPPET_LENGTH_LONG]
                if abstract
                else f"Academic paper: {title}"
            )

            # Get publication info
            year = doc.get("year", "unknown")
            pubdate = doc.get("pubdate", "unknown")

            # Get journal/source: prefer the full publication name, fall
            # back to the first bibstem entry.
            journal = None
            if doc.get("pub"):
                journal = str(doc.get("pub"))
            else:
                bibstems = as_list(doc.get("bibstem"))
                if bibstems:
                    journal = bibstems[0]
            # The "unknown" literal must never be emitted as a journal name:
            # it leaked through the normalizer's container_title fallback
            # and even matched a real OpenAlex source named "unknown"
            # (Q1, h_index=5) in the reference DB.
            if journal == "unknown":
                journal = None

            # Get authors (display string shows at most five names)
            authors = as_list(doc.get("author"))
            authors_str = ", ".join(authors[:5])
            if len(authors) > 5:
                authors_str += " et al."

            # NASA ADS returns each name as "Last, First" — emit a
            # structured CSL list so the citation normalizer doesn't have
            # to re-split the comma-joined display string above and
            # mangle the family/given pairing in the process.
            authors_csl: list[dict] = []
            for raw in authors[:5]:
                name = (raw or "").strip()
                if not name:
                    continue
                if "," in name:
                    family, _, given = name.partition(",")
                    authors_csl.append(
                        {"family": family.strip(), "given": given.strip()}
                    )
                else:
                    authors_csl.append({"literal": name})

            # Get metrics (an explicit null counts as zero, like a missing key)
            citation_count = doc.get("citation_count") or 0

            # Extract the DOI once; it feeds both the link and the
            # enrichment layer's "doi" field.
            dois = as_list(doc.get("doi"))
            doi_value = dois[0] if dois else None

            # Get URL - prefer DOI, fallback to ADS URL
            if doi_value:
                url = f"https://doi.org/{doi_value}"
            else:
                url = f"https://ui.adsabs.harvard.edu/abs/{bibcode}"

            # Check if it's ArXiv
            is_arxiv = "arXiv" in str(doc.get("bibstem", []))

            # Get keywords
            keywords = as_list(doc.get("keyword"))

            return {
                "id": bibcode,
                "title": title,
                "link": url,
                "snippet": snippet,
                "authors": authors_str,
                "authors_csl": authors_csl or None,
                "year": year,
                "date": pubdate,
                # None (not a sentinel string) when no pub/bibstem is
                # available.
                "journal": journal,
                # ArXiv preprints have pub="arXiv e-prints" — set journal_ref
                # to None so the filter's preprint-handling path activates
                # instead of trying to score "arXiv e-prints" as a journal.
                "journal_ref": None if is_arxiv else journal,
                "doi": doi_value,
                "citations": citation_count,
                "abstract": abstract,
                "is_arxiv": is_arxiv,
                "keywords": keywords[:5],
                "type": "academic_paper",
            }

        except Exception:
            logger.exception(
                f"Error formatting NASA ADS document: {doc.get('bibcode', 'unknown')}"
            )
            return None

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for relevant items (NASA ADS provides most content in preview).

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content; ``content`` is
            the abstract, or the snippet when the abstract is missing or
            empty
        """
        # NASA ADS returns comprehensive data in the initial search,
        # so we don't need a separate full content fetch
        return [
            {
                "title": item.get("title", ""),
                "link": item.get("link", ""),
                "snippet": item.get("snippet", ""),
                # Previews always carry an "abstract" key, possibly empty,
                # so fall back on falsiness rather than on key absence.
                "content": item.get("abstract") or item.get("snippet", ""),
                # Forward journal quality fields for content filters
                "journal_ref": item.get("journal_ref"),
                "doi": item.get("doi"),
                "metadata": {
                    "authors": item.get("authors", ""),
                    "year": item.get("year", ""),
                    "journal": item.get("journal", ""),
                    "citations": item.get("citations", 0),
                    "is_arxiv": item.get("is_arxiv", False),
                    "keywords": item.get("keywords", []),
                    "doi": item.get("doi"),
                },
            }
            for item in relevant_items
        ]

411 results.append(result) 

412 

413 return results