Coverage for src/local_deep_research/web_search_engines/engines/search_engine_nasa_ads.py: 89%
146 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
1"""NASA Astrophysics Data System (ADS) search engine implementation."""
3from typing import Any, Dict, List, Optional
5from langchain_core.language_models import BaseLLM
6from loguru import logger
8from ...constants import SNIPPET_LENGTH_LONG, USER_AGENT
9from ...advanced_search_system.filters.journal_reputation_filter import (
10 JournalReputationFilter,
11)
12from ...security.safe_requests import safe_get
13from ..rate_limiting import RateLimitError
14from ..search_engine_base import BaseSearchEngine
class NasaAdsSearchEngine(BaseSearchEngine):
    """NASA ADS search engine for physics, astronomy, and astrophysics papers.

    Sends a single request to the ADS ``/search/query`` endpoint, turns every
    returned document into a preview dictionary, and later re-packages the
    previews (which already carry the abstract) as full-content results.
    """

    # Mark as public search engine
    is_public = True
    # Scientific/astronomy/astrophysics search engine
    is_scientific = True
    is_lexical = True
    needs_llm_relevance_filter = True

    # Placeholder strings treated as "not configured" for optional settings.
    _UNSET_STRINGS = ("False", "false", "")
    # Fields requested from the API for every document (the ``fl`` parameter).
    _SEARCH_FIELDS = (
        "id,bibcode,title,author,year,pubdate,abstract,citation_count,"
        "bibstem,doi,identifier,pub,keyword,aff"
    )
    # Upper bound on the ``rows`` parameter sent with a request.
    _MAX_ROWS = 200
    # Maps the public ``sort_by`` option onto the API's ``sort`` expression.
    _SORT_MAP = {
        "relevance": "score desc",
        "citation_count": "citation_count desc",
        "date": "date desc",
    }
    # How many authors / keywords are kept in a preview.
    _MAX_AUTHORS = 5
    _MAX_KEYWORDS = 5

    def __init__(
        self,
        max_results: int = 25,
        api_key: Optional[str] = None,
        sort_by: str = "relevance",
        min_citations: int = 0,
        from_publication_date: Optional[str] = None,
        include_arxiv: bool = True,
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """
        Initialize the NASA ADS search engine.

        Args:
            max_results: Maximum number of search results
            api_key: NASA ADS API key; when omitted it is looked up in
                ``settings_snapshot``. Without a key an error is logged.
            sort_by: Sort order ('relevance', 'citation_count', 'date');
                unrecognised values fall back to relevance
            min_citations: Minimum citation count filter
            from_publication_date: Filter papers from this date (YYYY-MM-DD);
                only the year part is used
            include_arxiv: Include ArXiv preprints in results
            llm: Language model for relevance filtering
            max_filtered_results: Maximum number of results to keep after filtering
            settings_snapshot: Settings snapshot for configuration
            **kwargs: Additional parameters to pass to parent class
        """
        # Journal filter runs before LLM relevance (Tiers 1-3 are instant)
        preview_filters = []
        journal_filter = JournalReputationFilter.create_default(
            model=llm,  # type: ignore[arg-type]
            engine_name="nasa_ads",
            settings_snapshot=settings_snapshot,
        )
        if journal_filter is not None:
            preview_filters.append(journal_filter)

        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            preview_filters=preview_filters,  # type: ignore[arg-type]
            settings_snapshot=settings_snapshot,
            **kwargs,
        )

        self.sort_by = sort_by
        self.min_citations = min_citations
        self.include_arxiv = include_arxiv
        # Placeholder strings ("False", "false", "") mean "no date filter".
        self.from_publication_date = (
            from_publication_date
            if from_publication_date
            and from_publication_date not in self._UNSET_STRINGS
            else None
        )

        # Get API key from settings if not provided
        if not api_key and settings_snapshot:
            from ...config.search_config import get_setting_from_snapshot

            try:
                api_key = get_setting_from_snapshot(
                    "search.engine.web.nasa_ads.api_key",
                    settings_snapshot=settings_snapshot,
                )
            except Exception:
                # Best effort: a missing setting is reported below as
                # "requires an API key" rather than failing construction.
                logger.debug(
                    "Failed to read nasa_ads.api_key from settings snapshot",
                    exc_info=True,
                )

        # Handle "False" string for api_key
        self.api_key = (
            api_key if api_key and api_key not in self._UNSET_STRINGS else None
        )

        # API configuration
        self.api_base = "https://api.adsabs.harvard.edu/v1"
        self.headers = {
            "User-Agent": USER_AGENT,
            "Accept": "application/json",
        }

        if self.api_key:
            self.headers["Authorization"] = f"Bearer {self.api_key}"
            logger.info("Using NASA ADS with API key")
        else:
            logger.error(
                "NASA ADS requires an API key to function. Get a free key at: https://ui.adsabs.harvard.edu/user/settings/token"
            )

    def _build_search_params(self, query: str) -> Dict[str, Any]:
        """Assemble the request parameters for one search.

        The configured year, citation and arXiv restrictions are appended to
        the user's query text as extra query terms.
        """
        # The query text is passed through unchanged.
        filters = []
        if self.from_publication_date:
            # Convert YYYY-MM-DD to an open-ended year range.
            try:
                year = self.from_publication_date.split("-")[0]
                if year.isdigit():  # Only add if it's a valid year
                    filters.append(f"year:{year}-9999")
            except Exception:
                logger.debug(
                    "best-effort date parsing, invalid formats skipped",
                    exc_info=True,
                )

        if self.min_citations > 0:
            filters.append(f"citation_count:[{self.min_citations} TO *]")

        if not self.include_arxiv:
            filters.append('-bibstem:"arXiv"')

        full_query = f"{query} {' '.join(filters)}" if filters else query

        return {
            "q": full_query,
            "fl": self._SEARCH_FIELDS,
            "rows": min(self.max_results, self._MAX_ROWS),
            "start": 0,
            "sort": self._SORT_MAP.get(self.sort_by, "score desc"),
        }

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information for NASA ADS search results.

        Args:
            query: The search query, sent as-is apart from appended filters

        Returns:
            List of preview dictionaries; empty on any non-200 response or
            unexpected error

        Raises:
            RateLimitError: When the API answers with HTTP 429, so the base
                class can apply its retry handling
        """
        logger.info(f"Searching NASA ADS for: {query}")
        params = self._build_search_params(query)

        try:
            # Apply rate limiting (simple like PubMed)
            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )
            logger.debug(
                f"Applied rate limit wait: {self._last_wait_time:.2f}s"
            )

            # Make the API request
            logger.info(
                f"Making NASA ADS API request with query: {str(params['q'])[:100]}..."
            )
            response = safe_get(
                f"{self.api_base}/search/query",
                params=params,
                headers=self.headers,
                timeout=30,
            )

            # Log rate limit headers if available
            if "X-RateLimit-Remaining" in response.headers:
                remaining = response.headers.get("X-RateLimit-Remaining")
                limit = response.headers.get("X-RateLimit-Limit", "unknown")
                logger.debug(
                    f"NASA ADS rate limit: {remaining}/{limit} requests remaining"
                )

            if response.status_code == 200:
                payload = response.json().get("response", {})
                docs = payload.get("docs", [])
                num_found = payload.get("numFound", 0)

                logger.info(
                    f"NASA ADS returned {len(docs)} results (total available: {num_found:,})"
                )

                # Documents that fail to format come back as None and are
                # dropped here.
                formatted = (self._format_doc_preview(doc) for doc in docs)
                previews = [preview for preview in formatted if preview]

                logger.info(f"Successfully formatted {len(previews)} previews")
                return previews

            if response.status_code == 429:
                # Rate limited
                logger.warning("NASA ADS rate limit reached")
                raise RateLimitError("NASA ADS rate limit exceeded")  # noqa: TRY301 — re-raised by except RateLimitError for base class retry

            if response.status_code == 401:
                logger.error("NASA ADS API key is invalid or missing")
                return []

            logger.error(
                f"NASA ADS API error: {response.status_code} - {response.text[:200]}"
            )
            return []

        except RateLimitError:
            # Re-raise rate limit errors for base class retry handling
            raise
        except Exception:
            logger.exception("Error searching NASA ADS")
            return []

    def _format_doc_preview(
        self, doc: Dict[str, Any]
    ) -> Optional[Dict[str, Any]]:
        """
        Format a NASA ADS document as a preview dictionary.

        Multi-valued fields (title, author, bibstem, doi, keyword) are
        accepted either as a list or as a bare scalar.

        Args:
            doc: NASA ADS document object

        Returns:
            Formatted preview dictionary or None if formatting fails
        """
        try:
            # Extract basic information
            bibcode = doc.get("bibcode", "")
            titles = _as_list(doc.get("title"))
            title = titles[0] if titles and titles[0] else "No title"

            # Get abstract or create snippet; an explicit null abstract is
            # treated like a missing one.
            abstract = doc.get("abstract") or ""
            snippet = (
                abstract[:SNIPPET_LENGTH_LONG]
                if abstract
                else f"Academic paper: {title}"
            )

            # Get publication info
            year = doc.get("year", "unknown")
            pubdate = doc.get("pubdate", "unknown")

            # Get journal/source: full publication name first, bibstem second.
            journal = None
            bibstems = _as_list(doc.get("bibstem"))
            if doc.get("pub"):
                journal = str(doc.get("pub"))
            elif bibstems:
                journal = bibstems[0]
            # Never emit the "unknown" literal: it leaked through the
            # normalizer's container_title fallback and even matched a real
            # OpenAlex source named "unknown" (Q1, h_index=5) in the
            # reference DB.
            if journal == "unknown":
                journal = None

            # Get authors. Null/blank/non-string entries are discarded up
            # front so one bad entry cannot make the join below raise and
            # drop the whole document.
            authors = [
                name.strip()
                for name in _as_list(doc.get("author"))
                if isinstance(name, str) and name.strip()
            ]
            shown_authors = authors[: self._MAX_AUTHORS]
            authors_str = ", ".join(shown_authors)
            if len(authors) > self._MAX_AUTHORS:
                authors_str += " et al."

            # NASA ADS returns each name as "Last, First" — emit a
            # structured CSL list so the citation normalizer doesn't have
            # to re-split the comma-joined display string above and
            # mangle the family/given pairing in the process.
            authors_csl: List[Dict[str, str]] = []
            for name in shown_authors:
                if "," in name:
                    family, _, given = name.partition(",")
                    authors_csl.append(
                        {"family": family.strip(), "given": given.strip()}
                    )
                else:
                    authors_csl.append({"literal": name})

            # Get metrics
            citation_count = doc.get("citation_count") or 0

            # First non-empty DOI, used both for the link and for the
            # enrichment layer.
            doi_value = next(
                (doi for doi in _as_list(doc.get("doi")) if doi), None
            )

            # Get URL - prefer DOI, fallback to ADS URL
            if doi_value:
                url = f"https://doi.org/{doi_value}"
            else:
                url = f"https://ui.adsabs.harvard.edu/abs/{bibcode}"

            # Check if it's ArXiv
            is_arxiv = "arXiv" in str(doc.get("bibstem", []))

            # Get keywords
            keywords = _as_list(doc.get("keyword"))

            return {
                "id": bibcode,
                "title": title,
                "link": url,
                "snippet": snippet,
                "authors": authors_str,
                "authors_csl": authors_csl or None,
                "year": year,
                "date": pubdate,
                "journal": journal,
                # ArXiv preprints have pub="arXiv e-prints" — set journal_ref
                # to None so the filter's preprint-handling path activates
                # instead of trying to score "arXiv e-prints" as a journal.
                "journal_ref": None if is_arxiv else journal,
                "doi": doi_value,
                "citations": citation_count,
                "abstract": abstract,
                "is_arxiv": is_arxiv,
                "keywords": keywords[: self._MAX_KEYWORDS],
                "type": "academic_paper",
            }

        except Exception:
            # Guard the lookup so a non-dict document cannot raise again
            # from inside the handler.
            bibcode = (
                doc.get("bibcode", "unknown")
                if isinstance(doc, dict)
                else "unknown"
            )
            logger.exception(f"Error formatting NASA ADS document: {bibcode}")
            return None

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for relevant items (NASA ADS provides most content in preview).

        No additional request is made: each preview is re-packaged, with its
        abstract used as the content.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content
        """
        return [
            {
                "title": item.get("title", ""),
                "link": item.get("link", ""),
                "snippet": item.get("snippet", ""),
                # Previews always carry an "abstract" key, possibly empty, so
                # fall back on falsiness rather than on key absence.
                "content": item.get("abstract") or item.get("snippet", ""),
                # Forward journal quality fields for content filters
                "journal_ref": item.get("journal_ref"),
                "doi": item.get("doi"),
                "metadata": {
                    "authors": item.get("authors", ""),
                    "year": item.get("year", ""),
                    "journal": item.get("journal", ""),
                    "citations": item.get("citations", 0),
                    "is_arxiv": item.get("is_arxiv", False),
                    "keywords": item.get("keywords", []),
                    "doi": item.get("doi"),
                },
            }
            for item in relevant_items
        ]


def _as_list(value: Any) -> List[Any]:
    """Return ``value`` as a list: ``[]`` for None/empty string, ``[value]`` for a scalar."""
    if value is None or value == "":
        return []
    if isinstance(value, (list, tuple)):
        return list(value)
    return [value]