Coverage for src/local_deep_research/web_search_engines/engines/search_engine_nasa_ads.py: 89%
132 statements
« prev ^ index » next — coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
"""NASA Astrophysics Data System (ADS) search engine implementation."""

from typing import Any, Dict, List, Optional

from langchain_core.language_models import BaseLLM
from loguru import logger

from ...constants import SNIPPET_LENGTH_LONG
from ...advanced_search_system.filters.journal_reputation_filter import (
    JournalReputationFilter,
)
from ...security.safe_requests import safe_get
from ..rate_limiting import RateLimitError
from ..search_engine_base import BaseSearchEngine
class NasaAdsSearchEngine(BaseSearchEngine):
    """Search engine backed by the NASA Astrophysics Data System (ADS).

    Covers physics, astronomy, and astrophysics literature. An API key is
    effectively required: without one every request comes back HTTP 401.
    """

    # Mark as public search engine
    is_public = True
    # Scientific/astronomy/astrophysics search engine
    is_scientific = True
    is_lexical = True
    needs_llm_relevance_filter = True

    def __init__(
        self,
        max_results: int = 25,
        api_key: Optional[str] = None,
        sort_by: str = "relevance",
        min_citations: int = 0,
        from_publication_date: Optional[str] = None,
        include_arxiv: bool = True,
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """Configure the NASA ADS search engine.

        Args:
            max_results: Maximum number of search results.
            api_key: NASA ADS API key (required for higher rate limits).
            sort_by: Sort order ('relevance', 'citation_count', 'date').
            min_citations: Minimum citation count filter.
            from_publication_date: Filter papers from this date (YYYY-MM-DD).
            include_arxiv: Include ArXiv preprints in results.
            llm: Language model for relevance filtering.
            max_filtered_results: Maximum number of results kept after filtering.
            settings_snapshot: Settings snapshot for configuration.
            **kwargs: Additional parameters forwarded to the base class.
        """
        # The journal reputation filter is optional: the factory returns
        # None when it cannot be configured for this engine.
        journal_filter = JournalReputationFilter.create_default(
            model=llm,  # type: ignore[arg-type]
            engine_name="nasa_ads",
            settings_snapshot=settings_snapshot,
        )
        content_filters = [] if journal_filter is None else [journal_filter]

        # Initialize the BaseSearchEngine
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            content_filters=content_filters,  # type: ignore[arg-type]
            settings_snapshot=settings_snapshot,
            **kwargs,
        )

        self.sort_by = sort_by
        self.min_citations = min_citations
        self.include_arxiv = include_arxiv

        # Settings may deliver the sentinel strings "False"/"false"/"" for
        # an unset publication date; normalize those to None.
        if from_publication_date and from_publication_date not in (
            "False",
            "false",
            "",
        ):
            self.from_publication_date = from_publication_date
        else:
            self.from_publication_date = None

        # Fall back to the settings snapshot when no key was passed in.
        if not api_key and settings_snapshot:
            from ...config.search_config import get_setting_from_snapshot

            try:
                api_key = get_setting_from_snapshot(
                    "search.engine.web.nasa_ads.api_key",
                    settings_snapshot=settings_snapshot,
                )
            except Exception:
                logger.debug(
                    "Failed to read nasa_ads.api_key from settings snapshot",
                    exc_info=True,
                )

        # Apply the same sentinel-string normalization to the API key.
        if api_key and api_key not in ("False", "false", ""):
            self.api_key = api_key
        else:
            self.api_key = None

        # API configuration
        self.api_base = "https://api.adsabs.harvard.edu/v1"
        self.headers = {
            "User-Agent": "Local-Deep-Research-Agent",
            "Accept": "application/json",
        }

        if self.api_key:
            self.headers["Authorization"] = f"Bearer {self.api_key}"
            logger.info("Using NASA ADS with API key")
        else:
            logger.error(
                "NASA ADS requires an API key to function. Get a free key at: https://ui.adsabs.harvard.edu/user/settings/token"
            )

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """Run an ADS search and return lightweight preview dictionaries.

        Args:
            query: The search query (natural language supported).

        Returns:
            List of preview dictionaries; empty on API errors other than 429.

        Raises:
            RateLimitError: When ADS answers with HTTP 429, so the base
                class retry handling can take over.
        """
        logger.info(f"Searching NASA ADS for: {query}")

        # ADS copes well with natural language, so the query is used as-is
        # and only narrowed with optional filter clauses.
        clauses = []

        if self.from_publication_date:
            # Only the year part of YYYY-MM-DD is usable in an ADS range.
            try:
                year = self.from_publication_date.split("-")[0]
                if year.isdigit():  # only add if it's a valid year
                    clauses.append(f"year:{year}-9999")
            except Exception:
                logger.debug(
                    "best-effort date parsing, invalid formats skipped",
                    exc_info=True,
                )

        if self.min_citations > 0:
            clauses.append(f"citation_count:[{self.min_citations} TO *]")

        if not self.include_arxiv:
            clauses.append('-bibstem:"arXiv"')

        # Combine the query with any filter clauses.
        full_query = f"{query} {' '.join(clauses)}" if clauses else query

        params = {
            "q": full_query,
            "fl": "id,bibcode,title,author,year,pubdate,abstract,citation_count,bibstem,doi,identifier,pub,keyword,aff",
            # NASA ADS allows up to 200 rows per request
            "rows": min(self.max_results, 200),
            "start": 0,
        }
        # Map the configured sort mode onto an ADS sort expression.
        params["sort"] = {
            "relevance": "score desc",
            "citation_count": "citation_count desc",
            "date": "date desc",
        }.get(self.sort_by, "score desc")

        try:
            # Simple rate limiting, mirroring the PubMed engine.
            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )
            logger.debug(
                f"Applied rate limit wait: {self._last_wait_time:.2f}s"
            )

            logger.info(
                f"Making NASA ADS API request with query: {str(params['q'])[:100]}..."
            )
            response = safe_get(
                f"{self.api_base}/search/query",
                params=params,
                headers=self.headers,
                timeout=30,
            )

            # Surface the remaining request quota when ADS reports it.
            if "X-RateLimit-Remaining" in response.headers:
                remaining = response.headers.get("X-RateLimit-Remaining")
                limit = response.headers.get("X-RateLimit-Limit", "unknown")
                logger.debug(
                    f"NASA ADS rate limit: {remaining}/{limit} requests remaining"
                )

            if response.status_code == 200:
                body = response.json().get("response", {})
                docs = body.get("docs", [])
                num_found = body.get("numFound", 0)

                logger.info(
                    f"NASA ADS returned {len(docs)} results (total available: {num_found:,})"
                )

                # Keep only documents that formatted successfully.
                previews = [
                    preview
                    for preview in map(self._format_doc_preview, docs)
                    if preview
                ]

                logger.info(f"Successfully formatted {len(previews)} previews")
                return previews

            if response.status_code == 429:
                logger.warning("NASA ADS rate limit reached")
                raise RateLimitError("NASA ADS rate limit exceeded")  # noqa: TRY301 — re-raised by except RateLimitError for base class retry

            if response.status_code == 401:
                logger.error("NASA ADS API key is invalid or missing")
                return []

            logger.error(
                f"NASA ADS API error: {response.status_code} - {response.text[:200]}"
            )
            return []

        except RateLimitError:
            # Propagate so the base class retry handling can kick in.
            raise
        except Exception:
            logger.exception("Error searching NASA ADS")
            return []

    def _format_doc_preview(
        self, doc: Dict[str, Any]
    ) -> Optional[Dict[str, Any]]:
        """Convert one ADS document into the engine's preview format.

        Args:
            doc: Raw NASA ADS document object.

        Returns:
            Preview dictionary, or None when formatting fails.
        """
        try:
            bibcode = doc.get("bibcode", "")

            # ADS wraps the title in a single-element list.
            titles = doc.get("title", [])
            title = titles[0] if titles else "No title"

            # Use the abstract as the snippet when available.
            abstract = doc.get("abstract", "")
            if abstract:
                snippet = abstract[:SNIPPET_LENGTH_LONG]
            else:
                snippet = f"Academic paper: {title}"

            year = doc.get("year", "unknown")
            pubdate = doc.get("pubdate", "unknown")

            # Prefer the full publication name; fall back to the bibstem.
            journal = "unknown"
            pub = doc.get("pub")
            if pub:
                journal = str(pub)
            else:
                bibstem = doc.get("bibstem", [])
                if bibstem:
                    journal = (
                        bibstem[0] if isinstance(bibstem, list) else bibstem
                    )

            # Show at most five authors, then abbreviate.
            author_list = doc.get("author", [])
            authors_str = ", ".join(author_list[:5])
            if len(author_list) > 5:
                authors_str += " et al."

            citation_count = doc.get("citation_count", 0)

            # Prefer a DOI link; otherwise point at the ADS abstract page.
            url = None
            dois = doc.get("doi")
            if dois:
                doi = dois[0] if isinstance(dois, list) else dois
                url = f"https://doi.org/{doi}"
            if not url:
                url = f"https://ui.adsabs.harvard.edu/abs/{bibcode}"

            # Flag ArXiv preprints via the bibstem.
            is_arxiv = "arXiv" in str(doc.get("bibstem", []))

            keywords = doc.get("keyword", [])

            return {
                "id": bibcode,
                "title": title,
                "link": url,
                "snippet": snippet,
                "authors": authors_str,
                "year": year,
                "date": pubdate,
                "journal": journal,
                "citations": citation_count,
                "abstract": abstract,
                "is_arxiv": is_arxiv,
                "keywords": keywords[:5] if keywords else [],
                "type": "academic_paper",
            }

        except Exception:
            logger.exception(
                f"Error formatting NASA ADS document: {doc.get('bibcode', 'unknown')}"
            )
            return None

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """Expand previews into full result dictionaries.

        NASA ADS already delivers abstracts with the search response, so
        no second fetch is required.

        Args:
            relevant_items: Relevant preview dictionaries.

        Returns:
            List of result dictionaries with full content.
        """
        return [
            {
                "title": item.get("title", ""),
                "link": item.get("link", ""),
                "snippet": item.get("snippet", ""),
                "content": item.get("abstract", item.get("snippet", "")),
                "metadata": {
                    "authors": item.get("authors", ""),
                    "year": item.get("year", ""),
                    "journal": item.get("journal", ""),
                    "citations": item.get("citations", 0),
                    "is_arxiv": item.get("is_arxiv", False),
                    "keywords": item.get("keywords", []),
                },
            }
            for item in relevant_items
        ]