Coverage for src / local_deep_research / web_search_engines / engines / search_engine_nasa_ads.py: 88%
131 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-25 01:07 +0000
1"""NASA Astrophysics Data System (ADS) search engine implementation."""
3from typing import Any, Dict, List, Optional
5from langchain_core.language_models import BaseLLM
6from loguru import logger
8from ...constants import SNIPPET_LENGTH_LONG
9from ...advanced_search_system.filters.journal_reputation_filter import (
10 JournalReputationFilter,
11)
12from ...security.safe_requests import safe_get
13from ..rate_limiting import RateLimitError
14from ..search_engine_base import BaseSearchEngine
class NasaAdsSearchEngine(BaseSearchEngine):
    """NASA ADS search engine for physics, astronomy, and astrophysics papers."""

    # Mark as a public search engine.
    is_public = True
    # Mark as a scientific (astronomy/astrophysics) search engine.
    is_scientific = True
    def __init__(
        self,
        max_results: int = 25,
        api_key: Optional[str] = None,
        sort_by: str = "relevance",
        min_citations: int = 0,
        from_publication_date: Optional[str] = None,
        include_arxiv: bool = True,
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """
        Initialize the NASA ADS search engine.

        Args:
            max_results: Maximum number of search results
            api_key: NASA ADS API key; without one the engine logs an error
                and searches will fail with HTTP 401
            sort_by: Sort order ('relevance', 'citation_count', 'date');
                unknown values fall back to relevance at query time
            min_citations: Minimum citation count filter (0 disables it)
            from_publication_date: Filter papers from this date (YYYY-MM-DD);
                the strings "False"/"false"/"" are treated as unset
            include_arxiv: Include ArXiv preprints in results
            llm: Language model for relevance filtering
            max_filtered_results: Maximum number of results to keep after filtering
            settings_snapshot: Settings snapshot for configuration
            **kwargs: Additional parameters to pass to parent class
        """
        # Build the content-filter list before calling the parent
        # constructor, which consumes it.
        content_filters = []
        journal_filter = JournalReputationFilter.create_default(
            model=llm,
            engine_name="nasa_ads",
            settings_snapshot=settings_snapshot,
        )
        if journal_filter is not None:
            content_filters.append(journal_filter)

        # Initialize the BaseSearchEngine
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            content_filters=content_filters,
            settings_snapshot=settings_snapshot,
            **kwargs,
        )

        self.sort_by = sort_by
        self.min_citations = min_citations
        self.include_arxiv = include_arxiv
        # Settings UIs may persist an unset date as the string "False"/"false"
        # or an empty string; normalize those sentinels to None.
        self.from_publication_date = (
            from_publication_date
            if from_publication_date
            and from_publication_date not in ["False", "false", ""]
            else None
        )

        # Get API key from settings if not provided
        if not api_key and settings_snapshot:
            from ...config.search_config import get_setting_from_snapshot

            try:
                api_key = get_setting_from_snapshot(
                    "search.engine.web.nasa_ads.api_key",
                    settings_snapshot=settings_snapshot,
                )
            except Exception:
                # Best effort: a missing setting simply leaves api_key unset.
                pass

        # Handle "False" string for api_key (same normalization as the date).
        self.api_key = (
            api_key
            if api_key and api_key not in ["False", "false", ""]
            else None
        )

        # API configuration
        self.api_base = "https://api.adsabs.harvard.edu/v1"
        self.headers = {
            "User-Agent": "Local-Deep-Research-Agent",
            "Accept": "application/json",
        }

        if self.api_key:
            self.headers["Authorization"] = f"Bearer {self.api_key}"
            logger.info("Using NASA ADS with API key")
        else:
            # The engine is still constructed without a key, but requests
            # will be rejected by the API (see 401 handling in _get_previews).
            logger.error(
                "NASA ADS requires an API key to function. Get a free key at: https://ui.adsabs.harvard.edu/user/settings/token"
            )
    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information for NASA ADS search results.

        Args:
            query: The search query (natural language supported)

        Returns:
            List of preview dictionaries; an empty list on any API error
            other than rate limiting.

        Raises:
            RateLimitError: On an HTTP 429 response, so the base class
                retry machinery can back off and retry.
        """
        logger.info(f"Searching NASA ADS for: {query}")

        # Build the search query - NASA ADS has good natural language support
        # We can use the query directly or enhance it slightly
        search_query = query

        # Build filters using ADS query syntax; they are appended to the
        # free-text query below.
        filters = []
        if self.from_publication_date:
            # Convert YYYY-MM-DD to ADS format (only the year is used,
            # as an open-ended year range).
            try:
                year = self.from_publication_date.split("-")[0]
                if year.isdigit():  # Only add if it's a valid year
                    filters.append(f"year:{year}-9999")
            except Exception:
                pass  # Skip invalid date formats

        if self.min_citations > 0:
            filters.append(f"citation_count:[{self.min_citations} TO *]")

        if not self.include_arxiv:
            # Negated bibstem clause excludes ArXiv preprints.
            filters.append('-bibstem:"arXiv"')

        # Combine query with filters
        if filters:
            full_query = f"{search_query} {' '.join(filters)}"
        else:
            full_query = search_query

        # Build request parameters; "fl" selects the fields consumed by
        # _format_doc_preview.
        params = {
            "q": full_query,
            "fl": "id,bibcode,title,author,year,pubdate,abstract,citation_count,bibstem,doi,identifier,pub,keyword,aff",
            "rows": min(
                self.max_results, 200
            ),  # NASA ADS allows up to 200 per request
            "start": 0,
        }

        # Add sorting; unknown sort_by values fall back to relevance.
        sort_map = {
            "relevance": "score desc",
            "citation_count": "citation_count desc",
            "date": "date desc",
        }
        params["sort"] = sort_map.get(self.sort_by, "score desc")

        try:
            # Apply rate limiting (simple like PubMed)
            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )
            logger.debug(
                f"Applied rate limit wait: {self._last_wait_time:.2f}s"
            )

            # Make the API request
            logger.info(
                f"Making NASA ADS API request with query: {params['q'][:100]}..."
            )
            response = safe_get(
                f"{self.api_base}/search/query",
                params=params,
                headers=self.headers,
                timeout=30,
            )

            # Log rate limit headers if available
            if "X-RateLimit-Remaining" in response.headers:
                remaining = response.headers.get("X-RateLimit-Remaining")
                limit = response.headers.get("X-RateLimit-Limit", "unknown")
                logger.debug(
                    f"NASA ADS rate limit: {remaining}/{limit} requests remaining"
                )

            if response.status_code == 200:
                data = response.json()
                docs = data.get("response", {}).get("docs", [])
                num_found = data.get("response", {}).get("numFound", 0)

                logger.info(
                    f"NASA ADS returned {len(docs)} results (total available: {num_found:,})"
                )

                # Format results as previews; documents that fail to format
                # (preview is None) are dropped.
                previews = []
                for doc in docs:
                    preview = self._format_doc_preview(doc)
                    if preview:
                        previews.append(preview)

                logger.info(f"Successfully formatted {len(previews)} previews")
                return previews

            elif response.status_code == 429:
                # Rate limited
                logger.warning("NASA ADS rate limit reached")
                raise RateLimitError("NASA ADS rate limit exceeded")

            elif response.status_code == 401:
                logger.error("NASA ADS API key is invalid or missing")
                return []

            else:
                logger.error(
                    f"NASA ADS API error: {response.status_code} - {response.text[:200]}"
                )
                return []

        except RateLimitError:
            # Re-raise rate limit errors for base class retry handling
            raise
        except Exception:
            # Network/parsing failures degrade to an empty result set.
            logger.exception("Error searching NASA ADS")
            return []
244 def _format_doc_preview(
245 self, doc: Dict[str, Any]
246 ) -> Optional[Dict[str, Any]]:
247 """
248 Format a NASA ADS document as a preview dictionary.
250 Args:
251 doc: NASA ADS document object
253 Returns:
254 Formatted preview dictionary or None if formatting fails
255 """
256 try:
257 # Extract basic information
258 bibcode = doc.get("bibcode", "")
259 # Get title from list if available
260 title_list = doc.get("title", [])
261 title = title_list[0] if title_list else "No title"
263 # Get abstract or create snippet
264 abstract = doc.get("abstract", "")
265 snippet = (
266 abstract[:SNIPPET_LENGTH_LONG]
267 if abstract
268 else f"Academic paper: {title}"
269 )
271 # Get publication info
272 year = doc.get("year", "unknown")
273 pubdate = doc.get("pubdate", "unknown")
275 # Get journal/source
276 journal = "unknown"
277 if doc.get("pub"):
278 journal = doc.get("pub")
279 elif doc.get("bibstem"):
280 bibstem = doc.get("bibstem", [])
281 if bibstem: 281 ↛ 287line 281 didn't jump to line 287 because the condition on line 281 was always true
282 journal = (
283 bibstem[0] if isinstance(bibstem, list) else bibstem
284 )
286 # Get authors
287 authors = doc.get("author", [])
288 authors_str = ", ".join(authors[:5])
289 if len(authors) > 5:
290 authors_str += " et al."
292 # Get metrics
293 citation_count = doc.get("citation_count", 0)
295 # Get URL - prefer DOI, fallback to ADS URL
296 url = None
297 if doc.get("doi"):
298 dois = doc.get("doi", [])
299 if dois: 299 ↛ 303line 299 didn't jump to line 303 because the condition on line 299 was always true
300 doi = dois[0] if isinstance(dois, list) else dois
301 url = f"https://doi.org/{doi}"
303 if not url:
304 url = f"https://ui.adsabs.harvard.edu/abs/{bibcode}"
306 # Check if it's ArXiv
307 is_arxiv = "arXiv" in str(doc.get("bibstem", []))
309 # Get keywords
310 keywords = doc.get("keyword", [])
312 preview = {
313 "id": bibcode,
314 "title": title,
315 "link": url,
316 "snippet": snippet,
317 "authors": authors_str,
318 "year": year,
319 "date": pubdate,
320 "journal": journal,
321 "citations": citation_count,
322 "abstract": abstract,
323 "is_arxiv": is_arxiv,
324 "keywords": keywords[:5] if keywords else [],
325 "type": "academic_paper",
326 }
328 return preview
330 except Exception:
331 logger.exception(
332 f"Error formatting NASA ADS document: {doc.get('bibcode', 'unknown')}"
333 )
334 return None
336 def _get_full_content(
337 self, relevant_items: List[Dict[str, Any]]
338 ) -> List[Dict[str, Any]]:
339 """
340 Get full content for relevant items (NASA ADS provides most content in preview).
342 Args:
343 relevant_items: List of relevant preview dictionaries
345 Returns:
346 List of result dictionaries with full content
347 """
348 # NASA ADS returns comprehensive data in the initial search,
349 # so we don't need a separate full content fetch
350 results = []
351 for item in relevant_items:
352 result = {
353 "title": item.get("title", ""),
354 "link": item.get("link", ""),
355 "snippet": item.get("snippet", ""),
356 "content": item.get("abstract", item.get("snippet", "")),
357 "metadata": {
358 "authors": item.get("authors", ""),
359 "year": item.get("year", ""),
360 "journal": item.get("journal", ""),
361 "citations": item.get("citations", 0),
362 "is_arxiv": item.get("is_arxiv", False),
363 "keywords": item.get("keywords", []),
364 },
365 }
366 results.append(result)
368 return results