Coverage for src/local_deep_research/web_search_engines/engines/search_engine_nasa_ads.py: 84%
130 statements
« prev ^ index » next — coverage.py v7.12.0, created at 2026-01-11 00:51 +0000
1"""NASA Astrophysics Data System (ADS) search engine implementation."""
3from typing import Any, Dict, List, Optional
5from langchain_core.language_models import BaseLLM
6from loguru import logger
8from ...advanced_search_system.filters.journal_reputation_filter import (
9 JournalReputationFilter,
10)
11from ...security.safe_requests import safe_get
12from ..rate_limiting import RateLimitError
13from ..search_engine_base import BaseSearchEngine
class NasaAdsSearchEngine(BaseSearchEngine):
    """NASA ADS search engine for physics, astronomy, and astrophysics papers.

    Queries the NASA Astrophysics Data System search API at
    ``https://api.adsabs.harvard.edu/v1``. An API key is required for the
    service to respond; without one the engine logs an error at init time
    and searches return empty results (HTTP 401 path).
    """

    # Mark as public search engine
    is_public = True
    # Scientific/astronomy/astrophysics search engine
    is_scientific = True

    def __init__(
        self,
        max_results: int = 25,
        api_key: Optional[str] = None,
        sort_by: str = "relevance",
        min_citations: int = 0,
        from_publication_date: Optional[str] = None,
        include_arxiv: bool = True,
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """
        Initialize the NASA ADS search engine.

        Args:
            max_results: Maximum number of search results
            api_key: NASA ADS API key (required for higher rate limits)
            sort_by: Sort order ('relevance', 'citation_count', 'date')
            min_citations: Minimum citation count filter
            from_publication_date: Filter papers from this date (YYYY-MM-DD)
            include_arxiv: Include ArXiv preprints in results
            llm: Language model for relevance filtering
            max_filtered_results: Maximum number of results to keep after filtering
            settings_snapshot: Settings snapshot for configuration
            **kwargs: Additional parameters to pass to parent class
        """
        # Initialize journal reputation filter if needed
        content_filters = []
        journal_filter = JournalReputationFilter.create_default(
            model=llm,
            engine_name="nasa_ads",
            settings_snapshot=settings_snapshot,
        )
        if journal_filter is not None:
            content_filters.append(journal_filter)

        # Initialize the BaseSearchEngine
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            content_filters=content_filters,
            settings_snapshot=settings_snapshot,
            **kwargs,
        )

        self.sort_by = sort_by
        self.min_citations = min_citations
        self.include_arxiv = include_arxiv
        # Settings UIs can deliver the literal strings "False"/"false" or ""
        # for an unset date; normalize those placeholders to None.
        self.from_publication_date = (
            from_publication_date
            if from_publication_date
            and from_publication_date not in ["False", "false", ""]
            else None
        )

        # Fall back to the settings snapshot when no API key was passed in.
        if not api_key and settings_snapshot:
            from ...config.search_config import get_setting_from_snapshot

            try:
                api_key = get_setting_from_snapshot(
                    "search.engine.web.nasa_ads.api_key",
                    settings_snapshot=settings_snapshot,
                )
            except Exception:
                # Best-effort lookup: a missing setting simply means no key.
                pass

        # Normalize "False"/"false"/"" placeholder values to None as well.
        self.api_key = (
            api_key
            if api_key and api_key not in ["False", "false", ""]
            else None
        )

        # API configuration
        self.api_base = "https://api.adsabs.harvard.edu/v1"
        self.headers = {
            "User-Agent": "Local-Deep-Research-Agent",
            "Accept": "application/json",
        }

        if self.api_key:
            self.headers["Authorization"] = f"Bearer {self.api_key}"
            logger.info("Using NASA ADS with API key")
        else:
            logger.error(
                "NASA ADS requires an API key to function. Get a free key at: https://ui.adsabs.harvard.edu/user/settings/token"
            )

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information for NASA ADS search results.

        Args:
            query: The search query (natural language supported)

        Returns:
            List of preview dictionaries (empty on API errors)

        Raises:
            RateLimitError: When the API responds with HTTP 429, so the
                base class retry handling can back off and retry.
        """
        logger.info(f"Searching NASA ADS for: {query}")

        # NASA ADS has good natural language support, so the query is used
        # as-is; optional filters are appended below.
        search_query = query

        # Build filters
        filters = []
        if self.from_publication_date:
            # ADS filters by year, so only the year part of YYYY-MM-DD is used.
            try:
                year = self.from_publication_date.split("-")[0]
                if year.isdigit():  # Only add if it's a valid year
                    filters.append(f"year:{year}-9999")
            except Exception:
                pass  # Skip invalid date formats

        if self.min_citations > 0:
            filters.append(f"citation_count:[{self.min_citations} TO *]")

        if not self.include_arxiv:
            filters.append('-bibstem:"arXiv"')

        # Combine query with filters
        if filters:
            full_query = f"{search_query} {' '.join(filters)}"
        else:
            full_query = search_query

        # Build request parameters; "fl" selects the fields returned per doc.
        params = {
            "q": full_query,
            "fl": "id,bibcode,title,author,year,pubdate,abstract,citation_count,bibstem,doi,identifier,pub,keyword,aff",
            "rows": min(
                self.max_results, 200
            ),  # NASA ADS allows up to 200 per request
            "start": 0,
        }

        # Add sorting; unknown sort_by values fall back to relevance scoring.
        sort_map = {
            "relevance": "score desc",
            "citation_count": "citation_count desc",
            "date": "date desc",
        }
        params["sort"] = sort_map.get(self.sort_by, "score desc")

        try:
            # Apply rate limiting (simple like PubMed)
            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )
            logger.debug(
                f"Applied rate limit wait: {self._last_wait_time:.2f}s"
            )

            # Make the API request
            logger.info(
                f"Making NASA ADS API request with query: {params['q'][:100]}..."
            )
            response = safe_get(
                f"{self.api_base}/search/query",
                params=params,
                headers=self.headers,
                timeout=30,
            )

            # Log rate limit headers if available
            if "X-RateLimit-Remaining" in response.headers:
                remaining = response.headers.get("X-RateLimit-Remaining")
                limit = response.headers.get("X-RateLimit-Limit", "unknown")
                logger.debug(
                    f"NASA ADS rate limit: {remaining}/{limit} requests remaining"
                )

            if response.status_code == 200:
                data = response.json()
                # Hoist the shared "response" envelope instead of extracting
                # it once per field.
                response_body = data.get("response", {})
                docs = response_body.get("docs", [])
                num_found = response_body.get("numFound", 0)

                logger.info(
                    f"NASA ADS returned {len(docs)} results (total available: {num_found:,})"
                )

                # Format results as previews, skipping docs that fail to parse
                previews = []
                for doc in docs:
                    preview = self._format_doc_preview(doc)
                    if preview:
                        previews.append(preview)

                logger.info(f"Successfully formatted {len(previews)} previews")
                return previews

            elif response.status_code == 429:
                # Rate limited
                logger.warning("NASA ADS rate limit reached")
                raise RateLimitError("NASA ADS rate limit exceeded")

            elif response.status_code == 401:
                logger.error("NASA ADS API key is invalid or missing")
                return []

            else:
                logger.error(
                    f"NASA ADS API error: {response.status_code} - {response.text[:200]}"
                )
                return []

        except RateLimitError:
            # Re-raise rate limit errors for base class retry handling
            raise
        except Exception:
            logger.exception("Error searching NASA ADS")
            return []

    def _format_doc_preview(
        self, doc: Dict[str, Any]
    ) -> Optional[Dict[str, Any]]:
        """
        Format a NASA ADS document as a preview dictionary.

        Args:
            doc: NASA ADS document object

        Returns:
            Formatted preview dictionary or None if formatting fails
        """
        try:
            # Extract basic information
            bibcode = doc.get("bibcode", "")
            # ADS returns the title as a single-element list
            title_list = doc.get("title", [])
            title = title_list[0] if title_list else "No title"

            # Get abstract or create snippet
            abstract = doc.get("abstract", "")
            snippet = abstract[:500] if abstract else f"Academic paper: {title}"

            # Get publication info
            year = doc.get("year", "unknown")
            pubdate = doc.get("pubdate", "unknown")

            # Journal/source: prefer the full publication name, then the
            # bibstem abbreviation (which may be a list or a plain string).
            journal = "unknown"
            pub = doc.get("pub")
            if pub:
                journal = pub
            else:
                bibstem = doc.get("bibstem", [])
                if bibstem:
                    journal = (
                        bibstem[0] if isinstance(bibstem, list) else bibstem
                    )

            # Authors: show the first five, then "et al."
            authors = doc.get("author", [])
            authors_str = ", ".join(authors[:5])
            if len(authors) > 5:
                authors_str += " et al."

            # Get metrics
            citation_count = doc.get("citation_count", 0)

            # URL: prefer a DOI link, fall back to the ADS abstract page.
            url = None
            dois = doc.get("doi")
            if dois:
                doi = dois[0] if isinstance(dois, list) else dois
                url = f"https://doi.org/{doi}"

            if not url:
                url = f"https://ui.adsabs.harvard.edu/abs/{bibcode}"

            # Flag ArXiv preprints via the bibstem field
            is_arxiv = "arXiv" in str(doc.get("bibstem", []))

            # Get keywords
            keywords = doc.get("keyword", [])

            preview = {
                "id": bibcode,
                "title": title,
                "link": url,
                "snippet": snippet,
                "authors": authors_str,
                "year": year,
                "date": pubdate,
                "journal": journal,
                "citations": citation_count,
                "abstract": abstract,
                "is_arxiv": is_arxiv,
                "keywords": keywords[:5] if keywords else [],
                "type": "academic_paper",
            }

            return preview

        except Exception:
            logger.exception(
                f"Error formatting NASA ADS document: {doc.get('bibcode', 'unknown')}"
            )
            return None

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for relevant items (NASA ADS provides most content in preview).

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content
        """
        # NASA ADS returns comprehensive data in the initial search,
        # so we don't need a separate full content fetch
        results = []
        for item in relevant_items:
            result = {
                "title": item.get("title", ""),
                "link": item.get("link", ""),
                "snippet": item.get("snippet", ""),
                "content": item.get("abstract", item.get("snippet", "")),
                "metadata": {
                    "authors": item.get("authors", ""),
                    "year": item.get("year", ""),
                    "journal": item.get("journal", ""),
                    "citations": item.get("citations", 0),
                    "is_arxiv": item.get("is_arxiv", False),
                    "keywords": item.get("keywords", []),
                },
            }
            results.append(result)

        return results