Coverage for src / local_deep_research / web_search_engines / engines / search_engine_semantic_scholar.py: 66%
255 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-25 01:07 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-25 01:07 +0000
1import re
2from typing import Any, Dict, List, Optional, Tuple
4import requests
5from langchain_core.language_models import BaseLLM
6from loguru import logger
7from requests.adapters import HTTPAdapter
8from urllib3.util import Retry
10from ...constants import SNIPPET_LENGTH_SHORT
11from ..rate_limiting import RateLimitError
12from ..search_engine_base import BaseSearchEngine
13from ...security import SafeSession
class SemanticScholarSearchEngine(BaseSearchEngine):
    """
    Semantic Scholar search engine implementation with two-phase approach.

    Phase one (`_get_previews`) runs an adaptive keyword search against the
    Semantic Scholar Graph API and returns lightweight preview dicts; phase
    two (`_get_full_content`) enriches only the relevant previews with
    citations, references and embeddings when those were requested.
    Provides efficient access to scientific literature across all fields.
    """

    # Mark as public search engine
    is_public = True
    # Scientific/academic search engine
    is_scientific = True
27 def __init__(
28 self,
29 max_results: int = 10,
30 api_key: Optional[str] = None,
31 year_range: Optional[Tuple[int, int]] = None,
32 get_abstracts: bool = True,
33 get_references: bool = False,
34 get_citations: bool = False,
35 get_embeddings: bool = False,
36 get_tldr: bool = True,
37 citation_limit: int = 10,
38 reference_limit: int = 10,
39 llm: Optional[BaseLLM] = None,
40 max_filtered_results: Optional[int] = None,
41 optimize_queries: bool = True,
42 max_retries: int = 5,
43 retry_backoff_factor: float = 1.0,
44 fields_of_study: Optional[List[str]] = None,
45 publication_types: Optional[List[str]] = None,
46 settings_snapshot: Optional[Dict[str, Any]] = None,
47 **kwargs,
48 ):
49 """
50 Initialize the Semantic Scholar search engine.
52 Args:
53 max_results: Maximum number of search results
54 api_key: Semantic Scholar API key for higher rate limits (optional)
55 year_range: Optional tuple of (start_year, end_year) to filter results
56 get_abstracts: Whether to fetch abstracts for all results
57 get_references: Whether to fetch references for papers
58 get_citations: Whether to fetch citations for papers
59 get_embeddings: Whether to fetch SPECTER embeddings for papers
60 get_tldr: Whether to fetch TLDR summaries for papers
61 citation_limit: Maximum number of citations to fetch per paper
62 reference_limit: Maximum number of references to fetch per paper
63 llm: Language model for relevance filtering
64 max_filtered_results: Maximum number of results to keep after filtering
65 optimize_queries: Whether to optimize natural language queries
66 max_retries: Maximum number of retries for API requests
67 retry_backoff_factor: Backoff factor for retries
68 fields_of_study: List of fields of study to filter results
69 publication_types: List of publication types to filter results
70 settings_snapshot: Settings snapshot for configuration
71 **kwargs: Additional parameters to pass to parent class
72 """
73 # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
74 super().__init__(
75 llm=llm,
76 max_filtered_results=max_filtered_results,
77 max_results=max_results,
78 settings_snapshot=settings_snapshot,
79 **kwargs,
80 )
82 # Get API key from settings if not provided
83 if not api_key and settings_snapshot:
84 from ...config.search_config import get_setting_from_snapshot
86 try:
87 api_key = get_setting_from_snapshot(
88 "search.engine.web.semantic_scholar.api_key",
89 settings_snapshot=settings_snapshot,
90 )
91 except Exception:
92 pass
94 self.api_key = api_key
95 self.year_range = year_range
96 self.get_abstracts = get_abstracts
97 self.get_references = get_references
98 self.get_citations = get_citations
99 self.get_embeddings = get_embeddings
100 self.get_tldr = get_tldr
101 self.citation_limit = citation_limit
102 self.reference_limit = reference_limit
103 self.optimize_queries = optimize_queries
104 self.max_retries = max_retries
105 self.retry_backoff_factor = retry_backoff_factor
106 self.fields_of_study = (
107 self._ensure_list(fields_of_study)
108 if fields_of_study is not None
109 else None
110 )
111 self.publication_types = (
112 self._ensure_list(publication_types)
113 if publication_types is not None
114 else None
115 )
117 # Base API URLs
118 self.base_url = "https://api.semanticscholar.org/graph/v1"
119 self.paper_search_url = f"{self.base_url}/paper/search"
120 self.paper_details_url = f"{self.base_url}/paper"
122 # Create a session with retry capabilities
123 self.session = self._create_session()
125 # Log API key status
126 if self.api_key:
127 logger.info(
128 "Using Semantic Scholar with API key (higher rate limits)"
129 )
130 else:
131 logger.info(
132 "Using Semantic Scholar without API key (lower rate limits)"
133 )
135 def _create_session(self) -> SafeSession:
136 """Create and configure a requests session with retry capabilities"""
137 session = SafeSession()
139 # Configure automatic retries with exponential backoff
140 retry_strategy = Retry(
141 total=self.max_retries,
142 backoff_factor=self.retry_backoff_factor,
143 status_forcelist=[429, 500, 502, 503, 504],
144 allowed_methods={"HEAD", "GET", "POST", "OPTIONS"},
145 )
147 adapter = HTTPAdapter(max_retries=retry_strategy)
148 session.mount("https://", adapter)
150 # Set up headers
151 headers = {"Accept": "application/json"}
152 if self.api_key:
153 headers["x-api-key"] = self.api_key
155 session.headers.update(headers)
157 return session
159 def close(self):
160 """
161 Close the HTTP session and clean up resources.
163 Call this method when done using the search engine to prevent
164 connection/file descriptor leaks.
165 """
166 if hasattr(self, "session") and self.session: 166 ↛ exitline 166 didn't return from function 'close' because the condition on line 166 was always true
167 try:
168 self.session.close()
169 except Exception:
170 logger.exception("Error closing SemanticScholar session")
171 finally:
172 self.session = None
174 def __del__(self):
175 """Destructor to ensure session is closed."""
176 self.close()
178 def __enter__(self):
179 """Context manager entry."""
180 return self
182 def __exit__(self, exc_type, exc_val, exc_tb):
183 """Context manager exit - ensures session cleanup."""
184 self.close()
185 return False
187 def _respect_rate_limit(self):
188 """Apply rate limiting between requests"""
189 # Apply rate limiting before request
190 self._last_wait_time = self.rate_tracker.apply_rate_limit(
191 self.engine_type
192 )
193 logger.debug(f"Applied rate limit wait: {self._last_wait_time:.2f}s")
195 def _make_request(
196 self,
197 url: str,
198 params: Optional[Dict] = None,
199 data: Optional[Dict] = None,
200 method: str = "GET",
201 ) -> Dict:
202 """
203 Make a request to the Semantic Scholar API.
205 Args:
206 url: API endpoint URL
207 params: Query parameters
208 data: JSON data for POST requests
209 method: HTTP method (GET or POST)
211 Returns:
212 API response as dictionary
213 """
214 self._respect_rate_limit()
216 try:
217 if method.upper() == "GET":
218 response = self.session.get(url, params=params, timeout=30)
219 elif method.upper() == "POST":
220 response = self.session.post(
221 url, params=params, json=data, timeout=30
222 )
223 else:
224 raise ValueError(f"Unsupported HTTP method: {method}")
226 # Handle rate limiting
227 if response.status_code == 429:
228 logger.warning("Semantic Scholar rate limit exceeded")
229 raise RateLimitError("Semantic Scholar rate limit exceeded")
231 response.raise_for_status()
232 return response.json()
233 except requests.RequestException:
234 logger.exception("API request failed")
235 return {}
237 def _optimize_query(self, query: str) -> str:
238 """
239 Optimize a natural language query for Semantic Scholar search.
240 If LLM is available, uses it to extract key terms and concepts.
242 Args:
243 query: Natural language query
245 Returns:
246 Optimized query string
247 """
248 if not self.llm or not self.optimize_queries:
249 return query
251 try:
252 prompt = f"""Transform this natural language question into an optimized academic search query.
254Original query: "{query}"
256INSTRUCTIONS:
2571. Extract key academic concepts, technical terms, and proper nouns
2582. Remove generic words, filler words, and non-technical terms
2593. Add quotation marks around specific phrases that should be kept together
2604. Return ONLY the optimized search query with no explanation
2615. Keep it under 100 characters if possible
263EXAMPLE TRANSFORMATIONS:
264"What are the latest findings about mRNA vaccines and COVID-19?" → "mRNA vaccines COVID-19 recent findings"
265"How does machine learning impact climate change prediction?" → "machine learning "climate change" prediction"
266"Tell me about quantum computing approaches for encryption" → "quantum computing encryption"
268Return ONLY the optimized search query with no explanation.
269"""
271 response = self.llm.invoke(prompt)
272 optimized_query = response.content.strip()
274 # Clean up the query - remove any explanations
275 lines = optimized_query.split("\n")
276 optimized_query = lines[0].strip()
278 # Safety check - if query looks too much like an explanation, use original
279 if len(optimized_query.split()) > 15 or ":" in optimized_query: 279 ↛ 280line 279 didn't jump to line 280 because the condition on line 279 was never true
280 logger.warning(
281 "Query optimization result looks too verbose, using original"
282 )
283 return query
285 logger.info(f"Original query: '{query}'")
286 logger.info(f"Optimized for search: '{optimized_query}'")
288 return optimized_query
289 except Exception:
290 logger.exception("Error optimizing query")
291 return query # Fall back to original query on error
293 def _direct_search(self, query: str) -> List[Dict[str, Any]]:
294 """
295 Make a direct search request to the Semantic Scholar API.
297 Args:
298 query: The search query
300 Returns:
301 List of paper dictionaries
302 """
303 try:
304 # Configure fields to retrieve
305 fields = [
306 "paperId",
307 "externalIds",
308 "url",
309 "title",
310 "abstract",
311 "venue",
312 "year",
313 "authors",
314 "citationCount", # Add citation count for ranking
315 "openAccessPdf", # PDF URL for open access papers
316 ]
318 if self.get_tldr: 318 ↛ 321line 318 didn't jump to line 321 because the condition on line 318 was always true
319 fields.append("tldr")
321 params = {
322 "query": query,
323 "limit": min(
324 self.max_results, 100
325 ), # API limit is 100 per request
326 "fields": ",".join(fields),
327 }
329 # Add year filter if specified
330 if self.year_range:
331 start_year, end_year = self.year_range
332 params["year"] = f"{start_year}-{end_year}"
334 # Add fields of study filter if specified
335 if self.fields_of_study: 335 ↛ 336line 335 didn't jump to line 336 because the condition on line 335 was never true
336 params["fieldsOfStudy"] = ",".join(self.fields_of_study)
338 # Add publication types filter if specified
339 if self.publication_types: 339 ↛ 340line 339 didn't jump to line 340 because the condition on line 339 was never true
340 params["publicationTypes"] = ",".join(self.publication_types)
342 response = self._make_request(self.paper_search_url, params)
344 if "data" in response:
345 papers = response["data"]
346 logger.info(
347 f"Found {len(papers)} papers with direct search for query: '{query}'"
348 )
349 return papers
350 else:
351 logger.warning(
352 f"No data in response for direct search query: '{query}'"
353 )
354 return []
356 except Exception:
357 logger.exception("Error in direct search")
358 return []
360 def _adaptive_search(self, query: str) -> Tuple[List[Dict[str, Any]], str]:
361 """
362 Perform an adaptive search that adjusts based on result volume.
363 Uses LLM to generate better fallback queries when available.
365 Args:
366 query: The search query
368 Returns:
369 Tuple of (list of paper results, search strategy used)
370 """
371 # Start with a standard search
372 papers = self._direct_search(query)
373 strategy = "standard"
375 # If no results, try different variations
376 if not papers:
377 # Try removing quotes to broaden search
378 if '"' in query: 378 ↛ 391line 378 didn't jump to line 391 because the condition on line 378 was always true
379 unquoted_query = query.replace('"', "")
380 logger.info(
381 "No results with quoted terms, trying without quotes: %s",
382 unquoted_query,
383 )
384 papers = self._direct_search(unquoted_query)
386 if papers: 386 ↛ 391line 386 didn't jump to line 391 because the condition on line 386 was always true
387 strategy = "unquoted"
388 return papers, strategy
390 # If LLM is available, use it to generate better fallback queries
391 if self.llm:
392 try:
393 # Generate alternate search queries focusing on core concepts
394 prompt = f"""You are helping refine a search query that returned no results.
396Original query: "{query}"
398The query might be too specific or use natural language phrasing that doesn't match academic paper keywords.
400Please provide THREE alternative search queries that:
4011. Focus on the core academic concepts
4022. Use precise terminology commonly found in academic papers
4033. Break down complex queries into more searchable components
4044. Format each as a concise keyword-focused search term (not a natural language question)
406Format each query on a new line with no numbering or explanation. Keep each query under 8 words and very focused.
407"""
408 # Get the LLM's response
409 response = self.llm.invoke(prompt)
411 # Extract the alternative queries
412 alt_queries = []
413 if hasattr(
414 response, "content"
415 ): # Handle various LLM response formats
416 content = response.content
417 alt_queries = [
418 q.strip()
419 for q in content.strip().split("\n")
420 if q.strip()
421 ]
422 elif isinstance(response, str):
423 alt_queries = [
424 q.strip()
425 for q in response.strip().split("\n")
426 if q.strip()
427 ]
429 # Try each alternative query
430 for alt_query in alt_queries[
431 :3
432 ]: # Limit to first 3 alternatives
433 logger.info("Trying LLM-suggested query: %s", alt_query)
434 alt_papers = self._direct_search(alt_query)
436 if alt_papers:
437 logger.info(
438 "Found %s papers using LLM-suggested query: %s",
439 len(alt_papers),
440 alt_query,
441 )
442 strategy = "llm_alternative"
443 return alt_papers, strategy
444 except Exception:
445 logger.exception("Error using LLM for query refinement")
446 # Fall through to simpler strategies
448 # Fallback: Try with the longest words (likely specific terms)
449 words = re.findall(r"\w+", query)
450 longer_words = [word for word in words if len(word) > 6]
451 if longer_words:
452 # Use up to 3 of the longest words
453 longer_words = sorted(longer_words, key=len, reverse=True)[:3]
454 key_terms_query = " ".join(longer_words)
455 logger.info("Trying with key terms: %s", key_terms_query)
456 papers = self._direct_search(key_terms_query)
458 if papers:
459 strategy = "key_terms"
460 return papers, strategy
462 # Final fallback: Try with just the longest word
463 if words:
464 longest_word = max(words, key=len)
465 if len(longest_word) > 5: # Only use if it's reasonably long
466 logger.info("Trying with single key term: %s", longest_word)
467 papers = self._direct_search(longest_word)
469 if papers:
470 strategy = "single_term"
471 return papers, strategy
473 return papers, strategy
475 def _get_paper_details(self, paper_id: str) -> Dict[str, Any]:
476 """
477 Get detailed information about a specific paper.
479 Args:
480 paper_id: Semantic Scholar Paper ID
482 Returns:
483 Dictionary with paper details
484 """
485 try:
486 # Construct fields parameter
487 fields = [
488 "paperId",
489 "externalIds",
490 "corpusId",
491 "url",
492 "title",
493 "abstract",
494 "venue",
495 "year",
496 "authors",
497 "fieldsOfStudy",
498 "citationCount", # Add citation count
499 ]
501 if self.get_tldr:
502 fields.append("tldr")
504 if self.get_embeddings:
505 fields.append("embedding")
507 # Add citation and reference fields if requested
508 if self.get_citations:
509 fields.append(f"citations.limit({self.citation_limit})")
511 if self.get_references:
512 fields.append(f"references.limit({self.reference_limit})")
514 # Make the request
515 url = f"{self.paper_details_url}/{paper_id}"
516 params = {"fields": ",".join(fields)}
518 return self._make_request(url, params)
520 except Exception:
521 logger.exception("Error getting paper details for paper")
522 return {}
524 def _get_previews(self, query: str) -> List[Dict[str, Any]]:
525 """
526 Get preview information for Semantic Scholar papers.
528 Args:
529 query: The search query
531 Returns:
532 List of preview dictionaries
533 """
534 logger.info(f"Getting Semantic Scholar previews for query: {query}")
536 # Optimize the query if LLM is available
537 optimized_query = self._optimize_query(query)
539 # Use the adaptive search approach
540 papers, strategy = self._adaptive_search(optimized_query)
542 if not papers:
543 logger.warning("No Semantic Scholar results found")
544 return []
546 # Format as previews
547 previews = []
548 for paper in papers:
549 try:
550 # Format authors - ensure we have a valid list with string values
551 authors = []
552 if paper.get("authors"):
553 authors = [
554 author.get("name", "")
555 for author in paper["authors"]
556 if author and author.get("name")
557 ]
559 # Ensure we have valid strings for all fields
560 paper_id = paper.get("paperId", "")
561 title = paper.get("title", "")
562 url = paper.get("url", "")
564 # Handle abstract safely, ensuring we always have a string
565 abstract = paper.get("abstract")
566 snippet = ""
567 if abstract:
568 snippet = (
569 abstract[:SNIPPET_LENGTH_SHORT] + "..."
570 if len(abstract) > SNIPPET_LENGTH_SHORT
571 else abstract
572 )
574 venue = paper.get("venue", "")
575 year = paper.get("year")
576 external_ids = paper.get("externalIds", {})
578 # Handle TLDR safely
579 tldr_text = ""
580 if paper.get("tldr") and isinstance(paper.get("tldr"), dict):
581 tldr_text = paper.get("tldr", {}).get("text", "")
583 # Create preview with basic information, ensuring no None values
584 preview = {
585 "id": paper_id if paper_id else "",
586 "title": title if title else "",
587 "link": url if url else "",
588 "snippet": snippet,
589 "authors": authors,
590 "venue": venue if venue else "",
591 "year": year,
592 "external_ids": external_ids if external_ids else {},
593 "source": "Semantic Scholar",
594 "_paper_id": paper_id if paper_id else "",
595 "_search_strategy": strategy,
596 "tldr": tldr_text,
597 }
599 # Store the full paper object for later reference
600 preview["_full_paper"] = paper
602 previews.append(preview)
603 except Exception:
604 logger.exception("Error processing paper preview")
605 # Continue with the next paper
607 # Sort by year (newer first) if available
608 previews = sorted(
609 previews,
610 key=lambda p: p.get("year", 0) if p.get("year") is not None else 0,
611 reverse=True,
612 )
614 logger.info(
615 f"Found {len(previews)} Semantic Scholar previews using strategy: {strategy}"
616 )
617 return previews
619 def _get_full_content(
620 self, relevant_items: List[Dict[str, Any]]
621 ) -> List[Dict[str, Any]]:
622 """
623 Get full content for the relevant Semantic Scholar papers.
624 Gets additional details like citations, references, and full metadata.
626 Args:
627 relevant_items: List of relevant preview dictionaries
629 Returns:
630 List of result dictionaries with full content
631 """
632 # For Semantic Scholar, we already have most content from the preview
633 # Additional API calls are only needed for citations/references
635 logger.info(
636 f"Getting content for {len(relevant_items)} Semantic Scholar papers"
637 )
639 results = []
640 for item in relevant_items:
641 result = item.copy()
642 paper_id = item.get("_paper_id", "")
644 # Skip if no paper ID
645 if not paper_id:
646 results.append(result)
647 continue
649 # Get paper details if citations or references are requested
650 if self.get_citations or self.get_references or self.get_embeddings:
651 paper_details = self._get_paper_details(paper_id)
653 if paper_details: 653 ↛ 673line 653 didn't jump to line 673 because the condition on line 653 was always true
654 # Add citation information
655 if self.get_citations and "citations" in paper_details: 655 ↛ 659line 655 didn't jump to line 659 because the condition on line 655 was always true
656 result["citations"] = paper_details["citations"]
658 # Add reference information
659 if self.get_references and "references" in paper_details: 659 ↛ 660line 659 didn't jump to line 660 because the condition on line 659 was never true
660 result["references"] = paper_details["references"]
662 # Add embedding if available
663 if self.get_embeddings and "embedding" in paper_details: 663 ↛ 664line 663 didn't jump to line 664 because the condition on line 663 was never true
664 result["embedding"] = paper_details["embedding"]
666 # Add fields of study
667 if "fieldsOfStudy" in paper_details: 667 ↛ 668line 667 didn't jump to line 668 because the condition on line 667 was never true
668 result["fields_of_study"] = paper_details[
669 "fieldsOfStudy"
670 ]
672 # Remove temporary fields
673 if "_paper_id" in result: 673 ↛ 675line 673 didn't jump to line 675 because the condition on line 673 was always true
674 del result["_paper_id"]
675 if "_search_strategy" in result: 675 ↛ 677line 675 didn't jump to line 677 because the condition on line 675 was always true
676 del result["_search_strategy"]
677 if "_full_paper" in result: 677 ↛ 680line 677 didn't jump to line 680 because the condition on line 677 was always true
678 del result["_full_paper"]
680 results.append(result)
682 return results