# src/local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py
import re
from typing import Any, Dict, List, Optional, Tuple

import requests
from langchain_core.language_models import BaseLLM
from loguru import logger
from requests.adapters import HTTPAdapter
from urllib3.util import Retry

from ..rate_limiting import RateLimitError
from ..search_engine_base import BaseSearchEngine
from ...security import SafeSession


class SemanticScholarSearchEngine(BaseSearchEngine):
    """
    Semantic Scholar search engine implementation with a two-phase approach.
    Provides efficient access to scientific literature across all fields.
    """

    # Mark as public search engine
    is_public = True
    # Scientific/academic search engine
    is_scientific = True

    def __init__(
        self,
        max_results: int = 10,
        api_key: Optional[str] = None,
        year_range: Optional[Tuple[int, int]] = None,
        get_abstracts: bool = True,
        get_references: bool = False,
        get_citations: bool = False,
        get_embeddings: bool = False,
        get_tldr: bool = True,
        citation_limit: int = 10,
        reference_limit: int = 10,
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        optimize_queries: bool = True,
        max_retries: int = 5,
        retry_backoff_factor: float = 1.0,
        fields_of_study: Optional[List[str]] = None,
        publication_types: Optional[List[str]] = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """
        Initialize the Semantic Scholar search engine.

        Args:
            max_results: Maximum number of search results
            api_key: Semantic Scholar API key for higher rate limits (optional)
            year_range: Optional tuple of (start_year, end_year) to filter results
            get_abstracts: Whether to fetch abstracts for all results
            get_references: Whether to fetch references for papers
            get_citations: Whether to fetch citations for papers
            get_embeddings: Whether to fetch SPECTER embeddings for papers
            get_tldr: Whether to fetch TLDR summaries for papers
            citation_limit: Maximum number of citations to fetch per paper
            reference_limit: Maximum number of references to fetch per paper
            llm: Language model for relevance filtering
            max_filtered_results: Maximum number of results to keep after filtering
            optimize_queries: Whether to optimize natural language queries
            max_retries: Maximum number of retries for API requests
            retry_backoff_factor: Backoff factor for retries
            fields_of_study: List of fields of study to filter results
            publication_types: List of publication types to filter results
            settings_snapshot: Settings snapshot for configuration
            **kwargs: Additional parameters to pass to parent class
        """
        # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            settings_snapshot=settings_snapshot,
            **kwargs,
        )

        # Get API key from settings if not provided
        if not api_key and settings_snapshot:
            from ...config.search_config import get_setting_from_snapshot

            try:
                api_key = get_setting_from_snapshot(
                    "search.engine.web.semantic_scholar.api_key",
                    settings_snapshot=settings_snapshot,
                )
            except Exception:
                pass

        self.api_key = api_key
        self.year_range = year_range
        self.get_abstracts = get_abstracts
        self.get_references = get_references
        self.get_citations = get_citations
        self.get_embeddings = get_embeddings
        self.get_tldr = get_tldr
        self.citation_limit = citation_limit
        self.reference_limit = reference_limit
        self.optimize_queries = optimize_queries
        self.max_retries = max_retries
        self.retry_backoff_factor = retry_backoff_factor
        self.fields_of_study = fields_of_study
        self.publication_types = publication_types

        # Base API URLs
        self.base_url = "https://api.semanticscholar.org/graph/v1"
        self.paper_search_url = f"{self.base_url}/paper/search"
        self.paper_details_url = f"{self.base_url}/paper"

        # Create a session with retry capabilities
        self.session = self._create_session()

        # Log API key status
        if self.api_key:
            logger.info(
                "Using Semantic Scholar with API key (higher rate limits)"
            )
        else:
            logger.info(
                "Using Semantic Scholar without API key (lower rate limits)"
            )

    def _create_session(self) -> SafeSession:
        """Create and configure a requests session with retry capabilities."""
        session = SafeSession()

        # Configure automatic retries with exponential backoff
        retry_strategy = Retry(
            total=self.max_retries,
            backoff_factor=self.retry_backoff_factor,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods={"HEAD", "GET", "POST", "OPTIONS"},
        )

        adapter = HTTPAdapter(max_retries=retry_strategy)
        session.mount("https://", adapter)

        # Set up headers
        headers = {"Accept": "application/json"}
        if self.api_key:
            headers["x-api-key"] = self.api_key

        session.headers.update(headers)

        return session
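
    # Retry timing note (informational, based on urllib3's documented backoff
    # formula of roughly backoff_factor * 2**(retry_number - 1) seconds): with
    # the default retry_backoff_factor of 1.0 the waits are on the order of
    # 1s, 2s, 4s, 8s, ...; the exact schedule depends on the installed
    # urllib3 version.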

    def _respect_rate_limit(self):
        """Apply rate limiting between requests."""
        # Apply rate limiting before request
        self._last_wait_time = self.rate_tracker.apply_rate_limit(
            self.engine_type
        )
        logger.debug(f"Applied rate limit wait: {self._last_wait_time:.2f}s")

    def _make_request(
        self,
        url: str,
        params: Optional[Dict] = None,
        data: Optional[Dict] = None,
        method: str = "GET",
    ) -> Dict:
        """
        Make a request to the Semantic Scholar API.

        Args:
            url: API endpoint URL
            params: Query parameters
            data: JSON data for POST requests
            method: HTTP method (GET or POST)

        Returns:
            API response as dictionary
        """
        self._respect_rate_limit()

        try:
            if method.upper() == "GET":
                response = self.session.get(url, params=params, timeout=30)
            elif method.upper() == "POST":
                response = self.session.post(
                    url, params=params, json=data, timeout=30
                )
            else:
                raise ValueError(f"Unsupported HTTP method: {method}")

            # Handle rate limiting
            if response.status_code == 429:
                logger.warning("Semantic Scholar rate limit exceeded")
                raise RateLimitError("Semantic Scholar rate limit exceeded")

            response.raise_for_status()
            return response.json()
        except requests.RequestException:
            logger.exception("API request failed")
            return {}
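
    # Failure behavior of _make_request, as implemented above: a 429 response
    # raises RateLimitError so callers can back off, while any
    # requests.RequestException (timeouts, HTTP errors from raise_for_status(),
    # exhausted session-level retries) is logged and swallowed, yielding an
    # empty dict.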

    def _optimize_query(self, query: str) -> str:
        """
        Optimize a natural language query for Semantic Scholar search.
        If an LLM is available, uses it to extract key terms and concepts.

        Args:
            query: Natural language query

        Returns:
            Optimized query string
        """
        if not self.llm or not self.optimize_queries:
            return query

        try:
            prompt = f"""Transform this natural language question into an optimized academic search query.

Original query: "{query}"

INSTRUCTIONS:
1. Extract key academic concepts, technical terms, and proper nouns
2. Remove generic words, filler words, and non-technical terms
3. Add quotation marks around specific phrases that should be kept together
4. Return ONLY the optimized search query with no explanation
5. Keep it under 100 characters if possible

EXAMPLE TRANSFORMATIONS:
"What are the latest findings about mRNA vaccines and COVID-19?" → "mRNA vaccines COVID-19 recent findings"
"How does machine learning impact climate change prediction?" → "machine learning "climate change" prediction"
"Tell me about quantum computing approaches for encryption" → "quantum computing encryption"

Return ONLY the optimized search query with no explanation.
"""

            response = self.llm.invoke(prompt)
            optimized_query = response.content.strip()

            # Clean up the query - remove any explanations
            lines = optimized_query.split("\n")
            optimized_query = lines[0].strip()

            # Safety check - if the query looks too much like an explanation, use the original
            if len(optimized_query.split()) > 15 or ":" in optimized_query:
                logger.warning(
                    "Query optimization result looks too verbose, using original"
                )
                return query

            logger.info(f"Original query: '{query}'")
            logger.info(f"Optimized for search: '{optimized_query}'")

            return optimized_query
        except Exception:
            logger.exception("Error optimizing query")
            return query  # Fall back to the original query on error
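
    # Guardrails applied above: only the first line of the LLM reply is kept,
    # and a reply longer than 15 words or containing ":" is treated as an
    # explanation rather than a query, so the user's original text is used
    # instead.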

    def _direct_search(self, query: str) -> List[Dict[str, Any]]:
        """
        Make a direct search request to the Semantic Scholar API.

        Args:
            query: The search query

        Returns:
            List of paper dictionaries
        """
        try:
            # Configure fields to retrieve
            fields = [
                "paperId",
                "externalIds",
                "url",
                "title",
                "abstract",
                "venue",
                "year",
                "authors",
                "citationCount",  # Add citation count for ranking
                "openAccessPdf",  # PDF URL for open access papers
            ]

            if self.get_tldr:
                fields.append("tldr")

            params = {
                "query": query,
                "limit": min(
                    self.max_results, 100
                ),  # API limit is 100 per request
                "fields": ",".join(fields),
            }

            # Add year filter if specified
            if self.year_range:
                start_year, end_year = self.year_range
                params["year"] = f"{start_year}-{end_year}"

            # Add fields of study filter if specified
            if self.fields_of_study:
                params["fieldsOfStudy"] = ",".join(self.fields_of_study)

            # Add publication types filter if specified
            if self.publication_types:
                params["publicationTypes"] = ",".join(self.publication_types)

            response = self._make_request(self.paper_search_url, params)

            if "data" in response:
                papers = response["data"]
                logger.info(
                    f"Found {len(papers)} papers with direct search for query: '{query}'"
                )
                return papers
            else:
                logger.warning(
                    f"No data in response for direct search query: '{query}'"
                )
                return []

        except Exception:
            logger.exception("Error in direct search")
            return []
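
    # Illustrative request shape produced by _direct_search (the query value
    # is hypothetical; the endpoint and parameter names come from the code
    # above, with default settings):
    #   GET https://api.semanticscholar.org/graph/v1/paper/search
    #       ?query=quantum computing encryption
    #       &limit=10
    #       &fields=paperId,externalIds,url,title,abstract,venue,year,authors,citationCount,openAccessPdf,tldr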

    def _adaptive_search(self, query: str) -> Tuple[List[Dict[str, Any]], str]:
        """
        Perform an adaptive search that adjusts based on result volume.
        Uses the LLM to generate better fallback queries when available.

        Args:
            query: The search query

        Returns:
            Tuple of (list of paper results, search strategy used)
        """
        # Start with a standard search
        papers = self._direct_search(query)
        strategy = "standard"

        # If no results, try different variations
        if not papers:
            # Try removing quotes to broaden the search
            if '"' in query:
                unquoted_query = query.replace('"', "")
                logger.info(
                    "No results with quoted terms, trying without quotes: {}",
                    unquoted_query,
                )
                papers = self._direct_search(unquoted_query)

                if papers:
                    strategy = "unquoted"
                    return papers, strategy

            # If an LLM is available, use it to generate better fallback queries
            if self.llm:
                try:
                    # Generate alternate search queries focusing on core concepts
                    prompt = f"""You are helping refine a search query that returned no results.

Original query: "{query}"

The query might be too specific or use natural language phrasing that doesn't match academic paper keywords.

Please provide THREE alternative search queries that:
1. Focus on the core academic concepts
2. Use precise terminology commonly found in academic papers
3. Break down complex queries into more searchable components
4. Format each as a concise keyword-focused search term (not a natural language question)

Format each query on a new line with no numbering or explanation. Keep each query under 8 words and very focused.
"""
                    # Get the LLM's response
                    response = self.llm.invoke(prompt)

                    # Extract the alternative queries
                    alt_queries = []
                    if hasattr(
                        response, "content"
                    ):  # Handle various LLM response formats
                        content = response.content
                        alt_queries = [
                            q.strip()
                            for q in content.strip().split("\n")
                            if q.strip()
                        ]
                    elif isinstance(response, str):
                        alt_queries = [
                            q.strip()
                            for q in response.strip().split("\n")
                            if q.strip()
                        ]

                    # Try each alternative query, limited to the first three
                    for alt_query in alt_queries[:3]:
                        logger.info("Trying LLM-suggested query: {}", alt_query)
                        alt_papers = self._direct_search(alt_query)

                        if alt_papers:
                            logger.info(
                                "Found {} papers using LLM-suggested query: {}",
                                len(alt_papers),
                                alt_query,
                            )
                            strategy = "llm_alternative"
                            return alt_papers, strategy
                except Exception:
                    logger.exception("Error using LLM for query refinement")
                    # Fall through to simpler strategies

            # Fallback: try with the longest words (likely specific terms)
            words = re.findall(r"\w+", query)
            longer_words = [word for word in words if len(word) > 6]
            if longer_words:
                # Use up to 3 of the longest words
                longer_words = sorted(longer_words, key=len, reverse=True)[:3]
                key_terms_query = " ".join(longer_words)
                logger.info("Trying with key terms: {}", key_terms_query)
                papers = self._direct_search(key_terms_query)

                if papers:
                    strategy = "key_terms"
                    return papers, strategy

            # Final fallback: try with just the longest word
            if words:
                longest_word = max(words, key=len)
                if len(longest_word) > 5:  # Only use if it's reasonably long
                    logger.info("Trying with single key term: {}", longest_word)
                    papers = self._direct_search(longest_word)

                    if papers:
                        strategy = "single_term"
                        return papers, strategy

        return papers, strategy
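
    # Fallback ladder used by _adaptive_search, in order: "standard" (query
    # as given), "unquoted" (quotes stripped), "llm_alternative"
    # (LLM-suggested reformulations), "key_terms" (up to three longest words),
    # "single_term" (the single longest word). The returned strategy string
    # records which rung produced results.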

    def _get_paper_details(self, paper_id: str) -> Dict[str, Any]:
        """
        Get detailed information about a specific paper.

        Args:
            paper_id: Semantic Scholar paper ID

        Returns:
            Dictionary with paper details
        """
        try:
            # Construct the fields parameter
            fields = [
                "paperId",
                "externalIds",
                "corpusId",
                "url",
                "title",
                "abstract",
                "venue",
                "year",
                "authors",
                "fieldsOfStudy",
                "citationCount",  # Add citation count
            ]

            if self.get_tldr:
                fields.append("tldr")

            if self.get_embeddings:
                fields.append("embedding")

            # Add citation and reference fields if requested
            if self.get_citations:
                fields.append(f"citations.limit({self.citation_limit})")

            if self.get_references:
                fields.append(f"references.limit({self.reference_limit})")

            # Make the request
            url = f"{self.paper_details_url}/{paper_id}"
            params = {"fields": ",".join(fields)}

            return self._make_request(url, params)

        except Exception:
            logger.exception("Error getting paper details")
            return {}

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information for Semantic Scholar papers.

        Args:
            query: The search query

        Returns:
            List of preview dictionaries
        """
        logger.info(f"Getting Semantic Scholar previews for query: {query}")

        # Optimize the query if LLM is available
        optimized_query = self._optimize_query(query)

        # Use the adaptive search approach
        papers, strategy = self._adaptive_search(optimized_query)

        if not papers:
            logger.warning("No Semantic Scholar results found")
            return []

        # Format as previews
        previews = []
        for paper in papers:
            try:
                # Format authors - ensure we have a valid list with string values
                authors = []
                if paper.get("authors"):
                    authors = [
                        author.get("name", "")
                        for author in paper["authors"]
                        if author and author.get("name")
                    ]

                # Ensure we have valid strings for all fields
                paper_id = paper.get("paperId", "")
                title = paper.get("title", "")
                url = paper.get("url", "")

                # Handle abstract safely, ensuring we always have a string
                abstract = paper.get("abstract")
                snippet = ""
                if abstract:
                    snippet = (
                        abstract[:250] + "..."
                        if len(abstract) > 250
                        else abstract
                    )

                venue = paper.get("venue", "")
                year = paper.get("year")
                external_ids = paper.get("externalIds", {})

                # Handle TLDR safely
                tldr_text = ""
                if paper.get("tldr") and isinstance(paper.get("tldr"), dict):
                    tldr_text = paper.get("tldr", {}).get("text", "")

                # Create preview with basic information, ensuring no None values
                preview = {
                    "id": paper_id if paper_id else "",
                    "title": title if title else "",
                    "link": url if url else "",
                    "snippet": snippet,
                    "authors": authors,
                    "venue": venue if venue else "",
                    "year": year,
                    "external_ids": external_ids if external_ids else {},
                    "source": "Semantic Scholar",
                    "_paper_id": paper_id if paper_id else "",
                    "_search_strategy": strategy,
                    "tldr": tldr_text,
                }

                # Store the full paper object for later reference
                preview["_full_paper"] = paper

                previews.append(preview)
            except Exception:
                logger.exception("Error processing paper preview")
                # Continue with the next paper

        # Sort by year (newer first) if available
        previews = sorted(
            previews,
            key=lambda p: p.get("year", 0) if p.get("year") is not None else 0,
            reverse=True,
        )

        logger.info(
            f"Found {len(previews)} Semantic Scholar previews using strategy: {strategy}"
        )
        return previews
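
    # Shape of a single preview produced above (all values hypothetical, for
    # illustration only):
    # {
    #     "id": "<paperId>",
    #     "title": "Example Paper Title",
    #     "link": "https://www.semanticscholar.org/paper/<paperId>",
    #     "snippet": "First 250 characters of the abstract...",
    #     "authors": ["A. Author", "B. Author"],
    #     "venue": "Example Venue",
    #     "year": 2023,
    #     "external_ids": {"DOI": "10.0000/example"},
    #     "source": "Semantic Scholar",
    #     "_paper_id": "<paperId>",
    #     "_search_strategy": "standard",
    #     "tldr": "One-sentence machine-generated summary.",
    #     "_full_paper": {...},
    # }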

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant Semantic Scholar papers.
        Gets additional details like citations, references, and full metadata.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content
        """
        # For Semantic Scholar, we already have most content from the preview.
        # Additional API calls are only needed for citations/references.

        logger.info(
            f"Getting content for {len(relevant_items)} Semantic Scholar papers"
        )

        results = []
        for item in relevant_items:
            result = item.copy()
            paper_id = item.get("_paper_id", "")

            # Skip if no paper ID
            if not paper_id:
                results.append(result)
                continue

            # Get paper details if citations or references are requested
            if self.get_citations or self.get_references or self.get_embeddings:
                paper_details = self._get_paper_details(paper_id)

                if paper_details:
                    # Add citation information
                    if self.get_citations and "citations" in paper_details:
                        result["citations"] = paper_details["citations"]

                    # Add reference information
                    if self.get_references and "references" in paper_details:
                        result["references"] = paper_details["references"]

                    # Add embedding if available
                    if self.get_embeddings and "embedding" in paper_details:
                        result["embedding"] = paper_details["embedding"]

                    # Add fields of study
                    if "fieldsOfStudy" in paper_details:
                        result["fields_of_study"] = paper_details[
                            "fieldsOfStudy"
                        ]

            # Remove temporary fields
            if "_paper_id" in result:
                del result["_paper_id"]
            if "_search_strategy" in result:
                del result["_search_strategy"]
            if "_full_paper" in result:
                del result["_full_paper"]

            results.append(result)

        return results
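

# Minimal usage sketch (illustrative, not part of the original module). It
# exercises only what is defined in this file; real callers presumably go
# through BaseSearchEngine's public entry point, and the relative imports
# above mean this module must be run from within its package.
if __name__ == "__main__":
    engine = SemanticScholarSearchEngine(
        max_results=5,
        year_range=(2020, 2024),
        fields_of_study=["Computer Science"],
    )
    # _get_previews is internal; it is called directly here only for demonstration.
    previews = engine._get_previews("quantum computing encryption")
    for p in previews:
        print(p.get("year"), p.get("title"))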