Coverage for src / local_deep_research / web_search_engines / engines / search_engine_semantic_scholar.py: 66%

255 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-02-25 01:07 +0000

1import re 

2from typing import Any, Dict, List, Optional, Tuple 

3 

4import requests 

5from langchain_core.language_models import BaseLLM 

6from loguru import logger 

7from requests.adapters import HTTPAdapter 

8from urllib3.util import Retry 

9 

10from ...constants import SNIPPET_LENGTH_SHORT 

11from ..rate_limiting import RateLimitError 

12from ..search_engine_base import BaseSearchEngine 

13from ...security import SafeSession 

14 

15 

class SemanticScholarSearchEngine(BaseSearchEngine):
    """
    Semantic Scholar search engine implementation with two-phase approach.
    Provides efficient access to scientific literature across all fields.

    Phase 1 (_get_previews) fetches lightweight previews via the Graph API;
    phase 2 (_get_full_content) enriches only the relevant papers with
    citations/references/embeddings when those were requested.
    """

    # Mark as public search engine (usable without credentials; an API key
    # only raises the rate limits)
    is_public = True
    # Scientific/academic search engine
    is_scientific = True

26 

27 def __init__( 

28 self, 

29 max_results: int = 10, 

30 api_key: Optional[str] = None, 

31 year_range: Optional[Tuple[int, int]] = None, 

32 get_abstracts: bool = True, 

33 get_references: bool = False, 

34 get_citations: bool = False, 

35 get_embeddings: bool = False, 

36 get_tldr: bool = True, 

37 citation_limit: int = 10, 

38 reference_limit: int = 10, 

39 llm: Optional[BaseLLM] = None, 

40 max_filtered_results: Optional[int] = None, 

41 optimize_queries: bool = True, 

42 max_retries: int = 5, 

43 retry_backoff_factor: float = 1.0, 

44 fields_of_study: Optional[List[str]] = None, 

45 publication_types: Optional[List[str]] = None, 

46 settings_snapshot: Optional[Dict[str, Any]] = None, 

47 **kwargs, 

48 ): 

49 """ 

50 Initialize the Semantic Scholar search engine. 

51 

52 Args: 

53 max_results: Maximum number of search results 

54 api_key: Semantic Scholar API key for higher rate limits (optional) 

55 year_range: Optional tuple of (start_year, end_year) to filter results 

56 get_abstracts: Whether to fetch abstracts for all results 

57 get_references: Whether to fetch references for papers 

58 get_citations: Whether to fetch citations for papers 

59 get_embeddings: Whether to fetch SPECTER embeddings for papers 

60 get_tldr: Whether to fetch TLDR summaries for papers 

61 citation_limit: Maximum number of citations to fetch per paper 

62 reference_limit: Maximum number of references to fetch per paper 

63 llm: Language model for relevance filtering 

64 max_filtered_results: Maximum number of results to keep after filtering 

65 optimize_queries: Whether to optimize natural language queries 

66 max_retries: Maximum number of retries for API requests 

67 retry_backoff_factor: Backoff factor for retries 

68 fields_of_study: List of fields of study to filter results 

69 publication_types: List of publication types to filter results 

70 settings_snapshot: Settings snapshot for configuration 

71 **kwargs: Additional parameters to pass to parent class 

72 """ 

73 # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results 

74 super().__init__( 

75 llm=llm, 

76 max_filtered_results=max_filtered_results, 

77 max_results=max_results, 

78 settings_snapshot=settings_snapshot, 

79 **kwargs, 

80 ) 

81 

82 # Get API key from settings if not provided 

83 if not api_key and settings_snapshot: 

84 from ...config.search_config import get_setting_from_snapshot 

85 

86 try: 

87 api_key = get_setting_from_snapshot( 

88 "search.engine.web.semantic_scholar.api_key", 

89 settings_snapshot=settings_snapshot, 

90 ) 

91 except Exception: 

92 pass 

93 

94 self.api_key = api_key 

95 self.year_range = year_range 

96 self.get_abstracts = get_abstracts 

97 self.get_references = get_references 

98 self.get_citations = get_citations 

99 self.get_embeddings = get_embeddings 

100 self.get_tldr = get_tldr 

101 self.citation_limit = citation_limit 

102 self.reference_limit = reference_limit 

103 self.optimize_queries = optimize_queries 

104 self.max_retries = max_retries 

105 self.retry_backoff_factor = retry_backoff_factor 

106 self.fields_of_study = ( 

107 self._ensure_list(fields_of_study) 

108 if fields_of_study is not None 

109 else None 

110 ) 

111 self.publication_types = ( 

112 self._ensure_list(publication_types) 

113 if publication_types is not None 

114 else None 

115 ) 

116 

117 # Base API URLs 

118 self.base_url = "https://api.semanticscholar.org/graph/v1" 

119 self.paper_search_url = f"{self.base_url}/paper/search" 

120 self.paper_details_url = f"{self.base_url}/paper" 

121 

122 # Create a session with retry capabilities 

123 self.session = self._create_session() 

124 

125 # Log API key status 

126 if self.api_key: 

127 logger.info( 

128 "Using Semantic Scholar with API key (higher rate limits)" 

129 ) 

130 else: 

131 logger.info( 

132 "Using Semantic Scholar without API key (lower rate limits)" 

133 ) 

134 

135 def _create_session(self) -> SafeSession: 

136 """Create and configure a requests session with retry capabilities""" 

137 session = SafeSession() 

138 

139 # Configure automatic retries with exponential backoff 

140 retry_strategy = Retry( 

141 total=self.max_retries, 

142 backoff_factor=self.retry_backoff_factor, 

143 status_forcelist=[429, 500, 502, 503, 504], 

144 allowed_methods={"HEAD", "GET", "POST", "OPTIONS"}, 

145 ) 

146 

147 adapter = HTTPAdapter(max_retries=retry_strategy) 

148 session.mount("https://", adapter) 

149 

150 # Set up headers 

151 headers = {"Accept": "application/json"} 

152 if self.api_key: 

153 headers["x-api-key"] = self.api_key 

154 

155 session.headers.update(headers) 

156 

157 return session 

158 

159 def close(self): 

160 """ 

161 Close the HTTP session and clean up resources. 

162 

163 Call this method when done using the search engine to prevent 

164 connection/file descriptor leaks. 

165 """ 

166 if hasattr(self, "session") and self.session: 166 ↛ exitline 166 didn't return from function 'close' because the condition on line 166 was always true

167 try: 

168 self.session.close() 

169 except Exception: 

170 logger.exception("Error closing SemanticScholar session") 

171 finally: 

172 self.session = None 

173 

174 def __del__(self): 

175 """Destructor to ensure session is closed.""" 

176 self.close() 

177 

    def __enter__(self):
        """Context manager entry: return the engine itself for ``with`` use."""
        return self

181 

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit - ensures session cleanup."""
        self.close()
        # Returning False lets any exception from the with-body propagate.
        return False

186 

187 def _respect_rate_limit(self): 

188 """Apply rate limiting between requests""" 

189 # Apply rate limiting before request 

190 self._last_wait_time = self.rate_tracker.apply_rate_limit( 

191 self.engine_type 

192 ) 

193 logger.debug(f"Applied rate limit wait: {self._last_wait_time:.2f}s") 

194 

195 def _make_request( 

196 self, 

197 url: str, 

198 params: Optional[Dict] = None, 

199 data: Optional[Dict] = None, 

200 method: str = "GET", 

201 ) -> Dict: 

202 """ 

203 Make a request to the Semantic Scholar API. 

204 

205 Args: 

206 url: API endpoint URL 

207 params: Query parameters 

208 data: JSON data for POST requests 

209 method: HTTP method (GET or POST) 

210 

211 Returns: 

212 API response as dictionary 

213 """ 

214 self._respect_rate_limit() 

215 

216 try: 

217 if method.upper() == "GET": 

218 response = self.session.get(url, params=params, timeout=30) 

219 elif method.upper() == "POST": 

220 response = self.session.post( 

221 url, params=params, json=data, timeout=30 

222 ) 

223 else: 

224 raise ValueError(f"Unsupported HTTP method: {method}") 

225 

226 # Handle rate limiting 

227 if response.status_code == 429: 

228 logger.warning("Semantic Scholar rate limit exceeded") 

229 raise RateLimitError("Semantic Scholar rate limit exceeded") 

230 

231 response.raise_for_status() 

232 return response.json() 

233 except requests.RequestException: 

234 logger.exception("API request failed") 

235 return {} 

236 

237 def _optimize_query(self, query: str) -> str: 

238 """ 

239 Optimize a natural language query for Semantic Scholar search. 

240 If LLM is available, uses it to extract key terms and concepts. 

241 

242 Args: 

243 query: Natural language query 

244 

245 Returns: 

246 Optimized query string 

247 """ 

248 if not self.llm or not self.optimize_queries: 

249 return query 

250 

251 try: 

252 prompt = f"""Transform this natural language question into an optimized academic search query. 

253 

254Original query: "{query}" 

255 

256INSTRUCTIONS: 

2571. Extract key academic concepts, technical terms, and proper nouns 

2582. Remove generic words, filler words, and non-technical terms 

2593. Add quotation marks around specific phrases that should be kept together 

2604. Return ONLY the optimized search query with no explanation 

2615. Keep it under 100 characters if possible 

262 

263EXAMPLE TRANSFORMATIONS: 

264"What are the latest findings about mRNA vaccines and COVID-19?" → "mRNA vaccines COVID-19 recent findings" 

265"How does machine learning impact climate change prediction?" → "machine learning "climate change" prediction" 

266"Tell me about quantum computing approaches for encryption" → "quantum computing encryption" 

267 

268Return ONLY the optimized search query with no explanation. 

269""" 

270 

271 response = self.llm.invoke(prompt) 

272 optimized_query = response.content.strip() 

273 

274 # Clean up the query - remove any explanations 

275 lines = optimized_query.split("\n") 

276 optimized_query = lines[0].strip() 

277 

278 # Safety check - if query looks too much like an explanation, use original 

279 if len(optimized_query.split()) > 15 or ":" in optimized_query: 279 ↛ 280line 279 didn't jump to line 280 because the condition on line 279 was never true

280 logger.warning( 

281 "Query optimization result looks too verbose, using original" 

282 ) 

283 return query 

284 

285 logger.info(f"Original query: '{query}'") 

286 logger.info(f"Optimized for search: '{optimized_query}'") 

287 

288 return optimized_query 

289 except Exception: 

290 logger.exception("Error optimizing query") 

291 return query # Fall back to original query on error 

292 

293 def _direct_search(self, query: str) -> List[Dict[str, Any]]: 

294 """ 

295 Make a direct search request to the Semantic Scholar API. 

296 

297 Args: 

298 query: The search query 

299 

300 Returns: 

301 List of paper dictionaries 

302 """ 

303 try: 

304 # Configure fields to retrieve 

305 fields = [ 

306 "paperId", 

307 "externalIds", 

308 "url", 

309 "title", 

310 "abstract", 

311 "venue", 

312 "year", 

313 "authors", 

314 "citationCount", # Add citation count for ranking 

315 "openAccessPdf", # PDF URL for open access papers 

316 ] 

317 

318 if self.get_tldr: 318 ↛ 321line 318 didn't jump to line 321 because the condition on line 318 was always true

319 fields.append("tldr") 

320 

321 params = { 

322 "query": query, 

323 "limit": min( 

324 self.max_results, 100 

325 ), # API limit is 100 per request 

326 "fields": ",".join(fields), 

327 } 

328 

329 # Add year filter if specified 

330 if self.year_range: 

331 start_year, end_year = self.year_range 

332 params["year"] = f"{start_year}-{end_year}" 

333 

334 # Add fields of study filter if specified 

335 if self.fields_of_study: 335 ↛ 336line 335 didn't jump to line 336 because the condition on line 335 was never true

336 params["fieldsOfStudy"] = ",".join(self.fields_of_study) 

337 

338 # Add publication types filter if specified 

339 if self.publication_types: 339 ↛ 340line 339 didn't jump to line 340 because the condition on line 339 was never true

340 params["publicationTypes"] = ",".join(self.publication_types) 

341 

342 response = self._make_request(self.paper_search_url, params) 

343 

344 if "data" in response: 

345 papers = response["data"] 

346 logger.info( 

347 f"Found {len(papers)} papers with direct search for query: '{query}'" 

348 ) 

349 return papers 

350 else: 

351 logger.warning( 

352 f"No data in response for direct search query: '{query}'" 

353 ) 

354 return [] 

355 

356 except Exception: 

357 logger.exception("Error in direct search") 

358 return [] 

359 

    def _adaptive_search(self, query: str) -> Tuple[List[Dict[str, Any]], str]:
        """
        Perform an adaptive search that adjusts based on result volume.
        Uses LLM to generate better fallback queries when available.

        Fallback order when the standard search returns nothing:
        1. Retry with quotation marks stripped (broader matching).
        2. Up to three LLM-suggested alternative queries.
        3. The three longest words of the query (likely specific terms).
        4. The single longest word, if reasonably long.

        Args:
            query: The search query

        Returns:
            Tuple of (list of paper results, search strategy used); the
            strategy is one of "standard", "unquoted", "llm_alternative",
            "key_terms", or "single_term"
        """
        # Start with a standard search
        papers = self._direct_search(query)
        strategy = "standard"

        # If no results, try different variations
        if not papers:
            # Try removing quotes to broaden search
            if '"' in query:
                unquoted_query = query.replace('"', "")
                logger.info(
                    "No results with quoted terms, trying without quotes: %s",
                    unquoted_query,
                )
                papers = self._direct_search(unquoted_query)

                if papers:
                    strategy = "unquoted"
                    return papers, strategy

            # If LLM is available, use it to generate better fallback queries
            if self.llm:
                try:
                    # Generate alternate search queries focusing on core concepts
                    prompt = f"""You are helping refine a search query that returned no results.

Original query: "{query}"

The query might be too specific or use natural language phrasing that doesn't match academic paper keywords.

Please provide THREE alternative search queries that:
1. Focus on the core academic concepts
2. Use precise terminology commonly found in academic papers
3. Break down complex queries into more searchable components
4. Format each as a concise keyword-focused search term (not a natural language question)

Format each query on a new line with no numbering or explanation. Keep each query under 8 words and very focused.
"""
                    # Get the LLM's response
                    response = self.llm.invoke(prompt)

                    # Extract the alternative queries, one per non-empty line
                    alt_queries = []
                    if hasattr(
                        response, "content"
                    ):  # Handle various LLM response formats
                        content = response.content
                        alt_queries = [
                            q.strip()
                            for q in content.strip().split("\n")
                            if q.strip()
                        ]
                    elif isinstance(response, str):
                        alt_queries = [
                            q.strip()
                            for q in response.strip().split("\n")
                            if q.strip()
                        ]

                    # Try each alternative query, returning on first success
                    for alt_query in alt_queries[
                        :3
                    ]:  # Limit to first 3 alternatives
                        logger.info("Trying LLM-suggested query: %s", alt_query)
                        alt_papers = self._direct_search(alt_query)

                        if alt_papers:
                            logger.info(
                                "Found %s papers using LLM-suggested query: %s",
                                len(alt_papers),
                                alt_query,
                            )
                            strategy = "llm_alternative"
                            return alt_papers, strategy
                except Exception:
                    logger.exception("Error using LLM for query refinement")
                    # Fall through to simpler strategies

            # Fallback: Try with the longest words (likely specific terms)
            words = re.findall(r"\w+", query)
            longer_words = [word for word in words if len(word) > 6]
            if longer_words:
                # Use up to 3 of the longest words
                longer_words = sorted(longer_words, key=len, reverse=True)[:3]
                key_terms_query = " ".join(longer_words)
                logger.info("Trying with key terms: %s", key_terms_query)
                papers = self._direct_search(key_terms_query)

                if papers:
                    strategy = "key_terms"
                    return papers, strategy

            # Final fallback: Try with just the longest word
            if words:
                longest_word = max(words, key=len)
                if len(longest_word) > 5:  # Only use if it's reasonably long
                    logger.info("Trying with single key term: %s", longest_word)
                    papers = self._direct_search(longest_word)

                    if papers:
                        strategy = "single_term"
                        return papers, strategy

        # Either the standard search succeeded, or every fallback failed
        # (papers is then empty and strategy remains "standard").
        return papers, strategy

474 

475 def _get_paper_details(self, paper_id: str) -> Dict[str, Any]: 

476 """ 

477 Get detailed information about a specific paper. 

478 

479 Args: 

480 paper_id: Semantic Scholar Paper ID 

481 

482 Returns: 

483 Dictionary with paper details 

484 """ 

485 try: 

486 # Construct fields parameter 

487 fields = [ 

488 "paperId", 

489 "externalIds", 

490 "corpusId", 

491 "url", 

492 "title", 

493 "abstract", 

494 "venue", 

495 "year", 

496 "authors", 

497 "fieldsOfStudy", 

498 "citationCount", # Add citation count 

499 ] 

500 

501 if self.get_tldr: 

502 fields.append("tldr") 

503 

504 if self.get_embeddings: 

505 fields.append("embedding") 

506 

507 # Add citation and reference fields if requested 

508 if self.get_citations: 

509 fields.append(f"citations.limit({self.citation_limit})") 

510 

511 if self.get_references: 

512 fields.append(f"references.limit({self.reference_limit})") 

513 

514 # Make the request 

515 url = f"{self.paper_details_url}/{paper_id}" 

516 params = {"fields": ",".join(fields)} 

517 

518 return self._make_request(url, params) 

519 

520 except Exception: 

521 logger.exception("Error getting paper details for paper") 

522 return {} 

523 

524 def _get_previews(self, query: str) -> List[Dict[str, Any]]: 

525 """ 

526 Get preview information for Semantic Scholar papers. 

527 

528 Args: 

529 query: The search query 

530 

531 Returns: 

532 List of preview dictionaries 

533 """ 

534 logger.info(f"Getting Semantic Scholar previews for query: {query}") 

535 

536 # Optimize the query if LLM is available 

537 optimized_query = self._optimize_query(query) 

538 

539 # Use the adaptive search approach 

540 papers, strategy = self._adaptive_search(optimized_query) 

541 

542 if not papers: 

543 logger.warning("No Semantic Scholar results found") 

544 return [] 

545 

546 # Format as previews 

547 previews = [] 

548 for paper in papers: 

549 try: 

550 # Format authors - ensure we have a valid list with string values 

551 authors = [] 

552 if paper.get("authors"): 

553 authors = [ 

554 author.get("name", "") 

555 for author in paper["authors"] 

556 if author and author.get("name") 

557 ] 

558 

559 # Ensure we have valid strings for all fields 

560 paper_id = paper.get("paperId", "") 

561 title = paper.get("title", "") 

562 url = paper.get("url", "") 

563 

564 # Handle abstract safely, ensuring we always have a string 

565 abstract = paper.get("abstract") 

566 snippet = "" 

567 if abstract: 

568 snippet = ( 

569 abstract[:SNIPPET_LENGTH_SHORT] + "..." 

570 if len(abstract) > SNIPPET_LENGTH_SHORT 

571 else abstract 

572 ) 

573 

574 venue = paper.get("venue", "") 

575 year = paper.get("year") 

576 external_ids = paper.get("externalIds", {}) 

577 

578 # Handle TLDR safely 

579 tldr_text = "" 

580 if paper.get("tldr") and isinstance(paper.get("tldr"), dict): 

581 tldr_text = paper.get("tldr", {}).get("text", "") 

582 

583 # Create preview with basic information, ensuring no None values 

584 preview = { 

585 "id": paper_id if paper_id else "", 

586 "title": title if title else "", 

587 "link": url if url else "", 

588 "snippet": snippet, 

589 "authors": authors, 

590 "venue": venue if venue else "", 

591 "year": year, 

592 "external_ids": external_ids if external_ids else {}, 

593 "source": "Semantic Scholar", 

594 "_paper_id": paper_id if paper_id else "", 

595 "_search_strategy": strategy, 

596 "tldr": tldr_text, 

597 } 

598 

599 # Store the full paper object for later reference 

600 preview["_full_paper"] = paper 

601 

602 previews.append(preview) 

603 except Exception: 

604 logger.exception("Error processing paper preview") 

605 # Continue with the next paper 

606 

607 # Sort by year (newer first) if available 

608 previews = sorted( 

609 previews, 

610 key=lambda p: p.get("year", 0) if p.get("year") is not None else 0, 

611 reverse=True, 

612 ) 

613 

614 logger.info( 

615 f"Found {len(previews)} Semantic Scholar previews using strategy: {strategy}" 

616 ) 

617 return previews 

618 

619 def _get_full_content( 

620 self, relevant_items: List[Dict[str, Any]] 

621 ) -> List[Dict[str, Any]]: 

622 """ 

623 Get full content for the relevant Semantic Scholar papers. 

624 Gets additional details like citations, references, and full metadata. 

625 

626 Args: 

627 relevant_items: List of relevant preview dictionaries 

628 

629 Returns: 

630 List of result dictionaries with full content 

631 """ 

632 # For Semantic Scholar, we already have most content from the preview 

633 # Additional API calls are only needed for citations/references 

634 

635 logger.info( 

636 f"Getting content for {len(relevant_items)} Semantic Scholar papers" 

637 ) 

638 

639 results = [] 

640 for item in relevant_items: 

641 result = item.copy() 

642 paper_id = item.get("_paper_id", "") 

643 

644 # Skip if no paper ID 

645 if not paper_id: 

646 results.append(result) 

647 continue 

648 

649 # Get paper details if citations or references are requested 

650 if self.get_citations or self.get_references or self.get_embeddings: 

651 paper_details = self._get_paper_details(paper_id) 

652 

653 if paper_details: 653 ↛ 673line 653 didn't jump to line 673 because the condition on line 653 was always true

654 # Add citation information 

655 if self.get_citations and "citations" in paper_details: 655 ↛ 659line 655 didn't jump to line 659 because the condition on line 655 was always true

656 result["citations"] = paper_details["citations"] 

657 

658 # Add reference information 

659 if self.get_references and "references" in paper_details: 659 ↛ 660line 659 didn't jump to line 660 because the condition on line 659 was never true

660 result["references"] = paper_details["references"] 

661 

662 # Add embedding if available 

663 if self.get_embeddings and "embedding" in paper_details: 663 ↛ 664line 663 didn't jump to line 664 because the condition on line 663 was never true

664 result["embedding"] = paper_details["embedding"] 

665 

666 # Add fields of study 

667 if "fieldsOfStudy" in paper_details: 667 ↛ 668line 667 didn't jump to line 668 because the condition on line 667 was never true

668 result["fields_of_study"] = paper_details[ 

669 "fieldsOfStudy" 

670 ] 

671 

672 # Remove temporary fields 

673 if "_paper_id" in result: 673 ↛ 675line 673 didn't jump to line 675 because the condition on line 673 was always true

674 del result["_paper_id"] 

675 if "_search_strategy" in result: 675 ↛ 677line 675 didn't jump to line 677 because the condition on line 675 was always true

676 del result["_search_strategy"] 

677 if "_full_paper" in result: 677 ↛ 680line 677 didn't jump to line 680 because the condition on line 677 was always true

678 del result["_full_paper"] 

679 

680 results.append(result) 

681 

682 return results