Coverage for src/local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py: 40%

240 statements  

coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

import re
from typing import Any, Dict, List, Optional, Tuple

import requests
from langchain_core.language_models import BaseLLM
from loguru import logger
from requests.adapters import HTTPAdapter
from urllib3.util import Retry

from ..rate_limiting import RateLimitError
from ..search_engine_base import BaseSearchEngine
from ...security import SafeSession


class SemanticScholarSearchEngine(BaseSearchEngine):
    """
    Semantic Scholar search engine implementation with a two-phase approach.
    Provides efficient access to scientific literature across all fields.
    """

    # Mark as public search engine
    is_public = True
    # Scientific/academic search engine
    is_scientific = True

    def __init__(
        self,
        max_results: int = 10,
        api_key: Optional[str] = None,
        year_range: Optional[Tuple[int, int]] = None,
        get_abstracts: bool = True,
        get_references: bool = False,
        get_citations: bool = False,
        get_embeddings: bool = False,
        get_tldr: bool = True,
        citation_limit: int = 10,
        reference_limit: int = 10,
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        optimize_queries: bool = True,
        max_retries: int = 5,
        retry_backoff_factor: float = 1.0,
        fields_of_study: Optional[List[str]] = None,
        publication_types: Optional[List[str]] = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """
        Initialize the Semantic Scholar search engine.

        Args:
            max_results: Maximum number of search results
            api_key: Semantic Scholar API key for higher rate limits (optional)
            year_range: Optional tuple of (start_year, end_year) to filter results
            get_abstracts: Whether to fetch abstracts for all results
            get_references: Whether to fetch references for papers
            get_citations: Whether to fetch citations for papers
            get_embeddings: Whether to fetch SPECTER embeddings for papers
            get_tldr: Whether to fetch TLDR summaries for papers
            citation_limit: Maximum number of citations to fetch per paper
            reference_limit: Maximum number of references to fetch per paper
            llm: Language model for relevance filtering
            max_filtered_results: Maximum number of results to keep after filtering
            optimize_queries: Whether to optimize natural language queries
            max_retries: Maximum number of retries for API requests
            retry_backoff_factor: Backoff factor for retries
            fields_of_study: List of fields of study to filter results
            publication_types: List of publication types to filter results
            settings_snapshot: Settings snapshot for configuration
            **kwargs: Additional parameters to pass to the parent class
        """
        # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            settings_snapshot=settings_snapshot,
            **kwargs,
        )

        # Get API key from settings if not provided
        if not api_key and settings_snapshot:  # coverage: condition never true in tests
            from ...config.search_config import get_setting_from_snapshot

            try:
                api_key = get_setting_from_snapshot(
                    "search.engine.web.semantic_scholar.api_key",
                    settings_snapshot=settings_snapshot,
                )
            except Exception:
                pass

        self.api_key = api_key
        self.year_range = year_range
        self.get_abstracts = get_abstracts
        self.get_references = get_references
        self.get_citations = get_citations
        self.get_embeddings = get_embeddings
        self.get_tldr = get_tldr
        self.citation_limit = citation_limit
        self.reference_limit = reference_limit
        self.optimize_queries = optimize_queries
        self.max_retries = max_retries
        self.retry_backoff_factor = retry_backoff_factor
        self.fields_of_study = fields_of_study
        self.publication_types = publication_types

        # Base API URLs
        self.base_url = "https://api.semanticscholar.org/graph/v1"
        self.paper_search_url = f"{self.base_url}/paper/search"
        self.paper_details_url = f"{self.base_url}/paper"

        # Create a session with retry capabilities
        self.session = self._create_session()

        # Log API key status
        if self.api_key:
            logger.info("Using Semantic Scholar with API key (higher rate limits)")
        else:
            logger.info("Using Semantic Scholar without API key (lower rate limits)")

    def _create_session(self) -> SafeSession:
        """Create and configure a requests session with retry capabilities"""
        session = SafeSession()

        # Configure automatic retries with exponential backoff
        retry_strategy = Retry(
            total=self.max_retries,
            backoff_factor=self.retry_backoff_factor,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods={"HEAD", "GET", "POST", "OPTIONS"},
        )

        adapter = HTTPAdapter(max_retries=retry_strategy)
        session.mount("https://", adapter)

        # Set up headers
        headers = {"Accept": "application/json"}
        if self.api_key:
            headers["x-api-key"] = self.api_key

        session.headers.update(headers)

        return session
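    # Added note (not in the original module): a rough sketch of the retry
    # timing configured above. With backoff_factor=1.0, urllib3 sleeps about
    # backoff_factor * 2 ** (n - 1) seconds before the n-th retry, i.e. roughly
    # 1s, 2s, 4s, 8s, 16s across five retries, capped at urllib3's default
    # maximum of 120s. Whether the very first retry sleeps at all differs
    # between urllib3 versions, and a Retry-After header on a 429/503 response
    # is honored by default in place of the computed backoff.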

    def _respect_rate_limit(self):
        """Apply rate limiting between requests"""
        # Apply rate limiting before request
        self._last_wait_time = self.rate_tracker.apply_rate_limit(self.engine_type)
        logger.debug(f"Applied rate limit wait: {self._last_wait_time:.2f}s")

    def _make_request(
        self,
        url: str,
        params: Optional[Dict] = None,
        data: Optional[Dict] = None,
        method: str = "GET",
    ) -> Dict:
        """
        Make a request to the Semantic Scholar API.

        Args:
            url: API endpoint URL
            params: Query parameters
            data: JSON data for POST requests
            method: HTTP method (GET or POST)

        Returns:
            API response as dictionary
        """
        self._respect_rate_limit()

        try:
            if method.upper() == "GET":  # coverage: condition always true in tests
                response = self.session.get(url, params=params, timeout=30)
            elif method.upper() == "POST":
                response = self.session.post(url, params=params, json=data, timeout=30)
            else:
                raise ValueError(f"Unsupported HTTP method: {method}")

            # Handle rate limiting
            if response.status_code == 429:  # coverage: condition never true in tests
                logger.warning("Semantic Scholar rate limit exceeded")
                raise RateLimitError("Semantic Scholar rate limit exceeded")

            response.raise_for_status()
            return response.json()
        except requests.RequestException:
            logger.exception("API request failed")
            return {}
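    # Added note: 429 also appears in the adapter's status_forcelist, so
    # urllib3 usually retries rate-limited responses itself and, once retries
    # are exhausted, raises a RetryError that lands in the RequestException
    # handler above. The explicit 429 check therefore only fires if a 429
    # response makes it past the adapter, which matches the coverage note that
    # this branch was never taken.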

    def _optimize_query(self, query: str) -> str:
        """
        Optimize a natural language query for Semantic Scholar search.
        If LLM is available, uses it to extract key terms and concepts.

        Args:
            query: Natural language query

        Returns:
            Optimized query string
        """
        if not self.llm or not self.optimize_queries:  # coverage: condition always true in tests
            return query

        try:
            prompt = f"""Transform this natural language question into an optimized academic search query.

Original query: "{query}"

INSTRUCTIONS:
1. Extract key academic concepts, technical terms, and proper nouns
2. Remove generic words, filler words, and non-technical terms
3. Add quotation marks around specific phrases that should be kept together
4. Return ONLY the optimized search query with no explanation
5. Keep it under 100 characters if possible

EXAMPLE TRANSFORMATIONS:
"What are the latest findings about mRNA vaccines and COVID-19?" → "mRNA vaccines COVID-19 recent findings"
"How does machine learning impact climate change prediction?" → "machine learning "climate change" prediction"
"Tell me about quantum computing approaches for encryption" → "quantum computing encryption"

Return ONLY the optimized search query with no explanation.
"""

            response = self.llm.invoke(prompt)
            optimized_query = response.content.strip()

            # Clean up the query - remove any explanations
            lines = optimized_query.split("\n")
            optimized_query = lines[0].strip()

            # Safety check - if query looks too much like an explanation, use original
            if len(optimized_query.split()) > 15 or ":" in optimized_query:
                logger.warning(
                    "Query optimization result looks too verbose, using original"
                )
                return query

            logger.info(f"Original query: '{query}'")
            logger.info(f"Optimized for search: '{optimized_query}'")

            return optimized_query
        except Exception:
            logger.exception("Error optimizing query")
            return query  # Fall back to original query on error
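    # Added note: `response.content` assumes a chat-style model whose invoke()
    # returns a message object; a plain text-completion LLM returns a bare
    # string. `_adaptive_search` below guards against both shapes with a
    # hasattr(response, "content") check, which this method does not.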

    def _direct_search(self, query: str) -> List[Dict[str, Any]]:
        """
        Make a direct search request to the Semantic Scholar API.

        Args:
            query: The search query

        Returns:
            List of paper dictionaries
        """
        try:
            # Configure fields to retrieve
            fields = [
                "paperId",
                "externalIds",
                "url",
                "title",
                "abstract",
                "venue",
                "year",
                "authors",
                "citationCount",  # Add citation count for ranking
                "openAccessPdf",  # PDF URL for open access papers
            ]

            if self.get_tldr:  # coverage: condition always true in tests
                fields.append("tldr")

            params = {
                "query": query,
                "limit": min(self.max_results, 100),  # API limit is 100 per request
                "fields": ",".join(fields),
            }

            # Add year filter if specified
            if self.year_range:  # coverage: condition never true in tests
                start_year, end_year = self.year_range
                params["year"] = f"{start_year}-{end_year}"

            # Add fields of study filter if specified
            if self.fields_of_study:  # coverage: condition never true in tests
                params["fieldsOfStudy"] = ",".join(self.fields_of_study)

            # Add publication types filter if specified
            if self.publication_types:  # coverage: condition never true in tests
                params["publicationTypes"] = ",".join(self.publication_types)

            response = self._make_request(self.paper_search_url, params)

            if "data" in response:  # coverage: condition always true in tests
                papers = response["data"]
                logger.info(
                    f"Found {len(papers)} papers with direct search for query: '{query}'"
                )
                return papers
            else:
                logger.warning(
                    f"No data in response for direct search query: '{query}'"
                )
                return []

        except Exception:
            logger.exception("Error in direct search")
            return []
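    # Illustrative request (a sketch; the query and filter values are made
    # up). With max_results=10, get_tldr=True and year_range=(2020, 2024),
    # the params built above would be:
    #   {
    #       "query": "quantum computing encryption",
    #       "limit": 10,
    #       "fields": "paperId,externalIds,url,title,abstract,venue,year,"
    #                 "authors,citationCount,openAccessPdf,tldr",
    #       "year": "2020-2024",
    #   }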

    def _adaptive_search(self, query: str) -> Tuple[List[Dict[str, Any]], str]:
        """
        Perform an adaptive search that adjusts based on result volume.
        Uses LLM to generate better fallback queries when available.

        Args:
            query: The search query

        Returns:
            Tuple of (list of paper results, search strategy used)
        """
        # Start with a standard search
        papers = self._direct_search(query)
        strategy = "standard"

        # If no results, try different variations
        if not papers:  # coverage: condition never true in tests
            # Try removing quotes to broaden search
            if '"' in query:
                unquoted_query = query.replace('"', "")
                logger.info(
                    f"No results with quoted terms, trying without quotes: {unquoted_query}"
                )
                papers = self._direct_search(unquoted_query)

                if papers:
                    strategy = "unquoted"
                    return papers, strategy

            # If LLM is available, use it to generate better fallback queries
            if self.llm:
                try:
                    # Generate alternate search queries focusing on core concepts
                    prompt = f"""You are helping refine a search query that returned no results.

Original query: "{query}"

The query might be too specific or use natural language phrasing that doesn't match academic paper keywords.

Please provide THREE alternative search queries that:
1. Focus on the core academic concepts
2. Use precise terminology commonly found in academic papers
3. Break down complex queries into more searchable components
4. Format each as a concise keyword-focused search term (not a natural language question)

Format each query on a new line with no numbering or explanation. Keep each query under 8 words and very focused.
"""
                    # Get the LLM's response
                    response = self.llm.invoke(prompt)

                    # Extract the alternative queries
                    alt_queries = []
                    if hasattr(response, "content"):  # Handle various LLM response formats
                        content = response.content
                        alt_queries = [
                            q.strip()
                            for q in content.strip().split("\n")
                            if q.strip()
                        ]
                    elif isinstance(response, str):
                        alt_queries = [
                            q.strip()
                            for q in response.strip().split("\n")
                            if q.strip()
                        ]

                    # Try each alternative query, limited to the first 3
                    for alt_query in alt_queries[:3]:
                        logger.info(f"Trying LLM-suggested query: {alt_query}")
                        alt_papers = self._direct_search(alt_query)

                        if alt_papers:
                            logger.info(
                                f"Found {len(alt_papers)} papers using LLM-suggested query: {alt_query}"
                            )
                            strategy = "llm_alternative"
                            return alt_papers, strategy
                except Exception:
                    logger.exception("Error using LLM for query refinement")
                    # Fall through to simpler strategies

            # Fallback: Try with the longest words (likely specific terms)
            words = re.findall(r"\w+", query)
            longer_words = [word for word in words if len(word) > 6]
            if longer_words:
                # Use up to 3 of the longest words
                longer_words = sorted(longer_words, key=len, reverse=True)[:3]
                key_terms_query = " ".join(longer_words)
                logger.info(f"Trying with key terms: {key_terms_query}")
                papers = self._direct_search(key_terms_query)

                if papers:
                    strategy = "key_terms"
                    return papers, strategy

            # Final fallback: Try with just the longest word
            if words:
                longest_word = max(words, key=len)
                if len(longest_word) > 5:  # Only use if it's reasonably long
                    logger.info(f"Trying with single key term: {longest_word}")
                    papers = self._direct_search(longest_word)

                    if papers:
                        strategy = "single_term"
                        return papers, strategy

        return papers, strategy
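    # Added summary: the fallback ladder above runs in a fixed order, and the
    # returned strategy label records which rung produced results: "standard"
    # (the query as given), "unquoted" (quotes stripped), "llm_alternative"
    # (up to three LLM-suggested reformulations), "key_terms" (the three
    # longest words), and "single_term" (the single longest word, if longer
    # than five characters).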

    def _get_paper_details(self, paper_id: str) -> Dict[str, Any]:
        """
        Get detailed information about a specific paper.

        Args:
            paper_id: Semantic Scholar Paper ID

        Returns:
            Dictionary with paper details
        """
        try:
            # Construct fields parameter
            fields = [
                "paperId",
                "externalIds",
                "corpusId",
                "url",
                "title",
                "abstract",
                "venue",
                "year",
                "authors",
                "fieldsOfStudy",
                "citationCount",  # Add citation count
            ]

            if self.get_tldr:
                fields.append("tldr")

            if self.get_embeddings:
                fields.append("embedding")

            # Add citation and reference fields if requested
            if self.get_citations:
                fields.append(f"citations.limit({self.citation_limit})")

            if self.get_references:
                fields.append(f"references.limit({self.reference_limit})")

            # Make the request
            url = f"{self.paper_details_url}/{paper_id}"
            params = {"fields": ",".join(fields)}

            return self._make_request(url, params)

        except Exception:
            logger.exception(f"Error getting paper details for paper {paper_id}")
            return {}
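    # Illustrative fields string (a sketch): with the defaults plus
    # get_citations=True and citation_limit=10, the request above asks for
    #   fields=paperId,externalIds,corpusId,url,title,abstract,venue,year,authors,fieldsOfStudy,citationCount,tldr,citations.limit(10)
    # Whether the Graph API still accepts the `.limit(N)` suffix is an
    # assumption carried over from this code, not verified here.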

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information for Semantic Scholar papers.

        Args:
            query: The search query

        Returns:
            List of preview dictionaries
        """
        logger.info(f"Getting Semantic Scholar previews for query: {query}")

        # Optimize the query if LLM is available
        optimized_query = self._optimize_query(query)

        # Use the adaptive search approach
        papers, strategy = self._adaptive_search(optimized_query)

        if not papers:  # coverage: condition never true in tests
            logger.warning("No Semantic Scholar results found")
            return []

        # Format as previews
        previews = []
        for paper in papers:
            try:
                # Format authors - ensure we have a valid list with string values
                authors = []
                if paper.get("authors"):  # coverage: condition always true in tests
                    authors = [
                        author.get("name", "")
                        for author in paper["authors"]
                        if author and author.get("name")
                    ]

                # Ensure we have valid strings for all fields
                paper_id = paper.get("paperId", "")
                title = paper.get("title", "")
                url = paper.get("url", "")

                # Handle abstract safely, ensuring we always have a string
                abstract = paper.get("abstract")
                snippet = ""
                if abstract:
                    snippet = (
                        abstract[:250] + "..." if len(abstract) > 250 else abstract
                    )

                venue = paper.get("venue", "")
                year = paper.get("year")
                external_ids = paper.get("externalIds", {})

                # Handle TLDR safely
                tldr_text = ""
                if paper.get("tldr") and isinstance(paper.get("tldr"), dict):
                    tldr_text = paper.get("tldr", {}).get("text", "")

                # Create preview with basic information, ensuring no None values
                preview = {
                    "id": paper_id if paper_id else "",
                    "title": title if title else "",
                    "link": url if url else "",
                    "snippet": snippet,
                    "authors": authors,
                    "venue": venue if venue else "",
                    "year": year,
                    "external_ids": external_ids if external_ids else {},
                    "source": "Semantic Scholar",
                    "_paper_id": paper_id if paper_id else "",
                    "_search_strategy": strategy,
                    "tldr": tldr_text,
                }

                # Store the full paper object for later reference
                preview["_full_paper"] = paper

                previews.append(preview)
            except Exception:
                logger.exception("Error processing paper preview")
                # Continue with the next paper

        # Sort by year (newer first) if available
        previews = sorted(
            previews,
            key=lambda p: p.get("year", 0) if p.get("year") is not None else 0,
            reverse=True,
        )

        logger.info(
            f"Found {len(previews)} Semantic Scholar previews using strategy: {strategy}"
        )
        return previews
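    # Illustrative preview (a sketch assembled from the fields above; every
    # value is hypothetical):
    #   {
    #       "id": "abc123", "title": "Example Paper", "link": "https://...",
    #       "snippet": "First 250 characters of the abstract...",
    #       "authors": ["A. Author", "B. Author"], "venue": "Example Venue",
    #       "year": 2024, "external_ids": {"DOI": "10.1234/example"},
    #       "source": "Semantic Scholar", "_paper_id": "abc123",
    #       "_search_strategy": "standard", "tldr": "One-sentence summary.",
    #       "_full_paper": {...},
    #   }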

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant Semantic Scholar papers.
        Gets additional details like citations, references, and full metadata.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content
        """
        # For Semantic Scholar, we already have most content from the preview
        # Additional API calls are only needed for citations/references

        logger.info(
            f"Getting content for {len(relevant_items)} Semantic Scholar papers"
        )

        results = []
        for item in relevant_items:
            result = item.copy()
            paper_id = item.get("_paper_id", "")

            # Skip if no paper ID
            if not paper_id:
                results.append(result)
                continue

            # Get paper details if citations or references are requested
            if self.get_citations or self.get_references or self.get_embeddings:
                paper_details = self._get_paper_details(paper_id)

                if paper_details:
                    # Add citation information
                    if self.get_citations and "citations" in paper_details:
                        result["citations"] = paper_details["citations"]

                    # Add reference information
                    if self.get_references and "references" in paper_details:
                        result["references"] = paper_details["references"]

                    # Add embedding if available
                    if self.get_embeddings and "embedding" in paper_details:
                        result["embedding"] = paper_details["embedding"]

                    # Add fields of study
                    if "fieldsOfStudy" in paper_details:
                        result["fields_of_study"] = paper_details["fieldsOfStudy"]

            # Remove temporary fields
            if "_paper_id" in result:
                del result["_paper_id"]
            if "_search_strategy" in result:
                del result["_search_strategy"]
            if "_full_paper" in result:
                del result["_full_paper"]

            results.append(result)

        return results
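
# Illustrative usage (a minimal sketch, not part of the module). The
# constructor arguments are real parameters of this class; the query is made
# up, and calling the private _get_previews() directly stands in for whatever
# public entry point BaseSearchEngine exposes, which this file does not show.
#
#     engine = SemanticScholarSearchEngine(
#         max_results=5,
#         year_range=(2020, 2024),
#         get_tldr=True,
#     )
#     for preview in engine._get_previews("mRNA vaccines COVID-19"):
#         print(preview["year"], preview["title"])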