Coverage for src/local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py

1import re

2from typing import Any, Dict, List, Optional, Tuple

4import requests

5from langchain_core.language_models import BaseLLM

6from loguru import logger

7from requests.adapters import HTTPAdapter

8from urllib3.util import Retry

10from ...constants import SNIPPET_LENGTH_SHORT

11from ..rate_limiting import RateLimitError

12from ..search_engine_base import BaseSearchEngine

13from ...security import SafeSession

16class SemanticScholarSearchEngine(BaseSearchEngine):

17 """

18 Semantic Scholar search engine implementation with two-phase approach.

19 Provides efficient access to scientific literature across all fields.

20 """

22 # Mark as public search engine

23 is_public = True

24 # Scientific/academic search engine

25 is_scientific = True

def __init__(
    self,
    max_results: int = 10,
    api_key: Optional[str] = None,
    year_range: Optional[Tuple[int, int]] = None,
    get_abstracts: bool = True,
    get_references: bool = False,
    get_citations: bool = False,
    get_embeddings: bool = False,
    get_tldr: bool = True,
    citation_limit: int = 10,
    reference_limit: int = 10,
    llm: Optional[BaseLLM] = None,
    max_filtered_results: Optional[int] = None,
    optimize_queries: bool = True,
    max_retries: int = 5,
    retry_backoff_factor: float = 1.0,
    fields_of_study: Optional[List[str]] = None,
    publication_types: Optional[List[str]] = None,
    settings_snapshot: Optional[Dict[str, Any]] = None,
    **kwargs,
):
    """
    Initialize the Semantic Scholar search engine.

    Args:
        max_results: Maximum number of search results
        api_key: Semantic Scholar API key for higher rate limits (optional).
            When omitted, the key is looked up in ``settings_snapshot``.
        year_range: Optional tuple of (start_year, end_year) to filter results
        get_abstracts: Whether to fetch abstracts for all results
        get_references: Whether to fetch references for papers
        get_citations: Whether to fetch citations for papers
        get_embeddings: Whether to fetch SPECTER embeddings for papers
        get_tldr: Whether to fetch TLDR summaries for papers
        citation_limit: Maximum number of citations to fetch per paper
        reference_limit: Maximum number of references to fetch per paper
        llm: Language model for relevance filtering
        max_filtered_results: Maximum number of results to keep after filtering
        optimize_queries: Whether to optimize natural language queries
        max_retries: Maximum number of retries for API requests
        retry_backoff_factor: Backoff factor for retries
        fields_of_study: List of fields of study to filter results
        publication_types: List of publication types to filter results
        settings_snapshot: Settings snapshot for configuration
        **kwargs: Additional parameters to pass to parent class
    """
    # The parent class receives the LLM, both result limits and the snapshot.
    super().__init__(
        llm=llm,
        max_filtered_results=max_filtered_results,
        max_results=max_results,
        settings_snapshot=settings_snapshot,
        **kwargs,
    )

    # Fall back to the settings snapshot when no key was passed explicitly.
    if not api_key and settings_snapshot:
        from ...config.search_config import get_setting_from_snapshot

        try:
            api_key = get_setting_from_snapshot(
                "search.engine.web.semantic_scholar.api_key",
                settings_snapshot=settings_snapshot,
            )
        except Exception as exc:
            # Best effort: the engine still works without a key, but leave a
            # trace so a misconfigured snapshot is not completely invisible.
            logger.debug(
                "Could not read Semantic Scholar API key from settings "
                f"snapshot ({type(exc).__name__}); continuing without it"
            )

    self.api_key = api_key
    self.year_range = year_range
    self.get_abstracts = get_abstracts
    self.get_references = get_references
    self.get_citations = get_citations
    self.get_embeddings = get_embeddings
    self.get_tldr = get_tldr
    self.citation_limit = citation_limit
    self.reference_limit = reference_limit
    self.optimize_queries = optimize_queries
    self.max_retries = max_retries
    self.retry_backoff_factor = retry_backoff_factor

    # None means "no filter"; anything else is normalised by _ensure_list
    # (helper not defined in this part of the file - presumably inherited).
    self.fields_of_study = (
        self._ensure_list(fields_of_study)
        if fields_of_study is not None
        else None
    )
    self.publication_types = (
        self._ensure_list(publication_types)
        if publication_types is not None
        else None
    )

    # Base API URLs
    self.base_url = "https://api.semanticscholar.org/graph/v1"
    self.paper_search_url = f"{self.base_url}/paper/search"
    self.paper_details_url = f"{self.base_url}/paper"

    # The session reads max_retries, retry_backoff_factor and api_key, so it
    # must be created after those attributes are assigned.
    self.session = self._create_session()

    # Log API key status (never the key itself).
    if self.api_key:
        logger.info(
            "Using Semantic Scholar with API key (higher rate limits)"
        )
    else:
        logger.info(
            "Using Semantic Scholar without API key (lower rate limits)"
        )

134

def _create_session(self) -> SafeSession:
    """
    Create and configure an HTTP session with retry capabilities.

    The session retries on 429/500/502/503/504 with exponential backoff
    (``self.max_retries`` attempts, ``self.retry_backoff_factor``), sends
    ``Accept: application/json`` and, when ``self.api_key`` is set, the
    ``x-api-key`` header. The retry adapter is mounted for ``https://`` only.

    Returns:
        The configured session.
    """
    retry_strategy = Retry(
        total=self.max_retries,
        backoff_factor=self.retry_backoff_factor,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods={"HEAD", "GET", "POST", "OPTIONS"},
        # Return the last response instead of raising once retries are used
        # up. With the default (True) a persistent 429 is raised by the
        # transport as a retry error, which _make_request logs and converts
        # to {} - its explicit 429 -> RateLimitError branch was unreachable.
        # Persistent 5xx responses still end up in raise_for_status() there.
        raise_on_status=False,
    )

    session = SafeSession()
    session.mount("https://", HTTPAdapter(max_retries=retry_strategy))

    # Set up headers; the API key is optional.
    headers = {"Accept": "application/json"}
    if self.api_key:
        headers["x-api-key"] = self.api_key
    session.headers.update(headers)

    return session

158

def close(self):
    """
    Release the HTTP session held by this engine.

    Safe to call repeatedly and on partially initialised instances: when
    there is no ``session`` attribute, or it is already falsy, nothing
    happens. Errors raised while closing are logged, not propagated, and
    ``self.session`` is reset to ``None`` either way.
    """
    active_session = getattr(self, "session", None)
    if not active_session:
        return

    try:
        active_session.close()
    except Exception:
        logger.exception("Error closing SemanticScholar session")
    finally:
        # Drop the reference even if close() failed so later calls are no-ops.
        self.session = None

173

174 def __del__(self):

175 """Destructor to ensure session is closed."""

176 self.close()

177

178 def __enter__(self):

179 """Context manager entry."""

180 return self

181

182 def __exit__(self, exc_type, exc_val, exc_tb):

183 """Context manager exit - ensures session cleanup."""

184 self.close()

185 return False

186

def _respect_rate_limit(self):
    """
    Wait as dictated by the rate tracker before the next request.

    Calls ``self.rate_tracker.apply_rate_limit(self.engine_type)`` and keeps
    the returned wait in ``self._last_wait_time``. ``rate_tracker`` and
    ``engine_type`` are not assigned in this class's visible code; they are
    expected to be provided elsewhere.
    """
    wait_time = self.rate_tracker.apply_rate_limit(self.engine_type)
    self._last_wait_time = wait_time
    logger.debug(f"Applied rate limit wait: {wait_time:.2f}s")

194

195 def _make_request(

196 self,

197 url: str,

198 params: Optional[Dict] = None,

199 data: Optional[Dict] = None,

200 method: str = "GET",

201 ) -> Dict:

202 """

203 Make a request to the Semantic Scholar API.

204

205 Args:

206 url: API endpoint URL

207 params: Query parameters

208 data: JSON data for POST requests

209 method: HTTP method (GET or POST)

210

211 Returns:

212 API response as dictionary

213 """

214 self._respect_rate_limit()

215

216 try:

217 if method.upper() == "GET":

218 response = self.session.get(url, params=params, timeout=30)

219 elif method.upper() == "POST":

220 response = self.session.post(

221 url, params=params, json=data, timeout=30

222 )

223 else:

224 raise ValueError(f"Unsupported HTTP method: {method}")

225

226 # Handle rate limiting

227 if response.status_code == 429:

228 logger.warning("Semantic Scholar rate limit exceeded")

229 raise RateLimitError("Semantic Scholar rate limit exceeded")

230

231 response.raise_for_status()

232 return response.json()

233 except requests.RequestException:

234 logger.exception("API request failed")

235 return {}

236

237 def _optimize_query(self, query: str) -> str:

238 """

239 Optimize a natural language query for Semantic Scholar search.

240 If LLM is available, uses it to extract key terms and concepts.

241

242 Args:

243 query: Natural language query

244

245 Returns:

246 Optimized query string

247 """

248 if not self.llm or not self.optimize_queries:

249 return query

250

251 try:

252 prompt = f"""Transform this natural language question into an optimized academic search query.

253

254Original query: "{query}"

255

256INSTRUCTIONS:

2571. Extract key academic concepts, technical terms, and proper nouns

2582. Remove generic words, filler words, and non-technical terms

2593. Add quotation marks around specific phrases that should be kept together

2604. Return ONLY the optimized search query with no explanation

2615. Keep it under 100 characters if possible

262

263EXAMPLE TRANSFORMATIONS:

264"What are the latest findings about mRNA vaccines and COVID-19?" → "mRNA vaccines COVID-19 recent findings"

265"How does machine learning impact climate change prediction?" → "machine learning "climate change" prediction"

266"Tell me about quantum computing approaches for encryption" → "quantum computing encryption"

267

268Return ONLY the optimized search query with no explanation.

269"""

270

271 response = self.llm.invoke(prompt)

272 optimized_query = response.content.strip()

273

274 # Clean up the query - remove any explanations

275 lines = optimized_query.split("\n")

276 optimized_query = lines[0].strip()

277

278 # Safety check - if query looks too much like an explanation, use original

279 if len(optimized_query.split()) > 15 or ":" in optimized_query: 279 ↛ 280line 279 didn't jump to line 280 because the condition on line 279 was never true

280 logger.warning(

281 "Query optimization result looks too verbose, using original"

282 )

283 return query

284

285 logger.info(f"Original query: '{query}'")

286 logger.info(f"Optimized for search: '{optimized_query}'")

287

288 return optimized_query

289 except Exception:

290 logger.exception("Error optimizing query")

291 return query # Fall back to original query on error

292

def _direct_search(self, query: str) -> List[Dict[str, Any]]:
    """
    Run a single paper search against the Semantic Scholar API.

    The request asks for at most ``min(self.max_results, 100)`` papers and
    applies the optional year, fields-of-study and publication-type filters
    configured on the instance.

    Args:
        query: The search query

    Returns:
        The ``data`` list of the API response, or an empty list when the
        response has no ``data`` key or any exception occurs.
    """
    # NOTE(review): the broad handler below also catches the RateLimitError
    # raised by _make_request (assuming it derives from Exception), so a 429
    # is reported to callers as "no results" - confirm this is intended.
    try:
        requested_fields = [
            "paperId",
            "externalIds",
            "url",
            "title",
            "abstract",
            "venue",
            "year",
            "authors",
            "citationCount",  # citation count for ranking
            "openAccessPdf",  # PDF URL for open access papers
        ]
        if self.get_tldr:
            requested_fields.append("tldr")

        search_params = {
            "query": query,
            "limit": min(self.max_results, 100),  # API limit is 100 per request
            "fields": ",".join(requested_fields),
        }

        # Optional filters are only sent when configured.
        if self.year_range:
            first_year, last_year = self.year_range
            search_params["year"] = f"{first_year}-{last_year}"
        if self.fields_of_study:
            search_params["fieldsOfStudy"] = ",".join(self.fields_of_study)
        if self.publication_types:
            search_params["publicationTypes"] = ",".join(
                self.publication_types
            )

        response = self._make_request(self.paper_search_url, search_params)

        if "data" not in response:
            logger.warning(
                f"No data in response for direct search query: '{query}'"
            )
            return []

        papers = response["data"]
        logger.info(
            f"Found {len(papers)} papers with direct search for query: '{query}'"
        )
        return papers

    except Exception:
        logger.exception("Error in direct search")
        return []

359

360 def _adaptive_search(self, query: str) -> Tuple[List[Dict[str, Any]], str]:

361 """

362 Perform an adaptive search that adjusts based on result volume.

363 Uses LLM to generate better fallback queries when available.

364

365 Args:

366 query: The search query

367

368 Returns:

369 Tuple of (list of paper results, search strategy used)

370 """

371 # Start with a standard search

372 papers = self._direct_search(query)

373 strategy = "standard"

374

375 # If no results, try different variations

376 if not papers:

377 # Try removing quotes to broaden search

378 if '"' in query: 378 ↛ 391line 378 didn't jump to line 391 because the condition on line 378 was always true

379 unquoted_query = query.replace('"', "")

380 logger.info(

381 "No results with quoted terms, trying without quotes: %s",

382 unquoted_query,

383 )

384 papers = self._direct_search(unquoted_query)

385

386 if papers: 386 ↛ 391line 386 didn't jump to line 391 because the condition on line 386 was always true

387 strategy = "unquoted"

388 return papers, strategy

389

390 # If LLM is available, use it to generate better fallback queries

391 if self.llm:

392 try:

393 # Generate alternate search queries focusing on core concepts

394 prompt = f"""You are helping refine a search query that returned no results.

395

396Original query: "{query}"

397

398The query might be too specific or use natural language phrasing that doesn't match academic paper keywords.

399

400Please provide THREE alternative search queries that:

4011. Focus on the core academic concepts

4022. Use precise terminology commonly found in academic papers

4033. Break down complex queries into more searchable components

4044. Format each as a concise keyword-focused search term (not a natural language question)

405

406Format each query on a new line with no numbering or explanation. Keep each query under 8 words and very focused.

407"""

408 # Get the LLM's response

409 response = self.llm.invoke(prompt)

410

411 # Extract the alternative queries

412 alt_queries = []

413 if hasattr(

414 response, "content"

415 ): # Handle various LLM response formats

416 content = response.content

417 alt_queries = [

418 q.strip()

419 for q in content.strip().split("\n")

420 if q.strip()

421 ]

422 elif isinstance(response, str):

423 alt_queries = [

424 q.strip()

425 for q in response.strip().split("\n")

426 if q.strip()

427 ]

428

429 # Try each alternative query

430 for alt_query in alt_queries[

431 :3

432 ]: # Limit to first 3 alternatives

433 logger.info("Trying LLM-suggested query: %s", alt_query)

434 alt_papers = self._direct_search(alt_query)

435

436 if alt_papers:

437 logger.info(

438 "Found %s papers using LLM-suggested query: %s",

439 len(alt_papers),

440 alt_query,

441 )

442 strategy = "llm_alternative"

443 return alt_papers, strategy

444 except Exception:

445 logger.exception("Error using LLM for query refinement")

446 # Fall through to simpler strategies

447

448 # Fallback: Try with the longest words (likely specific terms)

449 words = re.findall(r"\w+", query)

450 longer_words = [word for word in words if len(word) > 6]

451 if longer_words:

452 # Use up to 3 of the longest words

453 longer_words = sorted(longer_words, key=len, reverse=True)[:3]

454 key_terms_query = " ".join(longer_words)

455 logger.info("Trying with key terms: %s", key_terms_query)

456 papers = self._direct_search(key_terms_query)

457

458 if papers:

459 strategy = "key_terms"

460 return papers, strategy

461

462 # Final fallback: Try with just the longest word

463 if words:

464 longest_word = max(words, key=len)

465 if len(longest_word) > 5: # Only use if it's reasonably long

466 logger.info("Trying with single key term: %s", longest_word)

467 papers = self._direct_search(longest_word)

468

469 if papers:

470 strategy = "single_term"

471 return papers, strategy

472

473 return papers, strategy

474

475 def _get_paper_details(self, paper_id: str) -> Dict[str, Any]:

476 """

477 Get detailed information about a specific paper.

478

479 Args:

480 paper_id: Semantic Scholar Paper ID

481

482 Returns:

483 Dictionary with paper details

484 """

485 try:

486 # Construct fields parameter

487 fields = [

488 "paperId",

489 "externalIds",

490 "corpusId",

491 "url",

492 "title",

493 "abstract",

494 "venue",

495 "year",

496 "authors",

497 "fieldsOfStudy",

498 "citationCount", # Add citation count

499 ]

500

501 if self.get_tldr:

502 fields.append("tldr")

503

504 if self.get_embeddings:

505 fields.append("embedding")

506

507 # Add citation and reference fields if requested

508 if self.get_citations:

509 fields.append(f"citations.limit({self.citation_limit})")

510

511 if self.get_references:

512 fields.append(f"references.limit({self.reference_limit})")

513

514 # Make the request

515 url = f"{self.paper_details_url}/{paper_id}"

516 params = {"fields": ",".join(fields)}

517

518 return self._make_request(url, params)

519

520 except Exception:

521 logger.exception("Error getting paper details for paper")

522 return {}

523

def _get_previews(self, query: str) -> List[Dict[str, Any]]:
    """
    Build preview dictionaries for the papers matching a query.

    The query is first passed through ``_optimize_query`` and then searched
    with ``_adaptive_search``. Each paper becomes a preview whose text fields
    are never None (missing values become "" / {} / []), except ``year``,
    which is passed through as returned. Papers that fail to convert are
    logged and skipped. Previews are returned newest year first; papers
    without a year sort as year 0.

    Args:
        query: The search query

    Returns:
        List of preview dictionaries (empty when nothing was found)
    """
    logger.info(f"Getting Semantic Scholar previews for query: {query}")

    # Optimize (when an LLM is available), then search adaptively.
    papers, strategy = self._adaptive_search(self._optimize_query(query))

    if not papers:
        logger.warning("No Semantic Scholar results found")
        return []

    previews = []
    for paper in papers:
        try:
            # Author names only, skipping empty entries and nameless authors.
            author_names = []
            if paper.get("authors"):
                author_names = [
                    entry.get("name", "")
                    for entry in paper["authors"]
                    if entry and entry.get("name")
                ]

            paper_id = paper.get("paperId") or ""

            # Snippet: the abstract, truncated with an ellipsis when too long.
            abstract = paper.get("abstract")
            if not abstract:
                snippet = ""
            elif len(abstract) > SNIPPET_LENGTH_SHORT:
                snippet = abstract[:SNIPPET_LENGTH_SHORT] + "..."
            else:
                snippet = abstract

            # TLDR is only used when it is a non-empty dict.
            tldr = paper.get("tldr")
            tldr_text = ""
            if tldr and isinstance(tldr, dict):
                tldr_text = tldr.get("text", "")

            previews.append(
                {
                    "id": paper_id,
                    "title": paper.get("title") or "",
                    "link": paper.get("url") or "",
                    "snippet": snippet,
                    "authors": author_names,
                    "venue": paper.get("venue") or "",
                    "year": paper.get("year"),
                    "external_ids": paper.get("externalIds") or {},
                    "source": "Semantic Scholar",
                    "_paper_id": paper_id,
                    "_search_strategy": strategy,
                    "tldr": tldr_text,
                    # Full paper object kept for later reference.
                    "_full_paper": paper,
                }
            )
        except Exception:
            logger.exception("Error processing paper preview")
            # Continue with the next paper

    # Newer first; a missing year counts as 0. The sort is stable, so papers
    # of the same year keep the order in which the API returned them.
    previews.sort(
        key=lambda p: p.get("year") if p.get("year") is not None else 0,
        reverse=True,
    )

    logger.info(
        f"Found {len(previews)} Semantic Scholar previews using strategy: {strategy}"
    )
    return previews

618

def _get_full_content(
    self, relevant_items: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
    """
    Get full content for the relevant Semantic Scholar papers.

    Most content is already present in the previews. An extra detail request
    per paper is made only when citations, references or embeddings were
    requested and the preview carries a paper id; in that case the requested
    extras plus ``fields_of_study`` are merged into the result.

    The internal bookkeeping keys ``_paper_id``, ``_search_strategy`` and
    ``_full_paper`` are removed from every result, including items that have
    no paper id.

    Args:
        relevant_items: List of relevant preview dictionaries

    Returns:
        List of result dictionaries with full content (shallow copies of the
        input items; the inputs themselves are not modified)
    """
    logger.info(
        f"Getting content for {len(relevant_items)} Semantic Scholar papers"
    )

    temporary_keys = ("_paper_id", "_search_strategy", "_full_paper")
    needs_details = (
        self.get_citations or self.get_references or self.get_embeddings
    )

    results = []
    for item in relevant_items:
        result = item.copy()
        paper_id = item.get("_paper_id", "")

        # Details can only be fetched for items that carry a paper id.
        if paper_id and needs_details:
            paper_details = self._get_paper_details(paper_id)

            if paper_details:
                # Add citation information
                if self.get_citations and "citations" in paper_details:
                    result["citations"] = paper_details["citations"]

                # Add reference information
                if self.get_references and "references" in paper_details:
                    result["references"] = paper_details["references"]

                # Add embedding if available
                if self.get_embeddings and "embedding" in paper_details:
                    result["embedding"] = paper_details["embedding"]

                # Add fields of study
                if "fieldsOfStudy" in paper_details:
                    result["fields_of_study"] = paper_details["fieldsOfStudy"]

        # Remove temporary fields from every result, with or without an id.
        for key in temporary_keys:
            result.pop(key, None)

        results.append(result)

    return results

Coverage for src / local_deep_research / web_search_engines / engines / search_engine_semantic_scholar.py: 66%

255 statements