Coverage for src / local_deep_research / research_library / downloaders / semantic_scholar.py: 95%
71 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
1"""
2Semantic Scholar PDF Downloader
4Downloads PDFs from Semantic Scholar using their API to find open access PDFs.
5"""
7import re
8from typing import Optional
9from urllib.parse import urlparse
11import requests
12from loguru import logger
14from .base import BaseDownloader, ContentType, DownloadResult
class SemanticScholarDownloader(BaseDownloader):
    """Downloader for Semantic Scholar papers with open access PDF support.

    Resolves a Semantic Scholar paper URL to a paper ID, asks the Semantic
    Scholar Graph API for an ``openAccessPdf`` link, and downloads the PDF
    via the base class helper. Only PDF content is supported.
    """

    def __init__(self, timeout: int = 30, api_key: Optional[str] = None):
        """
        Initialize Semantic Scholar downloader.

        Args:
            timeout: Request timeout in seconds
            api_key: Optional Semantic Scholar API key for higher rate limits
        """
        super().__init__(timeout)
        self.api_key = api_key
        self.base_api_url = "https://api.semanticscholar.org/graph/v1"

    def can_handle(self, url: str) -> bool:
        """Check if URL is from Semantic Scholar (exact host or subdomain)."""
        try:
            hostname = urlparse(url).hostname
            # Exact match or true subdomain only -- a plain substring test
            # would also accept spoofed hosts like "notsemanticscholar.org".
            return bool(
                hostname
                and (
                    hostname == "semanticscholar.org"
                    or hostname.endswith(".semanticscholar.org")
                )
            )
        except (ValueError, AttributeError, TypeError):
            # Malformed URL / non-string input: simply not handled.
            return False

    def download(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> Optional[bytes]:
        """Download content from Semantic Scholar.

        Thin convenience wrapper around download_with_result() that
        discards the skip reason.

        Returns:
            PDF bytes on success, None on any failure.
        """
        result = self.download_with_result(url, content_type)
        return result.content if result.is_success else None

    def download_with_result(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> DownloadResult:
        """Download PDF and return detailed result with skip reason.

        Args:
            url: Semantic Scholar paper URL
            content_type: Requested content type; only ContentType.PDF is
                supported, anything else is skipped with a reason.

        Returns:
            DownloadResult with content on success, or a skip_reason
            explaining why no content was produced.
        """
        # Only support PDF downloads for now
        if content_type != ContentType.PDF:
            return DownloadResult(
                skip_reason="Text extraction not yet supported for Semantic Scholar"
            )

        # Extract paper ID from URL
        paper_id = self._extract_paper_id(url)
        if not paper_id:
            return DownloadResult(
                skip_reason="Invalid Semantic Scholar URL - could not extract paper ID"
            )

        logger.info(f"Looking up Semantic Scholar paper: {paper_id}")

        # Get paper details from API to find PDF URL
        pdf_url = self._get_pdf_url(paper_id)

        if not pdf_url:
            return DownloadResult(
                skip_reason="Not open access - subscription required"
            )

        # Download the PDF from the open access URL
        logger.info(f"Downloading open access PDF from: {pdf_url}")
        pdf_content = super()._download_pdf(pdf_url)

        if pdf_content:
            return DownloadResult(content=pdf_content, is_success=True)
        return DownloadResult(
            skip_reason="Open access PDF URL found but download failed"
        )

    def _extract_paper_id(self, url: str) -> Optional[str]:
        """
        Extract Semantic Scholar paper ID from URL.

        Handles formats like:
        - https://www.semanticscholar.org/paper/abc123...
        - https://www.semanticscholar.org/paper/Title-Here/abc123...

        Returns:
            Paper ID (40-char hex hash) or None if not found
        """
        # Delegate host validation to can_handle() so both paths use the
        # same exact-host/subdomain rule (a bare substring test on netloc
        # would accept look-alike domains).
        if not self.can_handle(url):
            return None

        # Use urlparse for robust URL handling (query strings, fragments)
        parsed = urlparse(url)

        # Extract paper ID from path (40 character hex hash)
        # Handles /paper/{hash} or /paper/{title}/{hash}
        path = parsed.path
        match = re.search(r"/paper/(?:[^/]+/)?([a-f0-9]{40})", path)
        return match.group(1) if match else None

    def _get_pdf_url(self, paper_id: str) -> Optional[str]:
        """
        Get open access PDF URL from Semantic Scholar API.

        Args:
            paper_id: Semantic Scholar paper ID (hash)

        Returns:
            PDF URL if available, None otherwise
        """
        try:
            # Construct API request; request only the field we need.
            api_url = f"{self.base_api_url}/paper/{paper_id}"
            params = {"fields": "openAccessPdf"}

            # Add API key header if available (raises the rate limit)
            headers = {}
            if self.api_key:
                headers["x-api-key"] = self.api_key

            # Make API request
            response = self.session.get(
                api_url, params=params, headers=headers, timeout=self.timeout
            )

            if response.status_code == 200:
                data = response.json()

                # Extract PDF URL from openAccessPdf field; the field may be
                # null or malformed, so guard both presence and shape.
                open_access_pdf = data.get("openAccessPdf")
                if open_access_pdf and isinstance(open_access_pdf, dict):
                    pdf_url = open_access_pdf.get("url")
                    if pdf_url:
                        logger.info(
                            f"Found open access PDF for paper {paper_id}: {pdf_url}"
                        )
                        return str(pdf_url)

                logger.info(
                    f"No open access PDF available for paper {paper_id}"
                )
                return None

            if response.status_code == 404:
                logger.warning(
                    f"Paper not found in Semantic Scholar: {paper_id}"
                )
                return None
            logger.warning(
                f"Semantic Scholar API error: {response.status_code}"
            )
            return None

        except requests.exceptions.RequestException:
            logger.exception("Failed to query Semantic Scholar API")
            return None
        except ValueError:
            # JSON decode errors are expected runtime errors
            logger.exception("Failed to parse Semantic Scholar API response")
            return None
        # Note: KeyError and TypeError are not caught - they indicate programming
        # bugs that should propagate for debugging