Coverage for src / local_deep_research / research_library / downloaders / semantic_scholar.py: 67%
71 statements
« prev ^ index » next coverage.py v7.12.0, created at 2026-01-11 00:51 +0000
1"""
2Semantic Scholar PDF Downloader
4Downloads PDFs from Semantic Scholar using their API to find open access PDFs.
5"""
7import re
8from typing import Optional
9from urllib.parse import urlparse
11import requests
12from loguru import logger
14from .base import BaseDownloader, ContentType, DownloadResult
class SemanticScholarDownloader(BaseDownloader):
    """Downloader for Semantic Scholar papers with open access PDF support.

    Resolves a Semantic Scholar paper URL to its open-access PDF (when one
    exists) via the Semantic Scholar Graph API, then downloads the PDF bytes.
    """

    def __init__(self, timeout: int = 30, api_key: Optional[str] = None):
        """
        Initialize Semantic Scholar downloader.

        Args:
            timeout: Request timeout in seconds
            api_key: Optional Semantic Scholar API key for higher rate limits
        """
        super().__init__(timeout)
        self.api_key = api_key
        self.base_api_url = "https://api.semanticscholar.org/graph/v1"

    def can_handle(self, url: str) -> bool:
        """Check if URL is from Semantic Scholar.

        Accepts the bare domain and any subdomain (e.g. ``www.`` / ``api.``).
        A plain substring test would also match look-alike hosts such as
        ``evilsemanticscholar.org``, so we compare the parsed hostname exactly
        or by dotted-suffix.
        """
        try:
            hostname = urlparse(url).hostname
            return bool(
                hostname
                and (
                    hostname == "semanticscholar.org"
                    or hostname.endswith(".semanticscholar.org")
                )
            )
        except (ValueError, AttributeError, TypeError):
            # Malformed / non-string URLs are simply "not ours".
            return False

    def download(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> Optional[bytes]:
        """Download content from Semantic Scholar.

        Thin wrapper over :meth:`download_with_result` that discards the
        skip-reason detail and returns raw bytes (or ``None`` on failure).
        """
        result = self.download_with_result(url, content_type)
        return result.content if result.is_success else None

    def download_with_result(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> DownloadResult:
        """Download PDF and return detailed result with skip reason.

        Args:
            url: Semantic Scholar paper page URL.
            content_type: Requested content type; only PDF is supported.

        Returns:
            DownloadResult with ``content`` set on success, otherwise a
            human-readable ``skip_reason``.
        """
        # Only support PDF downloads for now
        if content_type != ContentType.PDF:
            return DownloadResult(
                skip_reason="Text extraction not yet supported for Semantic Scholar"
            )

        # Extract paper ID from URL
        paper_id = self._extract_paper_id(url)
        if not paper_id:
            return DownloadResult(
                skip_reason="Invalid Semantic Scholar URL - could not extract paper ID"
            )

        logger.info(f"Looking up Semantic Scholar paper: {paper_id}")

        # Get paper details from API to find PDF URL
        pdf_url = self._get_pdf_url(paper_id)

        if not pdf_url:
            return DownloadResult(
                skip_reason="Not open access - subscription required"
            )

        # Download the PDF from the open access URL
        logger.info(f"Downloading open access PDF from: {pdf_url}")
        pdf_content = super()._download_pdf(pdf_url)

        if pdf_content:
            return DownloadResult(content=pdf_content, is_success=True)
        else:
            return DownloadResult(
                skip_reason="Open access PDF URL found but download failed"
            )

    def _extract_paper_id(self, url: str) -> Optional[str]:
        """
        Extract Semantic Scholar paper ID from URL.

        Handles formats like:
        - https://www.semanticscholar.org/paper/abc123...
        - https://www.semanticscholar.org/paper/Title-Here/abc123...

        Returns:
            Paper ID (40-char hex hash) or None if not found
        """
        # Reuse can_handle() so host validation stays consistent with the
        # rest of the class: a substring check on netloc would wrongly
        # accept look-alike domains (e.g. "evilsemanticscholar.org").
        if not self.can_handle(url):
            return None

        # Extract paper ID from path (exactly 40 hex chars).
        # Handles /paper/{hash} or /paper/{title}/{hash}; the trailing \b
        # prevents matching the first 40 chars of a longer hex run.
        path = urlparse(url).path
        match = re.search(r"/paper/(?:[^/]+/)?([a-f0-9]{40})\b", path)
        return match.group(1) if match else None

    def _get_pdf_url(self, paper_id: str) -> Optional[str]:
        """
        Get open access PDF URL from Semantic Scholar API.

        Args:
            paper_id: Semantic Scholar paper ID (hash)

        Returns:
            PDF URL if available, None otherwise
        """
        try:
            # Construct API request; request only the field we need.
            api_url = f"{self.base_api_url}/paper/{paper_id}"
            params = {"fields": "openAccessPdf"}

            # Add API key header if available (raises rate limits)
            headers = {}
            if self.api_key:
                headers["x-api-key"] = self.api_key

            # Make API request
            response = self.session.get(
                api_url, params=params, headers=headers, timeout=self.timeout
            )

            if response.status_code == 200:
                data = response.json()

                # Extract PDF URL from openAccessPdf field; the API returns
                # null (not a dict) when the paper is not open access.
                open_access_pdf = data.get("openAccessPdf")
                if open_access_pdf and isinstance(open_access_pdf, dict):
                    pdf_url = open_access_pdf.get("url")
                    if pdf_url:
                        logger.info(
                            f"Found open access PDF for paper {paper_id}: {pdf_url}"
                        )
                        return pdf_url

                logger.info(
                    f"No open access PDF available for paper {paper_id}"
                )
                return None

            elif response.status_code == 404:
                logger.warning(
                    f"Paper not found in Semantic Scholar: {paper_id}"
                )
                return None
            else:
                logger.warning(
                    f"Semantic Scholar API error: {response.status_code}"
                )
                return None

        except requests.exceptions.RequestException:
            logger.exception("Failed to query Semantic Scholar API")
            return None
        except ValueError:
            # JSON decode errors are expected runtime errors
            logger.exception("Failed to parse Semantic Scholar API response")
            return None
        # Note: KeyError and TypeError are not caught - they indicate programming
        # bugs that should propagate for debugging