Coverage for src / local_deep_research / research_library / downloaders / semantic_scholar.py: 95%
71 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
1"""
2Semantic Scholar PDF Downloader
4Downloads PDFs from Semantic Scholar using their API to find open access PDFs.
5"""
7import re
8from typing import Optional
9from urllib.parse import urlparse
11import requests
12from loguru import logger
14from .base import BaseDownloader, ContentType, DownloadResult
class SemanticScholarDownloader(BaseDownloader):
    """Downloader for Semantic Scholar papers with open access PDF support.

    Resolves a Semantic Scholar paper URL to a paper ID, asks the Semantic
    Scholar Graph API for an ``openAccessPdf`` link, and downloads the PDF
    via the base class helper. Only PDF content is supported.
    """

    def __init__(self, timeout: int = 30, api_key: Optional[str] = None):
        """
        Initialize Semantic Scholar downloader.

        Args:
            timeout: Request timeout in seconds
            api_key: Optional Semantic Scholar API key for higher rate limits
        """
        super().__init__(timeout)
        self.api_key = api_key
        self.base_api_url = "https://api.semanticscholar.org/graph/v1"

    def can_handle(self, url: str) -> bool:
        """Check if URL is from Semantic Scholar (exact host or subdomain)."""
        try:
            hostname = urlparse(url).hostname
            # Exact match or true subdomain only -- a plain substring test
            # would also accept spoofed hosts like "notsemanticscholar.org".
            return bool(
                hostname
                and (
                    hostname == "semanticscholar.org"
                    or hostname.endswith(".semanticscholar.org")
                )
            )
        except (ValueError, AttributeError, TypeError):
            # Malformed URL / non-string input: simply not handled.
            return False

    def download(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> Optional[bytes]:
        """Download content from Semantic Scholar.

        Thin convenience wrapper around download_with_result() that
        discards the skip reason.

        Returns:
            PDF bytes on success, None on any failure.
        """
        result = self.download_with_result(url, content_type)
        return result.content if result.is_success else None

    def download_with_result(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> DownloadResult:
        """Download PDF and return detailed result with skip reason.

        Args:
            url: Semantic Scholar paper URL
            content_type: Requested content type; only ContentType.PDF is
                supported, anything else is skipped with a reason.

        Returns:
            DownloadResult with content on success, or a skip_reason
            explaining why no content was produced.
        """
        # Only support PDF downloads for now
        if content_type != ContentType.PDF:
            return DownloadResult(
                skip_reason="Text extraction not yet supported for Semantic Scholar"
            )

        # Extract paper ID from URL
        paper_id = self._extract_paper_id(url)
        if not paper_id:
            return DownloadResult(
                skip_reason="Invalid Semantic Scholar URL - could not extract paper ID"
            )

        logger.info(f"Looking up Semantic Scholar paper: {paper_id}")

        # Get paper details from API to find PDF URL
        pdf_url = self._get_pdf_url(paper_id)

        if not pdf_url:
            return DownloadResult(
                skip_reason="Not open access - subscription required"
            )

        # Download the PDF from the open access URL
        logger.info(f"Downloading open access PDF from: {pdf_url}")
        pdf_content = super()._download_pdf(pdf_url)

        if pdf_content:
            return DownloadResult(content=pdf_content, is_success=True)
        return DownloadResult(
            skip_reason="Open access PDF URL found but download failed"
        )

    def _extract_paper_id(self, url: str) -> Optional[str]:
        """
        Extract Semantic Scholar paper ID from URL.

        Handles formats like:
        - https://www.semanticscholar.org/paper/abc123...
        - https://www.semanticscholar.org/paper/Title-Here/abc123...

        Returns:
            Paper ID (40-char hex hash) or None if not found
        """
        # Delegate host validation to can_handle() so both paths use the
        # same exact-host/subdomain rule (a bare substring test on netloc
        # would accept look-alike domains).
        if not self.can_handle(url):
            return None

        # Use urlparse for robust URL handling (query strings, fragments)
        parsed = urlparse(url)

        # Extract paper ID from path (40 character hex hash)
        # Handles /paper/{hash} or /paper/{title}/{hash}
        path = parsed.path
        match = re.search(r"/paper/(?:[^/]+/)?([a-f0-9]{40})", path)
        return match.group(1) if match else None

    def _get_pdf_url(self, paper_id: str) -> Optional[str]:
        """
        Get open access PDF URL from Semantic Scholar API.

        Args:
            paper_id: Semantic Scholar paper ID (hash)

        Returns:
            PDF URL if available, None otherwise
        """
        try:
            # Construct API request; request only the field we need.
            api_url = f"{self.base_api_url}/paper/{paper_id}"
            params = {"fields": "openAccessPdf"}

            # Add API key header if available (raises the rate limit)
            headers = {}
            if self.api_key:
                headers["x-api-key"] = self.api_key

            # Make API request
            response = self.session.get(
                api_url, params=params, headers=headers, timeout=self.timeout
            )

            if response.status_code == 200:
                data = response.json()

                # Extract PDF URL from openAccessPdf field; the field may be
                # null or malformed, so guard both presence and shape.
                open_access_pdf = data.get("openAccessPdf")
                if open_access_pdf and isinstance(open_access_pdf, dict):
                    pdf_url = open_access_pdf.get("url")
                    if pdf_url:
                        logger.info(
                            f"Found open access PDF for paper {paper_id}: {pdf_url}"
                        )
                        return str(pdf_url)

                logger.info(
                    f"No open access PDF available for paper {paper_id}"
                )
                return None

            if response.status_code == 404:
                logger.warning(
                    f"Paper not found in Semantic Scholar: {paper_id}"
                )
                return None
            logger.warning(
                f"Semantic Scholar API error: {response.status_code}"
            )
            return None

        except requests.exceptions.RequestException:
            logger.exception("Failed to query Semantic Scholar API")
            return None
        except ValueError:
            # JSON decode errors are expected runtime errors
            logger.exception("Failed to parse Semantic Scholar API response")
            return None
        # Note: KeyError and TypeError are not caught - they indicate programming
        # bugs that should propagate for debugging