Coverage for src / local_deep_research / research_library / downloaders / semantic_scholar.py: 67%

71 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

1""" 

2Semantic Scholar PDF Downloader 

3 

4Downloads PDFs from Semantic Scholar using their API to find open access PDFs. 

5""" 

6 

7import re 

8from typing import Optional 

9from urllib.parse import urlparse 

10 

11import requests 

12from loguru import logger 

13 

14from .base import BaseDownloader, ContentType, DownloadResult 

15 

16 

class SemanticScholarDownloader(BaseDownloader):
    """Downloader for Semantic Scholar papers with open access PDF support.

    Resolves a Semantic Scholar paper URL to the paper's 40-character hex ID,
    queries the Graph API for an ``openAccessPdf`` link, and downloads the PDF
    bytes from that link. Only PDF content is supported for now.
    """

    def __init__(self, timeout: int = 30, api_key: Optional[str] = None):
        """
        Initialize Semantic Scholar downloader.

        Args:
            timeout: Request timeout in seconds
            api_key: Optional Semantic Scholar API key for higher rate limits
        """
        super().__init__(timeout)
        self.api_key = api_key
        self.base_api_url = "https://api.semanticscholar.org/graph/v1"

    @staticmethod
    def _is_semantic_scholar_hostname(hostname: Optional[str]) -> bool:
        """Return True if hostname is semanticscholar.org or a subdomain of it."""
        return bool(
            hostname
            and (
                hostname == "semanticscholar.org"
                or hostname.endswith(".semanticscholar.org")
            )
        )

    def can_handle(self, url: str) -> bool:
        """Check if URL is from Semantic Scholar."""
        try:
            return self._is_semantic_scholar_hostname(urlparse(url).hostname)
        except (ValueError, AttributeError, TypeError):
            # Malformed or non-string URLs are simply not ours to handle.
            return False

    def download(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> Optional[bytes]:
        """Download content from Semantic Scholar.

        Thin convenience wrapper around :meth:`download_with_result`.

        Returns:
            PDF bytes on success, None on any failure or skip.
        """
        result = self.download_with_result(url, content_type)
        return result.content if result.is_success else None

    def download_with_result(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> DownloadResult:
        """Download PDF and return detailed result with skip reason.

        Args:
            url: Semantic Scholar paper URL.
            content_type: Requested content type; only ``ContentType.PDF``
                is currently supported.

        Returns:
            DownloadResult with ``content`` set on success, or a
            ``skip_reason`` explaining why no content was produced.
        """
        # Only support PDF downloads for now
        if content_type != ContentType.PDF:
            return DownloadResult(
                skip_reason="Text extraction not yet supported for Semantic Scholar"
            )

        # Extract paper ID from URL
        paper_id = self._extract_paper_id(url)
        if not paper_id:
            return DownloadResult(
                skip_reason="Invalid Semantic Scholar URL - could not extract paper ID"
            )

        logger.info(f"Looking up Semantic Scholar paper: {paper_id}")

        # Get paper details from API to find PDF URL
        pdf_url = self._get_pdf_url(paper_id)
        if not pdf_url:
            return DownloadResult(
                skip_reason="Not open access - subscription required"
            )

        # Download the PDF from the open access URL
        logger.info(f"Downloading open access PDF from: {pdf_url}")
        pdf_content = super()._download_pdf(pdf_url)

        if pdf_content:
            return DownloadResult(content=pdf_content, is_success=True)
        return DownloadResult(
            skip_reason="Open access PDF URL found but download failed"
        )

    def _extract_paper_id(self, url: str) -> Optional[str]:
        """
        Extract Semantic Scholar paper ID from URL.

        Handles formats like:
        - https://www.semanticscholar.org/paper/abc123...
        - https://www.semanticscholar.org/paper/Title-Here/abc123...

        Returns:
            Paper ID (40-character lowercase hex hash) or None if not found
        """
        # Use urlparse for robust URL handling (query strings, fragments).
        parsed = urlparse(url)

        # Validate the host with the same strict rule as can_handle(). The
        # previous substring test on netloc ("semanticscholar.org" in netloc)
        # wrongly accepted look-alike hosts such as "evil-semanticscholar.org"
        # and was case-sensitive; .hostname is lowercased and excludes
        # userinfo/port, so the suffix check here is both safer and laxer in
        # the right ways.
        if not self._is_semantic_scholar_hostname(parsed.hostname):
            return None

        # Extract paper ID from path (40 character hex hash).
        # Handles /paper/{hash} or /paper/{title}/{hash}
        match = re.search(r"/paper/(?:[^/]+/)?([a-f0-9]{40})", parsed.path)
        return match.group(1) if match else None

    def _get_pdf_url(self, paper_id: str) -> Optional[str]:
        """
        Get open access PDF URL from Semantic Scholar API.

        Args:
            paper_id: Semantic Scholar paper ID (hash)

        Returns:
            PDF URL if available, None otherwise
        """
        try:
            # Construct API request; only the openAccessPdf field is needed.
            api_url = f"{self.base_api_url}/paper/{paper_id}"
            params = {"fields": "openAccessPdf"}

            # Add API key header if available (raises rate limits; anonymous
            # access works too).
            headers = {}
            if self.api_key:
                headers["x-api-key"] = self.api_key

            # self.session is presumably created by BaseDownloader — it is
            # not set in this class.
            response = self.session.get(
                api_url, params=params, headers=headers, timeout=self.timeout
            )

            if response.status_code == 200:
                data = response.json()

                # openAccessPdf is either null or an object with a "url" key.
                open_access_pdf = data.get("openAccessPdf")
                if open_access_pdf and isinstance(open_access_pdf, dict):
                    pdf_url = open_access_pdf.get("url")
                    if pdf_url:
                        logger.info(
                            f"Found open access PDF for paper {paper_id}: {pdf_url}"
                        )
                        return pdf_url

                logger.info(
                    f"No open access PDF available for paper {paper_id}"
                )
                return None

            elif response.status_code == 404:
                logger.warning(
                    f"Paper not found in Semantic Scholar: {paper_id}"
                )
                return None
            else:
                logger.warning(
                    f"Semantic Scholar API error: {response.status_code}"
                )
                return None

        except requests.exceptions.RequestException:
            logger.exception("Failed to query Semantic Scholar API")
            return None
        except ValueError:
            # JSON decode errors are expected runtime errors
            logger.exception("Failed to parse Semantic Scholar API response")
            return None
        # Note: KeyError and TypeError are not caught - they indicate programming
        # bugs that should propagate for debugging