Coverage for src / local_deep_research / research_library / downloaders / semantic_scholar.py: 95%

71 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1""" 

2Semantic Scholar PDF Downloader 

3 

4Downloads PDFs from Semantic Scholar using their API to find open access PDFs. 

5""" 

6 

7import re 

8from typing import Optional 

9from urllib.parse import urlparse 

10 

11import requests 

12from loguru import logger 

13 

14from .base import BaseDownloader, ContentType, DownloadResult 

15 

16 

class SemanticScholarDownloader(BaseDownloader):
    """Downloader for Semantic Scholar papers with open access PDF support."""

    def __init__(self, timeout: int = 30, api_key: Optional[str] = None):
        """
        Initialize Semantic Scholar downloader.

        Args:
            timeout: Request timeout in seconds
            api_key: Optional Semantic Scholar API key for higher rate limits
        """
        super().__init__(timeout)
        self.api_key = api_key
        self.base_api_url = "https://api.semanticscholar.org/graph/v1"

    @staticmethod
    def _is_semantic_scholar_host(hostname: Optional[str]) -> bool:
        """
        Check whether a hostname is semanticscholar.org or one of its subdomains.

        Uses an exact-match / suffix check rather than a substring test so that
        spoofed hosts (e.g. "semanticscholar.org.evil.com") are rejected.
        urlparse().hostname is already lowercased and stripped of port/userinfo.
        """
        return bool(
            hostname
            and (
                hostname == "semanticscholar.org"
                or hostname.endswith(".semanticscholar.org")
            )
        )

    def can_handle(self, url: str) -> bool:
        """Check if URL is from Semantic Scholar."""
        try:
            return self._is_semantic_scholar_host(urlparse(url).hostname)
        except (ValueError, AttributeError, TypeError):
            # Malformed URLs (or non-string input) are simply not handled.
            return False

    def download(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> Optional[bytes]:
        """Download content from Semantic Scholar.

        Thin wrapper around download_with_result() that discards the
        skip reason and returns raw bytes (or None on any failure).
        """
        result = self.download_with_result(url, content_type)
        return result.content if result.is_success else None

    def download_with_result(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> DownloadResult:
        """Download PDF and return detailed result with skip reason.

        Args:
            url: Semantic Scholar paper page URL
            content_type: Requested content type; only PDF is supported

        Returns:
            DownloadResult with content on success, or a skip_reason
            explaining why nothing was downloaded.
        """
        # Only support PDF downloads for now
        if content_type != ContentType.PDF:
            return DownloadResult(
                skip_reason="Text extraction not yet supported for Semantic Scholar"
            )

        # Extract paper ID from URL
        paper_id = self._extract_paper_id(url)
        if not paper_id:
            return DownloadResult(
                skip_reason="Invalid Semantic Scholar URL - could not extract paper ID"
            )

        logger.info(f"Looking up Semantic Scholar paper: {paper_id}")

        # Get paper details from API to find PDF URL
        pdf_url = self._get_pdf_url(paper_id)

        if not pdf_url:
            return DownloadResult(
                skip_reason="Not open access - subscription required"
            )

        # Download the PDF from the open access URL.
        # NOTE: super() call preserved from original — it bypasses any
        # subclass override of _download_pdf; presumably intentional.
        logger.info(f"Downloading open access PDF from: {pdf_url}")
        pdf_content = super()._download_pdf(pdf_url)

        if pdf_content:
            return DownloadResult(content=pdf_content, is_success=True)
        return DownloadResult(
            skip_reason="Open access PDF URL found but download failed"
        )

    def _extract_paper_id(self, url: str) -> Optional[str]:
        """
        Extract Semantic Scholar paper ID from URL.

        Handles formats like:
        - https://www.semanticscholar.org/paper/abc123...
        - https://www.semanticscholar.org/paper/Title-Here/abc123...

        Returns:
            Paper ID (40-char hex hash) or None if not found
        """
        # Use urlparse for robust URL handling (query strings, fragments),
        # and validate the host with the same exact-hostname check as
        # can_handle() — a plain substring test would accept spoofed domains.
        parsed = urlparse(url)
        if not self._is_semantic_scholar_host(parsed.hostname):
            return None

        # Extract paper ID from path (40 character hex hash)
        # Handles /paper/{hash} or /paper/{title}/{hash}
        match = re.search(r"/paper/(?:[^/]+/)?([a-f0-9]{40})", parsed.path)
        return match.group(1) if match else None

    def _get_pdf_url(self, paper_id: str) -> Optional[str]:
        """
        Get open access PDF URL from Semantic Scholar API.

        Args:
            paper_id: Semantic Scholar paper ID (hash)

        Returns:
            PDF URL if available, None otherwise
        """
        try:
            # Construct API request for just the openAccessPdf field
            api_url = f"{self.base_api_url}/paper/{paper_id}"
            params = {"fields": "openAccessPdf"}

            # Add API key header if available (raises rate limits)
            headers = {}
            if self.api_key:
                headers["x-api-key"] = self.api_key

            # Make API request
            response = self.session.get(
                api_url, params=params, headers=headers, timeout=self.timeout
            )

            if response.status_code == 200:
                data = response.json()

                # Extract PDF URL from openAccessPdf field; the field may be
                # null or missing when the paper has no open access copy.
                open_access_pdf = data.get("openAccessPdf")
                if open_access_pdf and isinstance(open_access_pdf, dict):
                    pdf_url = open_access_pdf.get("url")
                    if pdf_url:
                        logger.info(
                            f"Found open access PDF for paper {paper_id}: {pdf_url}"
                        )
                        return str(pdf_url)

                logger.info(
                    f"No open access PDF available for paper {paper_id}"
                )
                return None

            if response.status_code == 404:
                logger.warning(
                    f"Paper not found in Semantic Scholar: {paper_id}"
                )
                return None
            logger.warning(
                f"Semantic Scholar API error: {response.status_code}"
            )
            return None

        except requests.exceptions.RequestException:
            logger.exception("Failed to query Semantic Scholar API")
            return None
        except ValueError:
            # JSON decode errors are expected runtime errors
            logger.exception("Failed to parse Semantic Scholar API response")
            return None
        # Note: KeyError and TypeError are not caught - they indicate programming
        # bugs that should propagate for debugging