Coverage for src / local_deep_research / research_library / downloaders / openalex.py: 97%

88 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1""" 

2OpenAlex PDF Downloader 

3 

4Downloads PDFs from OpenAlex using their API to find open access PDFs. 

5OpenAlex aggregates open access information from multiple sources. 

6""" 

7 

8import re 

9from typing import Optional 

10from urllib.parse import urlparse 

11 

12import requests 

13from loguru import logger 

14 

15from .base import BaseDownloader, ContentType, DownloadResult 

16 

17 

class OpenAlexDownloader(BaseDownloader):
    """Downloader for OpenAlex papers with open access PDF support."""

    def __init__(
        self, timeout: int = 30, polite_pool_email: Optional[str] = None
    ):
        """
        Initialize OpenAlex downloader.

        Args:
            timeout: Request timeout in seconds
            polite_pool_email: Optional email for polite pool (faster API access)
        """
        super().__init__(timeout)
        self.polite_pool_email = polite_pool_email
        self.base_api_url = "https://api.openalex.org"

    @staticmethod
    def _is_openalex_host(hostname: Optional[str]) -> bool:
        """Return True if *hostname* is openalex.org or one of its subdomains."""
        return bool(
            hostname
            and (
                hostname == "openalex.org"
                or hostname.endswith(".openalex.org")
            )
        )

    def can_handle(self, url: str) -> bool:
        """Check if URL is from OpenAlex."""
        try:
            return self._is_openalex_host(urlparse(url).hostname)
        except (ValueError, AttributeError, TypeError):
            # urlparse can raise on malformed input (e.g. non-string url)
            return False

    def download(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> Optional[bytes]:
        """Download content from OpenAlex.

        Thin wrapper around download_with_result that discards skip reasons.

        Returns:
            Raw PDF bytes on success, None otherwise.
        """
        result = self.download_with_result(url, content_type)
        return result.content if result.is_success else None

    def download_with_result(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> DownloadResult:
        """Download PDF and return detailed result with skip reason.

        Args:
            url: OpenAlex work URL (e.g. https://openalex.org/W123456789)
            content_type: Requested content type; only PDF is supported.

        Returns:
            DownloadResult with content on success, or a skip_reason
            explaining why nothing was downloaded.
        """
        # Only support PDF downloads for now
        if content_type != ContentType.PDF:
            return DownloadResult(
                skip_reason="Text extraction not yet supported for OpenAlex"
            )

        # Extract work ID from URL
        work_id = self._extract_work_id(url)
        if not work_id:
            return DownloadResult(
                skip_reason="Invalid OpenAlex URL - could not extract work ID"
            )

        logger.info(f"Looking up OpenAlex work: {work_id}")

        # Get work details from API to find PDF URL
        pdf_url = self._get_pdf_url(work_id)

        if not pdf_url:
            return DownloadResult(
                skip_reason="Not open access - no free PDF available"
            )

        # Download the PDF from the open access URL
        logger.info(f"Downloading open access PDF from: {pdf_url}")
        pdf_content = super()._download_pdf(pdf_url)

        if pdf_content:
            return DownloadResult(content=pdf_content, is_success=True)
        return DownloadResult(
            skip_reason="Open access PDF URL found but download failed"
        )

    def _extract_work_id(self, url: str) -> Optional[str]:
        """
        Extract OpenAlex work ID from URL.

        Handles formats like:
        - https://openalex.org/W123456789
        - https://openalex.org/works/W123456789

        Returns:
            Work ID (e.g., W123456789) or None if not found
        """
        # Use urlparse for more robust URL handling (handles query strings, fragments)
        parsed = urlparse(url)
        # Exact/suffix hostname match, consistent with can_handle(). The
        # previous substring test ("openalex.org" in netloc) wrongly accepted
        # look-alike hosts such as "notopenalex.org" or "openalex.org.evil.com".
        if not self._is_openalex_host(parsed.hostname):
            return None

        # Extract work ID from path (W followed by digits)
        # Handles /works/W123 or /W123
        path = parsed.path
        match = re.search(r"(?:/works/)?(W\d+)", path)
        return match.group(1) if match else None

    def _get_pdf_url(self, work_id: str) -> Optional[str]:
        """
        Get open access PDF URL from OpenAlex API.

        Args:
            work_id: OpenAlex work ID (e.g., W123456789)

        Returns:
            PDF URL if available, None otherwise
        """
        try:
            # Construct API request
            api_url = f"{self.base_api_url}/works/{work_id}"
            params = {"select": "id,open_access,best_oa_location"}

            # Add polite pool email if available (gets faster API access)
            headers = {}
            if self.polite_pool_email:
                headers["User-Agent"] = f"mailto:{self.polite_pool_email}"

            # Make API request
            response = self.session.get(
                api_url, params=params, headers=headers, timeout=self.timeout
            )

            if response.status_code == 200:
                data = response.json()

                # Check if it's open access
                open_access_info = data.get("open_access", {})
                is_oa = open_access_info.get("is_oa", False)

                if not is_oa:
                    logger.info(f"Work {work_id} is not open access")
                    return None

                # Get PDF URL from best open access location
                best_oa_location = data.get("best_oa_location", {})
                if best_oa_location:
                    # Try pdf_url first, fall back to landing_page_url
                    pdf_url = best_oa_location.get("pdf_url")
                    if pdf_url:
                        logger.info(
                            f"Found open access PDF for work {work_id}: {pdf_url}"
                        )
                        return str(pdf_url)

                    # Some works have landing page but no direct PDF
                    landing_url = best_oa_location.get("landing_page_url")
                    if landing_url:
                        logger.info(
                            f"Found landing page for work {work_id}: {landing_url}"
                        )
                        # Validate that landing page is actually a PDF before returning
                        try:
                            head_response = self.session.head(
                                landing_url,
                                timeout=self.timeout,
                                allow_redirects=True,
                            )
                            content_type = head_response.headers.get(
                                "Content-Type", ""
                            ).lower()
                            if "application/pdf" in content_type:
                                logger.info(
                                    f"Landing page is a direct PDF link for work {work_id}"
                                )
                                return str(landing_url)
                            logger.info(
                                f"Landing page is not a PDF (Content-Type: {content_type}), skipping"
                            )
                        except Exception:
                            # Best-effort validation; fall through to "no PDF"
                            logger.exception(
                                f"Failed to validate landing page URL for work {work_id}"
                            )

                logger.info(
                    f"No PDF URL available for open access work {work_id}"
                )
                return None

            if response.status_code == 404:
                logger.warning(f"Work not found in OpenAlex: {work_id}")
                return None
            logger.warning(f"OpenAlex API error: {response.status_code}")
            return None

        except requests.exceptions.RequestException:
            logger.exception("Failed to query OpenAlex API")
            return None
        except ValueError:
            # JSON decode errors are expected runtime errors
            logger.exception("Failed to parse OpenAlex API response")
            return None
        # Note: KeyError and TypeError are not caught - they indicate programming
        # bugs that should propagate for debugging