Coverage for src/local_deep_research/research_library/downloaders/openalex.py: 63%

88 statements  

coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

1""" 

2OpenAlex PDF Downloader 

3 

4Downloads PDFs from OpenAlex using their API to find open access PDFs. 

5OpenAlex aggregates open access information from multiple sources. 

6""" 

7 

8import re 

9from typing import Optional 

10from urllib.parse import urlparse 

11 

12import requests 

13from loguru import logger 

14 

15from .base import BaseDownloader, ContentType, DownloadResult 

16 

17 

18class OpenAlexDownloader(BaseDownloader): 

19 """Downloader for OpenAlex papers with open access PDF support.""" 

20 

21 def __init__( 

22 self, timeout: int = 30, polite_pool_email: Optional[str] = None 

23 ): 

24 """ 

25 Initialize OpenAlex downloader. 

26 

27 Args: 

28 timeout: Request timeout in seconds 

29 polite_pool_email: Optional email for polite pool (faster API access) 

30 """ 

31 super().__init__(timeout) 

32 self.polite_pool_email = polite_pool_email 

33 self.base_api_url = "https://api.openalex.org" 

34 

35 def can_handle(self, url: str) -> bool: 

36 """Check if URL is from OpenAlex.""" 

37 try: 

38 hostname = urlparse(url).hostname 

39 return bool( 

40 hostname 

41 and ( 

42 hostname == "openalex.org" 

43 or hostname.endswith(".openalex.org") 

44 ) 

45 ) 

46 except (ValueError, AttributeError, TypeError): 

47 return False 

48 

49 def download( 

50 self, url: str, content_type: ContentType = ContentType.PDF 

51 ) -> Optional[bytes]: 

52 """Download content from OpenAlex.""" 

53 result = self.download_with_result(url, content_type) 

54 return result.content if result.is_success else None 

55 

56 def download_with_result( 

57 self, url: str, content_type: ContentType = ContentType.PDF 

58 ) -> DownloadResult: 

59 """Download PDF and return detailed result with skip reason.""" 

60 # Only support PDF downloads for now 

61 if content_type != ContentType.PDF: 61 ↛ 62line 61 didn't jump to line 62 because the condition on line 61 was never true

62 return DownloadResult( 

63 skip_reason="Text extraction not yet supported for OpenAlex" 

64 ) 

65 

66 # Extract work ID from URL 

67 work_id = self._extract_work_id(url) 

68 if not work_id: 68 ↛ 69line 68 didn't jump to line 69 because the condition on line 68 was never true

69 return DownloadResult( 

70 skip_reason="Invalid OpenAlex URL - could not extract work ID" 

71 ) 

72 

73 logger.info(f"Looking up OpenAlex work: {work_id}") 

74 

75 # Get work details from API to find PDF URL 

76 pdf_url = self._get_pdf_url(work_id) 

77 

78 if not pdf_url: 

79 return DownloadResult( 

80 skip_reason="Not open access - no free PDF available" 

81 ) 

82 

83 # Download the PDF from the open access URL 

84 logger.info(f"Downloading open access PDF from: {pdf_url}") 

85 pdf_content = super()._download_pdf(pdf_url) 

86 

87 if pdf_content: 87 ↛ 88line 87 didn't jump to line 88 because the condition on line 87 was never true

88 return DownloadResult(content=pdf_content, is_success=True) 

89 else: 

90 return DownloadResult( 

91 skip_reason="Open access PDF URL found but download failed" 

92 ) 

93 

94 def _extract_work_id(self, url: str) -> Optional[str]: 

95 """ 

96 Extract OpenAlex work ID from URL. 

97 

98 Handles formats like: 

99 - https://openalex.org/W123456789 

100 - https://openalex.org/works/W123456789 

101 

102 Returns: 

103 Work ID (e.g., W123456789) or None if not found 

104 """ 

105 # Use urlparse for more robust URL handling (handles query strings, fragments) 

106 parsed = urlparse(url) 

107 if not parsed.netloc or "openalex.org" not in parsed.netloc: 

108 return None 

109 

110 # Extract work ID from path (W followed by digits) 

111 # Handles /works/W123 or /W123 

112 path = parsed.path 

113 match = re.search(r"(?:/works/)?(W\d+)", path) 

114 return match.group(1) if match else None 

115 

116 def _get_pdf_url(self, work_id: str) -> Optional[str]: 

117 """ 

118 Get open access PDF URL from OpenAlex API. 

119 

120 Args: 

121 work_id: OpenAlex work ID (e.g., W123456789) 

122 

123 Returns: 

124 PDF URL if available, None otherwise 

125 """ 

126 try: 

127 # Construct API request 

128 api_url = f"{self.base_api_url}/works/{work_id}" 

129 params = {"select": "id,open_access,best_oa_location"} 

130 

131 # Add polite pool email if available (gets faster API access) 

132 headers = {} 

133 if self.polite_pool_email: 133 ↛ 134line 133 didn't jump to line 134 because the condition on line 133 was never true

134 headers["User-Agent"] = f"mailto:{self.polite_pool_email}" 

135 

136 # Make API request 

137 response = self.session.get( 

138 api_url, params=params, headers=headers, timeout=self.timeout 

139 ) 

140 

141 if response.status_code == 200: 

142 data = response.json() 

143 

144 # Check if it's open access 

145 open_access_info = data.get("open_access", {}) 

146 is_oa = open_access_info.get("is_oa", False) 

147 

148 if not is_oa: 148 ↛ 149line 148 didn't jump to line 149 because the condition on line 148 was never true

149 logger.info(f"Work {work_id} is not open access") 

150 return None 

151 

152 # Get PDF URL from best open access location 

153 best_oa_location = data.get("best_oa_location", {}) 

154 if best_oa_location: 154 ↛ 193line 154 didn't jump to line 193 because the condition on line 154 was always true

155 # Try pdf_url first, fall back to landing_page_url 

156 pdf_url = best_oa_location.get("pdf_url") 

157 if pdf_url: 157 ↛ 164line 157 didn't jump to line 164 because the condition on line 157 was always true

158 logger.info( 

159 f"Found open access PDF for work {work_id}: {pdf_url}" 

160 ) 

161 return pdf_url 

162 

163 # Some works have landing page but no direct PDF 

164 landing_url = best_oa_location.get("landing_page_url") 

165 if landing_url: 

166 logger.info( 

167 f"Found landing page for work {work_id}: {landing_url}" 

168 ) 

169 # Validate that landing page is actually a PDF before returning 

170 try: 

171 head_response = self.session.head( 

172 landing_url, 

173 timeout=self.timeout, 

174 allow_redirects=True, 

175 ) 

176 content_type = head_response.headers.get( 

177 "Content-Type", "" 

178 ).lower() 

179 if "application/pdf" in content_type: 

180 logger.info( 

181 f"Landing page is a direct PDF link for work {work_id}" 

182 ) 

183 return landing_url 

184 else: 

185 logger.info( 

186 f"Landing page is not a PDF (Content-Type: {content_type}), skipping" 

187 ) 

188 except Exception: 

189 logger.exception( 

190 f"Failed to validate landing page URL for work {work_id}" 

191 ) 

192 

193 logger.info( 

194 f"No PDF URL available for open access work {work_id}" 

195 ) 

196 return None 

197 

198 elif response.status_code == 404: 198 ↛ 202line 198 didn't jump to line 202 because the condition on line 198 was always true

199 logger.warning(f"Work not found in OpenAlex: {work_id}") 

200 return None 

201 else: 

202 logger.warning(f"OpenAlex API error: {response.status_code}") 

203 return None 

204 

205 except requests.exceptions.RequestException: 

206 logger.exception("Failed to query OpenAlex API") 

207 return None 

208 except ValueError: 

209 # JSON decode errors are expected runtime errors 

210 logger.exception("Failed to parse OpenAlex API response") 

211 return None 

212 # Note: KeyError and TypeError are not caught - they indicate programming 

213 # bugs that should propagate for debugging
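
The class above is normally driven through can_handle() followed by download_with_result(). A minimal usage sketch follows; the import paths are assumed from the file path in the report header, and the work URL (W2741809807) is purely illustrative, not taken from the project's tests.

# Minimal usage sketch; module paths and the example work URL are assumptions.
from local_deep_research.research_library.downloaders.base import ContentType
from local_deep_research.research_library.downloaders.openalex import (
    OpenAlexDownloader,
)

downloader = OpenAlexDownloader(
    timeout=30,
    polite_pool_email="researcher@example.org",  # optional: joins the polite pool
)

url = "https://openalex.org/W2741809807"  # illustrative OpenAlex work URL
if downloader.can_handle(url):
    result = downloader.download_with_result(url, ContentType.PDF)
    if result.is_success:
        # Persist the PDF bytes returned by the downloader
        with open("paper.pdf", "wb") as fh:
            fh.write(result.content)
    else:
        print(f"Skipped: {result.skip_reason}")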