Coverage for src/local_deep_research/research_library/downloaders/openalex.py: 63%

88 statements  

coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

1""" 

2OpenAlex PDF Downloader 

3 

4Downloads PDFs from OpenAlex using their API to find open access PDFs. 

5OpenAlex aggregates open access information from multiple sources. 

6""" 

7 

8import re 

9from typing import Optional 

10from urllib.parse import urlparse 

11 

12import requests 

13from loguru import logger 

14 

15from .base import BaseDownloader, ContentType, DownloadResult 

16 

17 

18class OpenAlexDownloader(BaseDownloader): 

19 """Downloader for OpenAlex papers with open access PDF support.""" 

20 

21 def __init__( 

22 self, timeout: int = 30, polite_pool_email: Optional[str] = None 

23 ): 

24 """ 

25 Initialize OpenAlex downloader. 

26 

27 Args: 

28 timeout: Request timeout in seconds 

29 polite_pool_email: Optional email for polite pool (faster API access) 

30 """ 

31 super().__init__(timeout) 

32 self.polite_pool_email = polite_pool_email 

33 self.base_api_url = "https://api.openalex.org" 

34 

35 def can_handle(self, url: str) -> bool: 

36 """Check if URL is from OpenAlex.""" 

37 try: 

38 hostname = urlparse(url).hostname 

39 return bool( 

40 hostname 

41 and ( 

42 hostname == "openalex.org" 

43 or hostname.endswith(".openalex.org") 

44 ) 

45 ) 

46 except (ValueError, AttributeError, TypeError): 

47 return False 

48 

49 def download( 

50 self, url: str, content_type: ContentType = ContentType.PDF 

51 ) -> Optional[bytes]: 

52 """Download content from OpenAlex.""" 

53 result = self.download_with_result(url, content_type) 

54 return result.content if result.is_success else None 

55 

56 def download_with_result( 

57 self, url: str, content_type: ContentType = ContentType.PDF 

58 ) -> DownloadResult: 

59 """Download PDF and return detailed result with skip reason.""" 

60 # Only support PDF downloads for now 

61 if content_type != ContentType.PDF: 61 ↛ 62line 61 didn't jump to line 62 because the condition on line 61 was never true

62 return DownloadResult( 

63 skip_reason="Text extraction not yet supported for OpenAlex" 

64 ) 

65 

66 # Extract work ID from URL 

67 work_id = self._extract_work_id(url) 

68 if not work_id: 68 ↛ 69line 68 didn't jump to line 69 because the condition on line 68 was never true

69 return DownloadResult( 

70 skip_reason="Invalid OpenAlex URL - could not extract work ID" 

71 ) 

72 

73 logger.info(f"Looking up OpenAlex work: {work_id}") 

74 

75 # Get work details from API to find PDF URL 

76 pdf_url = self._get_pdf_url(work_id) 

77 

78 if not pdf_url: 

79 return DownloadResult( 

80 skip_reason="Not open access - no free PDF available" 

81 ) 

82 

83 # Download the PDF from the open access URL 

84 logger.info(f"Downloading open access PDF from: {pdf_url}") 

85 pdf_content = super()._download_pdf(pdf_url) 

86 

87 if pdf_content: 87 ↛ 88line 87 didn't jump to line 88 because the condition on line 87 was never true

88 return DownloadResult(content=pdf_content, is_success=True) 

89 else: 

90 return DownloadResult( 

91 skip_reason="Open access PDF URL found but download failed" 

92 ) 

93 

94 def _extract_work_id(self, url: str) -> Optional[str]: 

95 """ 

96 Extract OpenAlex work ID from URL. 

97 

98 Handles formats like: 

99 - https://openalex.org/W123456789 

100 - https://openalex.org/works/W123456789 

101 

102 Returns: 

103 Work ID (e.g., W123456789) or None if not found 

104 """ 

105 # Use urlparse for more robust URL handling (handles query strings, fragments) 

106 parsed = urlparse(url) 

107 if not parsed.netloc or "openalex.org" not in parsed.netloc: 

108 return None 

109 

110 # Extract work ID from path (W followed by digits) 

111 # Handles /works/W123 or /W123 

112 path = parsed.path 

113 match = re.search(r"(?:/works/)?(W\d+)", path) 

114 return match.group(1) if match else None 

115 

116 def _get_pdf_url(self, work_id: str) -> Optional[str]: 

117 """ 

118 Get open access PDF URL from OpenAlex API. 

119 

120 Args: 

121 work_id: OpenAlex work ID (e.g., W123456789) 

122 

123 Returns: 

124 PDF URL if available, None otherwise 

125 """ 

126 try: 

127 # Construct API request 

128 api_url = f"{self.base_api_url}/works/{work_id}" 

129 params = {"select": "id,open_access,best_oa_location"} 

130 

131 # Add polite pool email if available (gets faster API access) 

132 headers = {} 

133 if self.polite_pool_email: 133 ↛ 134line 133 didn't jump to line 134 because the condition on line 133 was never true

134 headers["User-Agent"] = f"mailto:{self.polite_pool_email}" 

135 

136 # Make API request 

137 response = self.session.get( 

138 api_url, params=params, headers=headers, timeout=self.timeout 

139 ) 

140 

141 if response.status_code == 200: 

142 data = response.json() 

143 

144 # Check if it's open access 

145 open_access_info = data.get("open_access", {}) 

146 is_oa = open_access_info.get("is_oa", False) 

147 

148 if not is_oa: 148 ↛ 149line 148 didn't jump to line 149 because the condition on line 148 was never true

149 logger.info(f"Work {work_id} is not open access") 

150 return None 

151 

152 # Get PDF URL from best open access location 

153 best_oa_location = data.get("best_oa_location", {}) 

154 if best_oa_location: 154 ↛ 193line 154 didn't jump to line 193 because the condition on line 154 was always true

155 # Try pdf_url first, fall back to landing_page_url 

156 pdf_url = best_oa_location.get("pdf_url") 

157 if pdf_url: 157 ↛ 164line 157 didn't jump to line 164 because the condition on line 157 was always true

158 logger.info( 

159 f"Found open access PDF for work {work_id}: {pdf_url}" 

160 ) 

161 return pdf_url 

162 

163 # Some works have landing page but no direct PDF 

164 landing_url = best_oa_location.get("landing_page_url") 

165 if landing_url: 

166 logger.info( 

167 f"Found landing page for work {work_id}: {landing_url}" 

168 ) 

169 # Validate that landing page is actually a PDF before returning 

170 try: 

171 head_response = self.session.head( 

172 landing_url, 

173 timeout=self.timeout, 

174 allow_redirects=True, 

175 ) 

176 content_type = head_response.headers.get( 

177 "Content-Type", "" 

178 ).lower() 

179 if "application/pdf" in content_type: 

180 logger.info( 

181 f"Landing page is a direct PDF link for work {work_id}" 

182 ) 

183 return landing_url 

184 else: 

185 logger.info( 

186 f"Landing page is not a PDF (Content-Type: {content_type}), skipping" 

187 ) 

188 except Exception: 

189 logger.exception( 

190 f"Failed to validate landing page URL for work {work_id}" 

191 ) 

192 

193 logger.info( 

194 f"No PDF URL available for open access work {work_id}" 

195 ) 

196 return None 

197 

198 elif response.status_code == 404: 198 ↛ 202line 198 didn't jump to line 202 because the condition on line 198 was always true

199 logger.warning(f"Work not found in OpenAlex: {work_id}") 

200 return None 

201 else: 

202 logger.warning(f"OpenAlex API error: {response.status_code}") 

203 return None 

204 

205 except requests.exceptions.RequestException: 

206 logger.exception("Failed to query OpenAlex API") 

207 return None 

208 except ValueError: 

209 # JSON decode errors are expected runtime errors 

210 logger.exception("Failed to parse OpenAlex API response") 

211 return None 

212 # Note: KeyError and TypeError are not caught - they indicate programming 

213 # bugs that should propagate for debugging
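
The class above is normally driven through can_handle() followed by download_with_result(). A minimal usage sketch follows; the import paths are assumed from the file path in the report header, and the work URL (W2741809807) is purely illustrative, not taken from the project's tests.

# Minimal usage sketch; module paths and the example work URL are assumptions.
from local_deep_research.research_library.downloaders.base import ContentType
from local_deep_research.research_library.downloaders.openalex import (
    OpenAlexDownloader,
)

downloader = OpenAlexDownloader(
    timeout=30,
    polite_pool_email="researcher@example.org",  # optional: joins the polite pool
)

url = "https://openalex.org/W2741809807"  # illustrative OpenAlex work URL
if downloader.can_handle(url):
    result = downloader.download_with_result(url, ContentType.PDF)
    if result.is_success:
        # Persist the PDF bytes returned by the downloader
        with open("paper.pdf", "wb") as fh:
            fh.write(result.content)
    else:
        print(f"Skipped: {result.skip_reason}")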