Coverage for src / local_deep_research / research_library / downloaders / openalex.py: 63%
88 statements
« prev ^ index » next coverage.py v7.12.0, created at 2026-01-11 00:51 +0000
« prev ^ index » next coverage.py v7.12.0, created at 2026-01-11 00:51 +0000
1"""
2OpenAlex PDF Downloader
4Downloads PDFs from OpenAlex using their API to find open access PDFs.
5OpenAlex aggregates open access information from multiple sources.
6"""
8import re
9from typing import Optional
10from urllib.parse import urlparse
12import requests
13from loguru import logger
15from .base import BaseDownloader, ContentType, DownloadResult
class OpenAlexDownloader(BaseDownloader):
    """Downloader for OpenAlex papers with open access PDF support."""

    def __init__(
        self, timeout: int = 30, polite_pool_email: Optional[str] = None
    ):
        """
        Initialize OpenAlex downloader.

        Args:
            timeout: Request timeout in seconds
            polite_pool_email: Optional email for polite pool (faster API access)
        """
        super().__init__(timeout)
        self.polite_pool_email = polite_pool_email
        self.base_api_url = "https://api.openalex.org"

    @staticmethod
    def _is_openalex_host(hostname: Optional[str]) -> bool:
        """Return True if hostname is exactly openalex.org or a subdomain of it.

        Strict comparison (not a substring test) so spoofed hosts such as
        "fakeopenalex.org" or "openalex.org.evil.com" are rejected.
        """
        return bool(
            hostname
            and (
                hostname == "openalex.org"
                or hostname.endswith(".openalex.org")
            )
        )

    def can_handle(self, url: str) -> bool:
        """Check if URL is from OpenAlex."""
        try:
            return self._is_openalex_host(urlparse(url).hostname)
        except (ValueError, AttributeError, TypeError):
            # Malformed/non-string URLs simply aren't handled by this downloader.
            return False

    def download(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> Optional[bytes]:
        """Download content from OpenAlex."""
        result = self.download_with_result(url, content_type)
        return result.content if result.is_success else None

    def download_with_result(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> DownloadResult:
        """Download PDF and return detailed result with skip reason."""
        # Only support PDF downloads for now
        if content_type != ContentType.PDF:
            return DownloadResult(
                skip_reason="Text extraction not yet supported for OpenAlex"
            )

        # Extract work ID from URL
        work_id = self._extract_work_id(url)
        if not work_id:
            return DownloadResult(
                skip_reason="Invalid OpenAlex URL - could not extract work ID"
            )

        logger.info(f"Looking up OpenAlex work: {work_id}")

        # Get work details from API to find PDF URL
        pdf_url = self._get_pdf_url(work_id)

        if not pdf_url:
            return DownloadResult(
                skip_reason="Not open access - no free PDF available"
            )

        # Download the PDF from the open access URL
        logger.info(f"Downloading open access PDF from: {pdf_url}")
        pdf_content = super()._download_pdf(pdf_url)

        if pdf_content:
            return DownloadResult(content=pdf_content, is_success=True)
        else:
            return DownloadResult(
                skip_reason="Open access PDF URL found but download failed"
            )

    def _extract_work_id(self, url: str) -> Optional[str]:
        """
        Extract OpenAlex work ID from URL.

        Handles formats like:
        - https://openalex.org/W123456789
        - https://openalex.org/works/W123456789

        Returns:
            Work ID (e.g., W123456789) or None if not found
        """
        # Use urlparse for more robust URL handling (handles query strings, fragments)
        parsed = urlparse(url)
        # Validate the host with the same strict check can_handle() uses; a
        # substring test on netloc would accept spoofed hosts.
        if not self._is_openalex_host(parsed.hostname):
            return None

        # Extract work ID from path (W followed by digits)
        # Handles /works/W123 or /W123
        match = re.search(r"(?:/works/)?(W\d+)", parsed.path)
        return match.group(1) if match else None

    def _get_pdf_url(self, work_id: str) -> Optional[str]:
        """
        Get open access PDF URL from OpenAlex API.

        Args:
            work_id: OpenAlex work ID (e.g., W123456789)

        Returns:
            PDF URL if available, None otherwise
        """
        try:
            # Construct API request
            api_url = f"{self.base_api_url}/works/{work_id}"
            params = {"select": "id,open_access,best_oa_location"}

            # Add polite pool email if available (gets faster API access)
            headers = {}
            if self.polite_pool_email:
                headers["User-Agent"] = f"mailto:{self.polite_pool_email}"

            # Make API request
            response = self.session.get(
                api_url, params=params, headers=headers, timeout=self.timeout
            )

            if response.status_code == 200:
                data = response.json()

                # Check if it's open access. "or {}" guards against an
                # explicit JSON null, which .get(..., {}) would pass through.
                open_access_info = data.get("open_access") or {}
                is_oa = open_access_info.get("is_oa", False)

                if not is_oa:
                    logger.info(f"Work {work_id} is not open access")
                    return None

                # Get PDF URL from best open access location
                best_oa_location = data.get("best_oa_location") or {}
                if best_oa_location:
                    # Try pdf_url first, fall back to landing_page_url
                    pdf_url = best_oa_location.get("pdf_url")
                    if pdf_url:
                        logger.info(
                            f"Found open access PDF for work {work_id}: {pdf_url}"
                        )
                        return pdf_url

                    # Some works have landing page but no direct PDF
                    landing_url = best_oa_location.get("landing_page_url")
                    if landing_url:
                        logger.info(
                            f"Found landing page for work {work_id}: {landing_url}"
                        )
                        # Validate that landing page is actually a PDF before returning
                        try:
                            head_response = self.session.head(
                                landing_url,
                                timeout=self.timeout,
                                allow_redirects=True,
                            )
                            content_type = head_response.headers.get(
                                "Content-Type", ""
                            ).lower()
                            if "application/pdf" in content_type:
                                logger.info(
                                    f"Landing page is a direct PDF link for work {work_id}"
                                )
                                return landing_url
                            else:
                                logger.info(
                                    f"Landing page is not a PDF (Content-Type: {content_type}), skipping"
                                )
                        except Exception:
                            # Best-effort validation: log and fall through to
                            # "no PDF available" rather than failing the lookup.
                            logger.exception(
                                f"Failed to validate landing page URL for work {work_id}"
                            )

                logger.info(
                    f"No PDF URL available for open access work {work_id}"
                )
                return None

            elif response.status_code == 404:
                logger.warning(f"Work not found in OpenAlex: {work_id}")
                return None
            else:
                logger.warning(f"OpenAlex API error: {response.status_code}")
                return None

        except requests.exceptions.RequestException:
            logger.exception("Failed to query OpenAlex API")
            return None
        except ValueError:
            # JSON decode errors are expected runtime errors
            logger.exception("Failed to parse OpenAlex API response")
            return None
        # Note: KeyError and TypeError are not caught - they indicate programming
        # bugs that should propagate for debugging