Coverage for src / local_deep_research / research_library / downloaders / openalex.py: 97%
88 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
1"""
2OpenAlex PDF Downloader
4Downloads PDFs from OpenAlex using their API to find open access PDFs.
5OpenAlex aggregates open access information from multiple sources.
6"""
8import re
9from typing import Optional
10from urllib.parse import urlparse
12import requests
13from loguru import logger
15from .base import BaseDownloader, ContentType, DownloadResult
class OpenAlexDownloader(BaseDownloader):
    """Downloader for OpenAlex papers with open access PDF support.

    Resolves an OpenAlex work URL to its work ID, queries the OpenAlex
    works API for the best open access location, and downloads the PDF
    from that location when one is available.
    """

    def __init__(
        self, timeout: int = 30, polite_pool_email: Optional[str] = None
    ):
        """
        Initialize OpenAlex downloader.

        Args:
            timeout: Request timeout in seconds
            polite_pool_email: Optional email for polite pool (faster API access)
        """
        super().__init__(timeout)
        self.polite_pool_email = polite_pool_email
        self.base_api_url = "https://api.openalex.org"

    def can_handle(self, url: str) -> bool:
        """Check if URL is from OpenAlex (openalex.org or any subdomain)."""
        try:
            hostname = urlparse(url).hostname
            return bool(
                hostname
                and (
                    hostname == "openalex.org"
                    or hostname.endswith(".openalex.org")
                )
            )
        except (ValueError, AttributeError, TypeError):
            # Malformed or non-string URLs simply can't be handled.
            return False

    def download(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> Optional[bytes]:
        """Download content from OpenAlex.

        Returns:
            Raw PDF bytes on success, None on failure or skip.
        """
        result = self.download_with_result(url, content_type)
        return result.content if result.is_success else None

    def download_with_result(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> DownloadResult:
        """Download PDF and return detailed result with skip reason."""
        # Only support PDF downloads for now
        if content_type != ContentType.PDF:
            return DownloadResult(
                skip_reason="Text extraction not yet supported for OpenAlex"
            )

        # Extract work ID from URL
        work_id = self._extract_work_id(url)
        if not work_id:
            return DownloadResult(
                skip_reason="Invalid OpenAlex URL - could not extract work ID"
            )

        logger.info(f"Looking up OpenAlex work: {work_id}")

        # Get work details from API to find PDF URL
        pdf_url = self._get_pdf_url(work_id)

        if not pdf_url:
            return DownloadResult(
                skip_reason="Not open access - no free PDF available"
            )

        # Download the PDF from the open access URL
        logger.info(f"Downloading open access PDF from: {pdf_url}")
        pdf_content = super()._download_pdf(pdf_url)

        if pdf_content:
            return DownloadResult(content=pdf_content, is_success=True)
        return DownloadResult(
            skip_reason="Open access PDF URL found but download failed"
        )

    def _extract_work_id(self, url: str) -> Optional[str]:
        """
        Extract OpenAlex work ID from URL.

        Handles formats like:
        - https://openalex.org/W123456789
        - https://openalex.org/works/W123456789

        Returns:
            Work ID (e.g., W123456789) or None if not found
        """
        # Use urlparse for more robust URL handling (handles query strings, fragments)
        parsed = urlparse(url)
        if not parsed.netloc or "openalex.org" not in parsed.netloc:
            return None

        # Extract work ID from path (W followed by digits)
        # Handles /works/W123 or /W123
        path = parsed.path
        match = re.search(r"(?:/works/)?(W\d+)", path)
        return match.group(1) if match else None

    def _get_pdf_url(self, work_id: str) -> Optional[str]:
        """
        Get open access PDF URL from OpenAlex API.

        Args:
            work_id: OpenAlex work ID (e.g., W123456789)

        Returns:
            PDF URL if available, None otherwise
        """
        try:
            # Construct API request
            api_url = f"{self.base_api_url}/works/{work_id}"
            params = {"select": "id,open_access,best_oa_location"}

            # Add polite pool email if available (gets faster API access)
            headers = {}
            if self.polite_pool_email:
                headers["User-Agent"] = f"mailto:{self.polite_pool_email}"

            # Make API request
            response = self.session.get(
                api_url, params=params, headers=headers, timeout=self.timeout
            )

            if response.status_code == 200:
                data = response.json()

                # Check if it's open access. Coalesce an explicit JSON null
                # to {} before calling .get() - data.get("open_access", {})
                # returns None (not {}) when the key is present with a null
                # value, which would raise AttributeError below.
                open_access_info = data.get("open_access") or {}
                is_oa = open_access_info.get("is_oa", False)

                if not is_oa:
                    logger.info(f"Work {work_id} is not open access")
                    return None

                # Get PDF URL from best open access location (null-safe, as above)
                best_oa_location = data.get("best_oa_location") or {}
                if best_oa_location:
                    # Try pdf_url first, fall back to landing_page_url
                    pdf_url = best_oa_location.get("pdf_url")
                    if pdf_url:
                        logger.info(
                            f"Found open access PDF for work {work_id}: {pdf_url}"
                        )
                        return str(pdf_url)

                    # Some works have landing page but no direct PDF
                    landing_url = best_oa_location.get("landing_page_url")
                    if landing_url:
                        logger.info(
                            f"Found landing page for work {work_id}: {landing_url}"
                        )
                        # Validate that landing page is actually a PDF before returning
                        try:
                            head_response = self.session.head(
                                landing_url,
                                timeout=self.timeout,
                                allow_redirects=True,
                            )
                            content_type = head_response.headers.get(
                                "Content-Type", ""
                            ).lower()
                            if "application/pdf" in content_type:
                                logger.info(
                                    f"Landing page is a direct PDF link for work {work_id}"
                                )
                                return str(landing_url)
                            logger.info(
                                f"Landing page is not a PDF (Content-Type: {content_type}), skipping"
                            )
                        except Exception:
                            logger.exception(
                                f"Failed to validate landing page URL for work {work_id}"
                            )

                logger.info(
                    f"No PDF URL available for open access work {work_id}"
                )
                return None

            if response.status_code == 404:
                logger.warning(f"Work not found in OpenAlex: {work_id}")
                return None
            logger.warning(f"OpenAlex API error: {response.status_code}")
            return None

        except requests.exceptions.RequestException:
            logger.exception("Failed to query OpenAlex API")
            return None
        except ValueError:
            # JSON decode errors are expected runtime errors
            logger.exception("Failed to parse OpenAlex API response")
            return None
        # Note: KeyError and TypeError are not caught - they indicate programming
        # bugs that should propagate for debugging