Coverage for src / local_deep_research / research_library / downloaders / biorxiv.py: 99%
94 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
"""
bioRxiv/medRxiv PDF and Text Downloader
"""
import html
import re
from typing import Dict, Optional
from urllib.parse import urlparse

import requests
from loguru import logger

from .base import BaseDownloader, ContentType, DownloadResult
class BioRxivDownloader(BaseDownloader):
    """Downloader for bioRxiv and medRxiv preprints.

    Handles both PDF retrieval (by rewriting article URLs to their
    ``.full.pdf`` form) and text retrieval (by scraping Dublin Core
    ``<meta>`` tags from the article page, falling back to PDF text
    extraction).
    """

    def can_handle(self, url: str) -> bool:
        """Return True if *url* points at bioRxiv or medRxiv.

        Matches the bare domains and any subdomain (e.g. ``www.biorxiv.org``).
        Any parsing failure is treated as "cannot handle".
        """
        try:
            hostname = urlparse(url).hostname
            if not hostname:
                return False
            return (
                hostname == "biorxiv.org"
                or hostname.endswith(".biorxiv.org")
                or hostname == "medrxiv.org"
                or hostname.endswith(".medrxiv.org")
            )
        except Exception:
            return False

    def download(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> Optional[bytes]:
        """Download content from bioRxiv/medRxiv.

        Args:
            url: Article URL.
            content_type: ``ContentType.TEXT`` for extracted text,
                otherwise the PDF bytes.

        Returns:
            Raw bytes of the requested content, or ``None`` on failure.
        """
        if content_type == ContentType.TEXT:
            return self._download_text(url)
        return self._download_pdf(url)

    def download_with_result(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> DownloadResult:
        """Download content and return a detailed result with skip reason.

        For TEXT requests, tries the HTML page first, then falls back to
        extracting text from the PDF. For PDF requests, distinguishes
        between a missing article (404), a server outage (5xx), and a
        generic failure via a HEAD probe of the original URL.
        """
        if content_type == ContentType.TEXT:
            # Try to get text from the article's HTML page
            text = self._fetch_abstract_from_page(url)
            if text:
                return DownloadResult(
                    content=text.encode("utf-8"), is_success=True
                )

            # Fallback to PDF extraction
            pdf_content = self._download_pdf(url)
            if pdf_content:
                extracted_text = self.extract_text_from_pdf(pdf_content)
                if extracted_text:
                    return DownloadResult(
                        content=extracted_text.encode("utf-8"), is_success=True
                    )

            return DownloadResult(
                skip_reason="Could not extract text from bioRxiv/medRxiv article"
            )

        # Try to download PDF
        pdf_url = self._convert_to_pdf_url(url)
        if not pdf_url:
            return DownloadResult(
                skip_reason="Invalid bioRxiv/medRxiv URL format"
            )

        logger.info(f"Downloading bioRxiv/medRxiv PDF from {pdf_url}")
        pdf_content = super()._download_pdf(pdf_url)

        if pdf_content:
            return DownloadResult(content=pdf_content, is_success=True)

        # Check if it's a server issue or the article doesn't exist
        try:
            response = self.session.head(url, timeout=5)
            if response.status_code == 404:
                return DownloadResult(
                    skip_reason="Article not found on bioRxiv/medRxiv",
                    status_code=404,
                )
            if response.status_code >= 500:
                return DownloadResult(
                    skip_reason="bioRxiv/medRxiv server temporarily unavailable",
                    status_code=response.status_code,
                )
        except requests.RequestException:
            logger.debug("Failed to check bioRxiv/medRxiv URL: {}", url)
        return DownloadResult(
            skip_reason="Failed to download PDF from bioRxiv/medRxiv"
        )

    def _download_pdf(
        self, url: str, headers: Optional[Dict[str, str]] = None
    ) -> Optional[bytes]:
        """Download a PDF from bioRxiv/medRxiv.

        Rewrites *url* to its ``.full.pdf`` form before delegating to the
        base-class downloader.
        """
        pdf_url = self._convert_to_pdf_url(url)

        if not pdf_url:
            logger.error(f"Could not convert to PDF URL: {url}")
            return None

        logger.info(f"Downloading bioRxiv/medRxiv PDF from {pdf_url}")
        return super()._download_pdf(pdf_url)

    def _download_text(self, url: str) -> Optional[bytes]:
        """Get UTF-8 text content from bioRxiv/medRxiv.

        Prefers the abstract/metadata scraped from the HTML page; falls
        back to extracting text from the downloaded PDF.
        """
        text = self._fetch_abstract_from_page(url)
        if text:
            return text.encode("utf-8")

        # Fallback: download the PDF and extract its text
        pdf_content = self._download_pdf(url)
        if pdf_content:
            extracted_text = self.extract_text_from_pdf(pdf_content)
            if extracted_text:
                return extracted_text.encode("utf-8")

        return None

    def _convert_to_pdf_url(self, url: str) -> Optional[str]:
        """Convert a bioRxiv/medRxiv article URL to its PDF URL.

        Example:
            https://www.biorxiv.org/content/10.1101/2024.01.01.123456v1
            -> https://www.biorxiv.org/content/10.1101/2024.01.01.123456v1.full.pdf
        """
        # Remove any existing .full or .full.pdf suffix so we don't double it
        base_url = re.sub(r"\.full(\.pdf)?$", "", url)

        # Already a direct PDF link (e.g. supplementary .pdf) — keep as-is
        if base_url.endswith(".pdf"):
            return base_url

        pdf_url = base_url.rstrip("/") + ".full.pdf"

        # Normalize content/early URLs to the canonical content path
        return pdf_url.replace("/content/early/", "/content/")

    def _fetch_abstract_from_page(self, url: str) -> Optional[str]:
        """Fetch title, authors, and abstract from a bioRxiv/medRxiv page.

        Parses the Dublin Core ``<meta>`` tags with regex (avoiding a
        BeautifulSoup dependency) and returns a plain-text summary, or
        ``None`` when nothing could be extracted.
        """
        try:
            response = self.session.get(url, timeout=10)

            if response.status_code == 200:
                # Simple extraction using regex (avoiding BeautifulSoup dependency)
                page = response.text
                text_parts = []

                # Extract title
                title_match = re.search(
                    r'<meta\s+name="DC\.Title"\s+content="([^"]+)"', page
                )
                if title_match:
                    text_parts.append(
                        f"Title: {html.unescape(title_match.group(1))}"
                    )

                # Extract authors
                author_match = re.search(
                    r'<meta\s+name="DC\.Creator"\s+content="([^"]+)"', page
                )
                if author_match:
                    text_parts.append(
                        f"Authors: {html.unescape(author_match.group(1))}"
                    )

                # Extract abstract
                abstract_match = re.search(
                    r'<meta\s+name="DC\.Description"\s+content="([^"]+)"',
                    page,
                    re.DOTALL,
                )
                if abstract_match:
                    # Decode HTML entities (&lt; &gt; &quot; &#39; &amp; ...)
                    # in one pass; the previous chained .replace() calls had
                    # been corrupted into no-ops.
                    abstract = html.unescape(abstract_match.group(1))
                    text_parts.append(f"\nAbstract:\n{abstract}")

                if text_parts:
                    logger.info(
                        "Retrieved text content from bioRxiv/medRxiv page"
                    )
                    return "\n".join(text_parts)

        except Exception as e:
            logger.debug(f"Failed to fetch abstract from bioRxiv/medRxiv: {e}")

        return None