Coverage for src / local_deep_research / research_library / downloaders / biorxiv.py: 54%
95 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-25 01:07 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-25 01:07 +0000
1"""
2bioRxiv/medRxiv PDF and Text Downloader
3"""
import html
import re
from typing import Optional
from urllib.parse import urlparse

import requests
from loguru import logger

from .base import BaseDownloader, ContentType, DownloadResult
class BioRxivDownloader(BaseDownloader):
    """Downloader for bioRxiv and medRxiv preprints.

    PDFs are fetched by rewriting the article URL to its ``.full.pdf``
    form; text is fetched by scraping the ``DC.*`` meta tags from the
    article's HTML page, falling back to PDF text extraction.
    """

    def can_handle(self, url: str) -> bool:
        """Check if URL is from bioRxiv or medRxiv (including subdomains)."""
        try:
            hostname = urlparse(url).hostname
            if not hostname:
                return False
            return (
                hostname == "biorxiv.org"
                or hostname.endswith(".biorxiv.org")
                or hostname == "medrxiv.org"
                or hostname.endswith(".medrxiv.org")
            )
        except Exception:
            # Unparseable URLs are simply not handled by this downloader.
            return False

    def download(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> Optional[bytes]:
        """Download content from bioRxiv/medRxiv.

        Returns UTF-8 encoded text for ``ContentType.TEXT``, raw PDF
        bytes otherwise; ``None`` on failure.
        """
        if content_type == ContentType.TEXT:
            return self._download_text(url)
        return self._download_pdf(url)

    def download_with_result(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> DownloadResult:
        """Download content and return detailed result with skip reason."""
        if content_type == ContentType.TEXT:
            # Prefer the cheap HTML-page scrape over full PDF extraction.
            text = self._fetch_abstract_from_page(url)
            if text:
                return DownloadResult(
                    content=text.encode("utf-8"), is_success=True
                )

            # Fallback to PDF extraction
            pdf_content = self._download_pdf(url)
            if pdf_content:
                extracted_text = self.extract_text_from_pdf(pdf_content)
                if extracted_text:
                    return DownloadResult(
                        content=extracted_text.encode("utf-8"), is_success=True
                    )

            return DownloadResult(
                skip_reason="Could not extract text from bioRxiv/medRxiv article"
            )

        # Try to download PDF
        pdf_url = self._convert_to_pdf_url(url)
        if not pdf_url:
            return DownloadResult(
                skip_reason="Invalid bioRxiv/medRxiv URL format"
            )

        logger.info(f"Downloading bioRxiv/medRxiv PDF from {pdf_url}")
        pdf_content = super()._download_pdf(pdf_url)

        if pdf_content:
            return DownloadResult(content=pdf_content, is_success=True)

        # Distinguish a missing article from a server outage so the caller
        # gets an actionable skip reason.
        try:
            response = self.session.head(url, timeout=5)
            if response.status_code == 404:
                return DownloadResult(
                    skip_reason="Article not found on bioRxiv/medRxiv"
                )
            elif response.status_code >= 500:
                return DownloadResult(
                    skip_reason="bioRxiv/medRxiv server temporarily unavailable"
                )
        except requests.RequestException:
            # BUG FIX: loguru formats with {}-style placeholders, not
            # printf-style "%s" — the old call logged a literal "%s" and
            # silently dropped the URL.
            logger.debug(f"Failed to check bioRxiv/medRxiv URL: {url}")
        return DownloadResult(
            skip_reason="Failed to download PDF from bioRxiv/medRxiv"
        )

    def _download_pdf(self, url: str) -> Optional[bytes]:
        """Download PDF from bioRxiv/medRxiv, converting the URL first."""
        pdf_url = self._convert_to_pdf_url(url)

        if not pdf_url:
            logger.error(f"Could not convert to PDF URL: {url}")
            return None

        logger.info(f"Downloading bioRxiv/medRxiv PDF from {pdf_url}")
        return super()._download_pdf(pdf_url)

    def _download_text(self, url: str) -> Optional[bytes]:
        """Get UTF-8 text content from bioRxiv/medRxiv.

        Tries the HTML page's abstract/metadata first, then falls back to
        downloading the PDF and extracting its text.
        """
        text = self._fetch_abstract_from_page(url)
        if text:
            return text.encode("utf-8")

        pdf_content = self._download_pdf(url)
        if pdf_content:
            extracted_text = self.extract_text_from_pdf(pdf_content)
            if extracted_text:
                return extracted_text.encode("utf-8")

        return None

    def _convert_to_pdf_url(self, url: str) -> Optional[str]:
        """Convert a bioRxiv/medRxiv article URL to its PDF URL.

        Example:
            https://www.biorxiv.org/content/10.1101/2024.01.01.123456v1
            becomes
            https://www.biorxiv.org/content/10.1101/2024.01.01.123456v1.full.pdf
        """
        # Normalize away any existing .full / .full.pdf suffix first so the
        # rewrite below is idempotent.
        base_url = re.sub(r"\.full(\.pdf)?$", "", url)

        # Already a direct PDF link — keep it as-is.
        if base_url.endswith(".pdf"):
            return base_url

        pdf_url = base_url.rstrip("/") + ".full.pdf"

        # "content/early" URLs redirect; use the canonical "content" path.
        pdf_url = pdf_url.replace("/content/early/", "/content/")

        return pdf_url

    def _fetch_abstract_from_page(self, url: str) -> Optional[str]:
        """Fetch abstract and metadata from a bioRxiv/medRxiv HTML page."""
        try:
            # Request the HTML page
            response = self.session.get(url, timeout=10)

            if response.status_code == 200:
                # Simple extraction using regex (avoiding a BeautifulSoup
                # dependency). Named "page_html" to avoid shadowing the
                # stdlib html module used below.
                page_html = response.text
                text_parts = []

                # Extract title
                title_match = re.search(
                    r'<meta\s+name="DC\.Title"\s+content="([^"]+)"',
                    page_html,
                )
                if title_match:
                    text_parts.append(f"Title: {title_match.group(1)}")

                # Extract authors
                author_match = re.search(
                    r'<meta\s+name="DC\.Creator"\s+content="([^"]+)"',
                    page_html,
                )
                if author_match:
                    text_parts.append(f"Authors: {author_match.group(1)}")

                # Extract abstract
                abstract_match = re.search(
                    r'<meta\s+name="DC\.Description"\s+content="([^"]+)"',
                    page_html,
                    re.DOTALL,
                )
                if abstract_match:
                    # BUG FIX: the meta content is HTML-escaped; decode all
                    # entities (&lt; &gt; &quot; &#39; &amp; and numeric
                    # references) in one correct pass with html.unescape
                    # instead of a chain of .replace() calls.
                    abstract = html.unescape(abstract_match.group(1))
                    text_parts.append(f"\nAbstract:\n{abstract}")

                if text_parts:
                    logger.info(
                        "Retrieved text content from bioRxiv/medRxiv page"
                    )
                    return "\n".join(text_parts)

        except Exception as e:
            logger.debug(f"Failed to fetch abstract from bioRxiv/medRxiv: {e}")

        return None