Coverage for src / local_deep_research / research_library / downloaders / biorxiv.py: 54%
94 statements
« prev ^ index » next coverage.py v7.12.0, created at 2026-01-11 00:51 +0000
1"""
2bioRxiv/medRxiv PDF and Text Downloader
3"""
import html
import re
from typing import Optional
from urllib.parse import urlparse

from loguru import logger

from .base import BaseDownloader, ContentType, DownloadResult
class BioRxivDownloader(BaseDownloader):
    """Downloader for bioRxiv and medRxiv preprints.

    Recognizes article URLs on ``biorxiv.org`` / ``medrxiv.org`` (and their
    subdomains), rewrites them to ``.full.pdf`` URLs for PDF downloads, and
    scrapes Dublin Core ``<meta>`` tags from the article page for text
    extraction, falling back to PDF text extraction when scraping fails.
    """
16 def can_handle(self, url: str) -> bool:
17 """Check if URL is from bioRxiv or medRxiv."""
18 try:
19 hostname = urlparse(url).hostname
20 if not hostname:
21 return False
22 return (
23 hostname == "biorxiv.org"
24 or hostname.endswith(".biorxiv.org")
25 or hostname == "medrxiv.org"
26 or hostname.endswith(".medrxiv.org")
27 )
28 except Exception:
29 return False
31 def download(
32 self, url: str, content_type: ContentType = ContentType.PDF
33 ) -> Optional[bytes]:
34 """Download content from bioRxiv/medRxiv."""
35 if content_type == ContentType.TEXT: 35 ↛ 36line 35 didn't jump to line 36 because the condition on line 35 was never true
36 return self._download_text(url)
37 else:
38 return self._download_pdf(url)
40 def download_with_result(
41 self, url: str, content_type: ContentType = ContentType.PDF
42 ) -> DownloadResult:
43 """Download content and return detailed result with skip reason."""
44 if content_type == ContentType.TEXT: 44 ↛ 46line 44 didn't jump to line 46 because the condition on line 44 was never true
45 # Try to get text from page
46 text = self._fetch_abstract_from_page(url)
47 if text:
48 return DownloadResult(
49 content=text.encode("utf-8"), is_success=True
50 )
52 # Fallback to PDF extraction
53 pdf_content = self._download_pdf(url)
54 if pdf_content:
55 extracted_text = self.extract_text_from_pdf(pdf_content)
56 if extracted_text:
57 return DownloadResult(
58 content=extracted_text.encode("utf-8"), is_success=True
59 )
61 return DownloadResult(
62 skip_reason="Could not extract text from bioRxiv/medRxiv article"
63 )
64 else:
65 # Try to download PDF
66 pdf_url = self._convert_to_pdf_url(url)
67 if not pdf_url:
68 return DownloadResult(
69 skip_reason="Invalid bioRxiv/medRxiv URL format"
70 )
72 logger.info(f"Downloading bioRxiv/medRxiv PDF from {pdf_url}")
73 pdf_content = super()._download_pdf(pdf_url)
75 if pdf_content: 75 ↛ 76line 75 didn't jump to line 76 because the condition on line 75 was never true
76 return DownloadResult(content=pdf_content, is_success=True)
77 else:
78 # Check if it's a server issue or article doesn't exist
79 try:
80 response = self.session.head(url, timeout=5)
81 if response.status_code == 404: 81 ↛ 82line 81 didn't jump to line 82 because the condition on line 81 was never true
82 return DownloadResult(
83 skip_reason="Article not found on bioRxiv/medRxiv"
84 )
85 elif response.status_code >= 500: 85 ↛ 86line 85 didn't jump to line 86 because the condition on line 85 was never true
86 return DownloadResult(
87 skip_reason="bioRxiv/medRxiv server temporarily unavailable"
88 )
89 except:
90 pass
91 return DownloadResult(
92 skip_reason="Failed to download PDF from bioRxiv/medRxiv"
93 )
95 def _download_pdf(self, url: str) -> Optional[bytes]:
96 """Download PDF from bioRxiv/medRxiv."""
97 # Convert URL to PDF format
98 pdf_url = self._convert_to_pdf_url(url)
100 if not pdf_url: 100 ↛ 101line 100 didn't jump to line 101 because the condition on line 100 was never true
101 logger.error(f"Could not convert to PDF URL: {url}")
102 return None
104 logger.info(f"Downloading bioRxiv/medRxiv PDF from {pdf_url}")
105 return super()._download_pdf(pdf_url)
107 def _download_text(self, url: str) -> Optional[bytes]:
108 """Get text content from bioRxiv/medRxiv."""
109 # Try to get abstract and metadata from the HTML page
110 text = self._fetch_abstract_from_page(url)
111 if text:
112 return text.encode("utf-8")
114 # Fallback: Download PDF and extract text
115 pdf_content = self._download_pdf(url)
116 if pdf_content:
117 extracted_text = self.extract_text_from_pdf(pdf_content)
118 if extracted_text:
119 return extracted_text.encode("utf-8")
121 return None
123 def _convert_to_pdf_url(self, url: str) -> Optional[str]:
124 """Convert bioRxiv/medRxiv URL to PDF URL."""
125 # Handle different URL patterns
126 # Example: https://www.biorxiv.org/content/10.1101/2024.01.01.123456v1
127 # Becomes: https://www.biorxiv.org/content/10.1101/2024.01.01.123456v1.full.pdf
129 # Remove any existing .full or .full.pdf
130 base_url = re.sub(r"\.full(\.pdf)?$", "", url)
132 # Check if it's already a PDF URL
133 if base_url.endswith(".pdf"): 133 ↛ 134line 133 didn't jump to line 134 because the condition on line 133 was never true
134 return base_url
136 # Add .full.pdf
137 pdf_url = base_url.rstrip("/") + ".full.pdf"
139 # Handle content vs. content/early URLs
140 pdf_url = pdf_url.replace("/content/early/", "/content/")
142 return pdf_url
144 def _fetch_abstract_from_page(self, url: str) -> Optional[str]:
145 """Fetch abstract and metadata from bioRxiv/medRxiv page."""
146 try:
147 # Request the HTML page
148 response = self.session.get(url, timeout=10)
150 if response.status_code == 200: 150 ↛ 196line 150 didn't jump to line 196 because the condition on line 150 was always true
151 # Simple extraction using regex (avoiding BeautifulSoup dependency)
152 html = response.text
153 text_parts = []
155 # Extract title
156 title_match = re.search(
157 r'<meta\s+name="DC\.Title"\s+content="([^"]+)"', html
158 )
159 if title_match: 159 ↛ 160line 159 didn't jump to line 160 because the condition on line 159 was never true
160 text_parts.append(f"Title: {title_match.group(1)}")
162 # Extract authors
163 author_match = re.search(
164 r'<meta\s+name="DC\.Creator"\s+content="([^"]+)"', html
165 )
166 if author_match: 166 ↛ 167line 166 didn't jump to line 167 because the condition on line 166 was never true
167 text_parts.append(f"Authors: {author_match.group(1)}")
169 # Extract abstract
170 abstract_match = re.search(
171 r'<meta\s+name="DC\.Description"\s+content="([^"]+)"',
172 html,
173 re.DOTALL,
174 )
175 if abstract_match: 175 ↛ 176line 175 didn't jump to line 176 because the condition on line 175 was never true
176 abstract = abstract_match.group(1)
177 # Clean up HTML entities
178 abstract = abstract.replace("<", "<").replace(
179 ">", ">"
180 )
181 abstract = abstract.replace(""", '"').replace(
182 "'", "'"
183 )
184 abstract = abstract.replace("&", "&")
185 text_parts.append(f"\nAbstract:\n{abstract}")
187 if text_parts: 187 ↛ 188line 187 didn't jump to line 188 because the condition on line 187 was never true
188 logger.info(
189 "Retrieved text content from bioRxiv/medRxiv page"
190 )
191 return "\n".join(text_parts)
193 except Exception as e:
194 logger.debug(f"Failed to fetch abstract from bioRxiv/medRxiv: {e}")
196 return None