Coverage for src / local_deep_research / research_library / downloaders / biorxiv.py: 99%

94 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1""" 

2bioRxiv/medRxiv PDF and Text Downloader 

3""" 

4 

import html
import re
from typing import Dict, Optional
from urllib.parse import urlparse

import requests
from loguru import logger

from .base import BaseDownloader, ContentType, DownloadResult

13 

14 

class BioRxivDownloader(BaseDownloader):
    """Downloader for bioRxiv and medRxiv preprints."""

    def can_handle(self, url: str) -> bool:
        """Return True when *url* is hosted on biorxiv.org or medrxiv.org."""
        try:
            host = urlparse(url).hostname
        except Exception:
            # Unparseable input is simply not ours to handle.
            return False
        if host is None:
            return False
        # urlparse lowercases the hostname, so plain comparisons suffice.
        # Accept the bare domains and any subdomain (e.g. "www.").
        return host in ("biorxiv.org", "medrxiv.org") or host.endswith(
            (".biorxiv.org", ".medrxiv.org")
        )

32 

33 def download( 

34 self, url: str, content_type: ContentType = ContentType.PDF 

35 ) -> Optional[bytes]: 

36 """Download content from bioRxiv/medRxiv.""" 

37 if content_type == ContentType.TEXT: 

38 return self._download_text(url) 

39 return self._download_pdf(url) 

40 

41 def download_with_result( 

42 self, url: str, content_type: ContentType = ContentType.PDF 

43 ) -> DownloadResult: 

44 """Download content and return detailed result with skip reason.""" 

45 if content_type == ContentType.TEXT: 

46 # Try to get text from page 

47 text = self._fetch_abstract_from_page(url) 

48 if text: 

49 return DownloadResult( 

50 content=text.encode("utf-8"), is_success=True 

51 ) 

52 

53 # Fallback to PDF extraction 

54 pdf_content = self._download_pdf(url) 

55 if pdf_content: 

56 extracted_text = self.extract_text_from_pdf(pdf_content) 

57 if extracted_text: 

58 return DownloadResult( 

59 content=extracted_text.encode("utf-8"), is_success=True 

60 ) 

61 

62 return DownloadResult( 

63 skip_reason="Could not extract text from bioRxiv/medRxiv article" 

64 ) 

65 # Try to download PDF 

66 pdf_url = self._convert_to_pdf_url(url) 

67 if not pdf_url: 

68 return DownloadResult( 

69 skip_reason="Invalid bioRxiv/medRxiv URL format" 

70 ) 

71 

72 logger.info(f"Downloading bioRxiv/medRxiv PDF from {pdf_url}") 

73 pdf_content = super()._download_pdf(pdf_url) 

74 

75 if pdf_content: 

76 return DownloadResult(content=pdf_content, is_success=True) 

77 # Check if it's a server issue or article doesn't exist 

78 try: 

79 response = self.session.head(url, timeout=5) 

80 if response.status_code == 404: 

81 return DownloadResult( 

82 skip_reason="Article not found on bioRxiv/medRxiv", 

83 status_code=404, 

84 ) 

85 if response.status_code >= 500: 

86 return DownloadResult( 

87 skip_reason="bioRxiv/medRxiv server temporarily unavailable", 

88 status_code=response.status_code, 

89 ) 

90 except requests.RequestException: 

91 logger.debug("Failed to check bioRxiv/medRxiv URL: {}", url) 

92 return DownloadResult( 

93 skip_reason="Failed to download PDF from bioRxiv/medRxiv" 

94 ) 

95 

96 def _download_pdf( 

97 self, url: str, headers: Optional[Dict[str, str]] = None 

98 ) -> Optional[bytes]: 

99 """Download PDF from bioRxiv/medRxiv.""" 

100 # Convert URL to PDF format 

101 pdf_url = self._convert_to_pdf_url(url) 

102 

103 if not pdf_url: 

104 logger.error(f"Could not convert to PDF URL: {url}") 

105 return None 

106 

107 logger.info(f"Downloading bioRxiv/medRxiv PDF from {pdf_url}") 

108 return super()._download_pdf(pdf_url) 

109 

110 def _download_text(self, url: str) -> Optional[bytes]: 

111 """Get text content from bioRxiv/medRxiv.""" 

112 # Try to get abstract and metadata from the HTML page 

113 text = self._fetch_abstract_from_page(url) 

114 if text: 

115 return text.encode("utf-8") 

116 

117 # Fallback: Download PDF and extract text 

118 pdf_content = self._download_pdf(url) 

119 if pdf_content: 

120 extracted_text = self.extract_text_from_pdf(pdf_content) 

121 if extracted_text: 

122 return extracted_text.encode("utf-8") 

123 

124 return None 

125 

126 def _convert_to_pdf_url(self, url: str) -> Optional[str]: 

127 """Convert bioRxiv/medRxiv URL to PDF URL.""" 

128 # Handle different URL patterns 

129 # Example: https://www.biorxiv.org/content/10.1101/2024.01.01.123456v1 

130 # Becomes: https://www.biorxiv.org/content/10.1101/2024.01.01.123456v1.full.pdf 

131 

132 # Remove any existing .full or .full.pdf 

133 base_url = re.sub(r"\.full(\.pdf)?$", "", url) 

134 

135 # Check if it's already a PDF URL 

136 if base_url.endswith(".pdf"): 

137 return base_url 

138 

139 # Add .full.pdf 

140 pdf_url = base_url.rstrip("/") + ".full.pdf" 

141 

142 # Handle content vs. content/early URLs 

143 return pdf_url.replace("/content/early/", "/content/") 

144 

145 def _fetch_abstract_from_page(self, url: str) -> Optional[str]: 

146 """Fetch abstract and metadata from bioRxiv/medRxiv page.""" 

147 try: 

148 # Request the HTML page 

149 response = self.session.get(url, timeout=10) 

150 

151 if response.status_code == 200: 

152 # Simple extraction using regex (avoiding BeautifulSoup dependency) 

153 html = response.text 

154 text_parts = [] 

155 

156 # Extract title 

157 title_match = re.search( 

158 r'<meta\s+name="DC\.Title"\s+content="([^"]+)"', html 

159 ) 

160 if title_match: 

161 text_parts.append(f"Title: {title_match.group(1)}") 

162 

163 # Extract authors 

164 author_match = re.search( 

165 r'<meta\s+name="DC\.Creator"\s+content="([^"]+)"', html 

166 ) 

167 if author_match: 

168 text_parts.append(f"Authors: {author_match.group(1)}") 

169 

170 # Extract abstract 

171 abstract_match = re.search( 

172 r'<meta\s+name="DC\.Description"\s+content="([^"]+)"', 

173 html, 

174 re.DOTALL, 

175 ) 

176 if abstract_match: 

177 abstract = abstract_match.group(1) 

178 # Clean up HTML entities 

179 abstract = abstract.replace("&lt;", "<").replace( 

180 "&gt;", ">" 

181 ) 

182 abstract = abstract.replace("&quot;", '"').replace( 

183 "&#39;", "'" 

184 ) 

185 abstract = abstract.replace("&amp;", "&") 

186 text_parts.append(f"\nAbstract:\n{abstract}") 

187 

188 if text_parts: 

189 logger.info( 

190 "Retrieved text content from bioRxiv/medRxiv page" 

191 ) 

192 return "\n".join(text_parts) 

193 

194 except Exception as e: 

195 logger.debug(f"Failed to fetch abstract from bioRxiv/medRxiv: {e}") 

196 

197 return None