Coverage for src / local_deep_research / research_library / downloaders / biorxiv.py: 54%

94 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

1""" 

2bioRxiv/medRxiv PDF and Text Downloader 

3""" 

4 

5import re 

6from typing import Optional 

7from urllib.parse import urlparse 

8from loguru import logger 

9 

10from .base import BaseDownloader, ContentType, DownloadResult 

11 

12 

class BioRxivDownloader(BaseDownloader):
    """Downloader for bioRxiv and medRxiv preprints.

    PDF downloads rewrite the article URL to its ``.full.pdf`` form and
    delegate the fetch to the base class. Text downloads first scrape
    title/authors/abstract metadata from the article's HTML page and
    fall back to extracting text from the downloaded PDF.
    """

    def can_handle(self, url: str) -> bool:
        """Return True if *url* points at bioRxiv or medRxiv.

        Matches the bare domains and any subdomain (e.g. www.biorxiv.org).
        Malformed URLs yield False rather than raising.
        """
        try:
            hostname = urlparse(url).hostname
            if not hostname:
                return False
            return (
                hostname == "biorxiv.org"
                or hostname.endswith(".biorxiv.org")
                or hostname == "medrxiv.org"
                or hostname.endswith(".medrxiv.org")
            )
        except Exception:
            return False

    def download(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> Optional[bytes]:
        """Download content from bioRxiv/medRxiv.

        Returns raw PDF bytes, or UTF-8 encoded text when *content_type*
        is ``ContentType.TEXT``; ``None`` on failure.
        """
        if content_type == ContentType.TEXT:
            return self._download_text(url)
        return self._download_pdf(url)

    def download_with_result(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> DownloadResult:
        """Download content and return a detailed result with skip reason."""
        if content_type == ContentType.TEXT:
            # Cheapest path first: scrape abstract/metadata from the page.
            text = self._fetch_abstract_from_page(url)
            if text:
                return DownloadResult(
                    content=text.encode("utf-8"), is_success=True
                )

            # Fallback: download the PDF and extract its text.
            pdf_content = self._download_pdf(url)
            if pdf_content:
                extracted_text = self.extract_text_from_pdf(pdf_content)
                if extracted_text:
                    return DownloadResult(
                        content=extracted_text.encode("utf-8"),
                        is_success=True,
                    )

            return DownloadResult(
                skip_reason="Could not extract text from bioRxiv/medRxiv article"
            )

        # PDF path.
        pdf_url = self._convert_to_pdf_url(url)
        if not pdf_url:
            return DownloadResult(
                skip_reason="Invalid bioRxiv/medRxiv URL format"
            )

        logger.info(f"Downloading bioRxiv/medRxiv PDF from {pdf_url}")
        pdf_content = super()._download_pdf(pdf_url)

        if pdf_content:
            return DownloadResult(content=pdf_content, is_success=True)

        # Distinguish a missing article from a server outage so callers
        # get an actionable skip reason. This probe is best-effort only:
        # any network error falls through to the generic failure below.
        try:
            response = self.session.head(url, timeout=5)
            if response.status_code == 404:
                return DownloadResult(
                    skip_reason="Article not found on bioRxiv/medRxiv"
                )
            elif response.status_code >= 500:
                return DownloadResult(
                    skip_reason="bioRxiv/medRxiv server temporarily unavailable"
                )
        except Exception:
            # Deliberate best-effort: diagnostic probe failures must not
            # mask the primary "download failed" result.
            pass
        return DownloadResult(
            skip_reason="Failed to download PDF from bioRxiv/medRxiv"
        )

    def _download_pdf(self, url: str) -> Optional[bytes]:
        """Download the article PDF; None if the URL cannot be converted."""
        pdf_url = self._convert_to_pdf_url(url)

        if not pdf_url:
            logger.error(f"Could not convert to PDF URL: {url}")
            return None

        logger.info(f"Downloading bioRxiv/medRxiv PDF from {pdf_url}")
        return super()._download_pdf(pdf_url)

    def _download_text(self, url: str) -> Optional[bytes]:
        """Get UTF-8 text content from bioRxiv/medRxiv.

        Prefers the HTML abstract/metadata; falls back to extracting
        text from the PDF. Returns None when neither source yields text.
        """
        text = self._fetch_abstract_from_page(url)
        if text:
            return text.encode("utf-8")

        # Fallback: download PDF and extract text from it.
        pdf_content = self._download_pdf(url)
        if pdf_content:
            extracted_text = self.extract_text_from_pdf(pdf_content)
            if extracted_text:
                return extracted_text.encode("utf-8")

        return None

    def _convert_to_pdf_url(self, url: str) -> Optional[str]:
        """Convert a bioRxiv/medRxiv article URL to its PDF URL.

        Example:
            https://www.biorxiv.org/content/10.1101/2024.01.01.123456v1
            becomes
            https://www.biorxiv.org/content/10.1101/2024.01.01.123456v1.full.pdf
        """
        # Normalize away any existing .full / .full.pdf suffix first so
        # already-converted URLs are handled idempotently.
        base_url = re.sub(r"\.full(\.pdf)?$", "", url)

        # Already a direct PDF URL (e.g. some mirror links) — keep as-is.
        if base_url.endswith(".pdf"):
            return base_url

        pdf_url = base_url.rstrip("/") + ".full.pdf"

        # "content/early" URLs redirect; use the canonical content path.
        pdf_url = pdf_url.replace("/content/early/", "/content/")

        return pdf_url

    def _fetch_abstract_from_page(self, url: str) -> Optional[str]:
        """Fetch title, authors and abstract from the article's HTML page.

        Parses the Dublin Core <meta> tags with regexes (avoiding a
        BeautifulSoup dependency). Returns the joined text, or None when
        the page yields nothing usable or the request fails.
        """
        try:
            response = self.session.get(url, timeout=10)

            if response.status_code == 200:
                html = response.text
                text_parts = []

                # Extract title.
                title_match = re.search(
                    r'<meta\s+name="DC\.Title"\s+content="([^"]+)"', html
                )
                if title_match:
                    text_parts.append(f"Title: {title_match.group(1)}")

                # Extract authors.
                author_match = re.search(
                    r'<meta\s+name="DC\.Creator"\s+content="([^"]+)"', html
                )
                if author_match:
                    text_parts.append(f"Authors: {author_match.group(1)}")

                # Extract abstract.
                abstract_match = re.search(
                    r'<meta\s+name="DC\.Description"\s+content="([^"]+)"',
                    html,
                    re.DOTALL,
                )
                if abstract_match:
                    abstract = abstract_match.group(1)
                    # Clean up HTML entities. &amp; is decoded last so
                    # double-escaped entities (e.g. &amp;lt;) resolve
                    # correctly to a single level.
                    abstract = abstract.replace("&lt;", "<").replace(
                        "&gt;", ">"
                    )
                    abstract = abstract.replace("&quot;", '"').replace(
                        "&#39;", "'"
                    )
                    abstract = abstract.replace("&amp;", "&")
                    text_parts.append(f"\nAbstract:\n{abstract}")

                if text_parts:
                    logger.info(
                        "Retrieved text content from bioRxiv/medRxiv page"
                    )
                    return "\n".join(text_parts)

        except Exception as e:
            logger.debug(f"Failed to fetch abstract from bioRxiv/medRxiv: {e}")

        return None