Coverage for src / local_deep_research / research_library / downloaders / biorxiv.py: 54%

95 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-02-25 01:07 +0000

"""
bioRxiv/medRxiv PDF and Text Downloader
"""

4 

import html
import re
from typing import Optional
from urllib.parse import urlparse

import requests
from loguru import logger

from .base import BaseDownloader, ContentType, DownloadResult

13 

14 

class BioRxivDownloader(BaseDownloader):
    """Downloader for bioRxiv and medRxiv preprints.

    Recognizes article URLs on biorxiv.org / medrxiv.org (including
    subdomains such as www.), derives the direct ``.full.pdf`` URL from
    the article URL, and for text requests prefers the abstract scraped
    from the article's HTML page over text extracted from the PDF.
    """

    def can_handle(self, url: str) -> bool:
        """Return True if *url* is hosted on bioRxiv or medRxiv.

        Matches the exact domain or any subdomain; a suffix check alone
        would wrongly accept lookalike hosts such as "evilbiorxiv.org".
        """
        try:
            hostname = urlparse(url).hostname
        except Exception:
            return False
        if not hostname:
            return False
        return any(
            hostname == domain or hostname.endswith("." + domain)
            for domain in ("biorxiv.org", "medrxiv.org")
        )

    def download(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> Optional[bytes]:
        """Download content from bioRxiv/medRxiv.

        Args:
            url: Article URL.
            content_type: PDF for raw PDF bytes, TEXT for UTF-8 text.

        Returns:
            Content bytes, or None on failure.
        """
        if content_type == ContentType.TEXT:
            return self._download_text(url)
        return self._download_pdf(url)

    def download_with_result(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> DownloadResult:
        """Download content and return a detailed result with a skip reason.

        TEXT requests reuse the same page-abstract-then-PDF-extraction
        pipeline as ``_download_text``; PDF requests additionally probe
        the article page on failure to produce a more specific reason.
        """
        if content_type == ContentType.TEXT:
            text = self._download_text(url)
            if text:
                return DownloadResult(content=text, is_success=True)
            return DownloadResult(
                skip_reason="Could not extract text from bioRxiv/medRxiv article"
            )

        # PDF path.
        pdf_url = self._convert_to_pdf_url(url)
        if not pdf_url:
            return DownloadResult(
                skip_reason="Invalid bioRxiv/medRxiv URL format"
            )

        logger.info(f"Downloading bioRxiv/medRxiv PDF from {pdf_url}")
        pdf_content = super()._download_pdf(pdf_url)
        if pdf_content:
            return DownloadResult(content=pdf_content, is_success=True)
        return self._diagnose_pdf_failure(url)

    def _diagnose_pdf_failure(self, url: str) -> DownloadResult:
        """Build a skip-reason result after a failed PDF download.

        Issues a best-effort HEAD request against the article page to
        distinguish a missing article (404) from a server outage (5xx);
        any other outcome yields a generic failure reason.
        """
        try:
            response = self.session.head(url, timeout=5)
            if response.status_code == 404:
                return DownloadResult(
                    skip_reason="Article not found on bioRxiv/medRxiv"
                )
            if response.status_code >= 500:
                return DownloadResult(
                    skip_reason="bioRxiv/medRxiv server temporarily unavailable"
                )
        except requests.RequestException:
            # Bug fix: the original passed stdlib-logging style
            # ("...: %s", url) to loguru, which formats with str.format,
            # so the URL was dropped and a literal "%s" was logged.
            logger.debug(f"Failed to check bioRxiv/medRxiv URL: {url}")
        return DownloadResult(
            skip_reason="Failed to download PDF from bioRxiv/medRxiv"
        )

    def _download_pdf(self, url: str) -> Optional[bytes]:
        """Download the article PDF via its derived .full.pdf URL."""
        pdf_url = self._convert_to_pdf_url(url)
        if not pdf_url:
            logger.error(f"Could not convert to PDF URL: {url}")
            return None

        logger.info(f"Downloading bioRxiv/medRxiv PDF from {pdf_url}")
        return super()._download_pdf(pdf_url)

    def _download_text(self, url: str) -> Optional[bytes]:
        """Get UTF-8 text content from bioRxiv/medRxiv.

        Tries the abstract/metadata scraped from the HTML page first,
        then falls back to extracting text from the downloaded PDF.
        """
        text = self._fetch_abstract_from_page(url)
        if text:
            return text.encode("utf-8")

        pdf_content = self._download_pdf(url)
        if pdf_content:
            extracted_text = self.extract_text_from_pdf(pdf_content)
            if extracted_text:
                return extracted_text.encode("utf-8")

        return None

    def _convert_to_pdf_url(self, url: str) -> Optional[str]:
        """Convert a bioRxiv/medRxiv article URL to its PDF URL.

        Example:
            https://www.biorxiv.org/content/10.1101/2024.01.01.123456v1
            -> https://www.biorxiv.org/content/10.1101/2024.01.01.123456v1.full.pdf
        """
        # Normalize away any existing .full / .full.pdf suffix first.
        base_url = re.sub(r"\.full(\.pdf)?$", "", url)

        # Already a direct PDF link (e.g. ends in something.pdf).
        if base_url.endswith(".pdf"):
            return base_url

        pdf_url = base_url.rstrip("/") + ".full.pdf"

        # Early-access URLs use /content/early/; the canonical PDF lives
        # under /content/.
        return pdf_url.replace("/content/early/", "/content/")

    def _fetch_abstract_from_page(self, url: str) -> Optional[str]:
        """Scrape title, authors, and abstract from the article page.

        Reads the Dublin Core <meta> tags with plain regex (deliberately
        avoiding a BeautifulSoup dependency). Returns a formatted text
        blob, or None when nothing useful was found or the request failed.
        """
        try:
            response = self.session.get(url, timeout=10)
            if response.status_code != 200:
                return None

            page = response.text
            text_parts = []

            # First DC.Title / DC.Creator meta tag only (pages emit one
            # DC.Creator per author; the original behavior keeps the first).
            for label, meta_name in (
                ("Title", "DC.Title"),
                ("Authors", "DC.Creator"),
            ):
                match = re.search(
                    rf'<meta\s+name="{re.escape(meta_name)}"\s+content="([^"]+)"',
                    page,
                )
                if match:
                    text_parts.append(f"{label}: {match.group(1)}")

            abstract_match = re.search(
                r'<meta\s+name="DC\.Description"\s+content="([^"]+)"',
                page,
                re.DOTALL,
            )
            if abstract_match:
                # Meta content is HTML-escaped; decode entities
                # (&lt; &gt; &quot; &#39; &amp; and the rest) in one pass.
                abstract = html.unescape(abstract_match.group(1))
                text_parts.append(f"\nAbstract:\n{abstract}")

            if text_parts:
                logger.info(
                    "Retrieved text content from bioRxiv/medRxiv page"
                )
                return "\n".join(text_parts)

        except Exception as e:
            logger.debug(f"Failed to fetch abstract from bioRxiv/medRxiv: {e}")

        return None