Coverage for src / local_deep_research / research_library / downloaders / biorxiv.py: 54%

95 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-02-25 01:07 +0000

"""
bioRxiv/medRxiv PDF and Text Downloader
"""

4 

import html
import re
from typing import Optional
from urllib.parse import urlparse

import requests
from loguru import logger

from .base import BaseDownloader, ContentType, DownloadResult

13 

14 

class BioRxivDownloader(BaseDownloader):
    """Downloader for bioRxiv and medRxiv preprints.

    Recognizes article URLs on biorxiv.org / medrxiv.org (including
    subdomains such as www.), derives the direct ``.full.pdf`` URL from
    the article URL, and for text requests prefers the abstract scraped
    from the article's HTML page over text extracted from the PDF.
    """

    def can_handle(self, url: str) -> bool:
        """Return True if *url* is hosted on bioRxiv or medRxiv.

        Matches the exact domain or any subdomain; a suffix check alone
        would wrongly accept lookalike hosts such as "evilbiorxiv.org".
        """
        try:
            hostname = urlparse(url).hostname
        except Exception:
            return False
        if not hostname:
            return False
        return any(
            hostname == domain or hostname.endswith("." + domain)
            for domain in ("biorxiv.org", "medrxiv.org")
        )

    def download(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> Optional[bytes]:
        """Download content from bioRxiv/medRxiv.

        Args:
            url: Article URL.
            content_type: PDF for raw PDF bytes, TEXT for UTF-8 text.

        Returns:
            Content bytes, or None on failure.
        """
        if content_type == ContentType.TEXT:
            return self._download_text(url)
        return self._download_pdf(url)

    def download_with_result(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> DownloadResult:
        """Download content and return a detailed result with a skip reason.

        TEXT requests reuse the same page-abstract-then-PDF-extraction
        pipeline as ``_download_text``; PDF requests additionally probe
        the article page on failure to produce a more specific reason.
        """
        if content_type == ContentType.TEXT:
            text = self._download_text(url)
            if text:
                return DownloadResult(content=text, is_success=True)
            return DownloadResult(
                skip_reason="Could not extract text from bioRxiv/medRxiv article"
            )

        # PDF path.
        pdf_url = self._convert_to_pdf_url(url)
        if not pdf_url:
            return DownloadResult(
                skip_reason="Invalid bioRxiv/medRxiv URL format"
            )

        logger.info(f"Downloading bioRxiv/medRxiv PDF from {pdf_url}")
        pdf_content = super()._download_pdf(pdf_url)
        if pdf_content:
            return DownloadResult(content=pdf_content, is_success=True)
        return self._diagnose_pdf_failure(url)

    def _diagnose_pdf_failure(self, url: str) -> DownloadResult:
        """Build a skip-reason result after a failed PDF download.

        Issues a best-effort HEAD request against the article page to
        distinguish a missing article (404) from a server outage (5xx);
        any other outcome yields a generic failure reason.
        """
        try:
            response = self.session.head(url, timeout=5)
            if response.status_code == 404:
                return DownloadResult(
                    skip_reason="Article not found on bioRxiv/medRxiv"
                )
            if response.status_code >= 500:
                return DownloadResult(
                    skip_reason="bioRxiv/medRxiv server temporarily unavailable"
                )
        except requests.RequestException:
            # Bug fix: the original passed stdlib-logging style
            # ("...: %s", url) to loguru, which formats with str.format,
            # so the URL was dropped and a literal "%s" was logged.
            logger.debug(f"Failed to check bioRxiv/medRxiv URL: {url}")
        return DownloadResult(
            skip_reason="Failed to download PDF from bioRxiv/medRxiv"
        )

    def _download_pdf(self, url: str) -> Optional[bytes]:
        """Download the article PDF via its derived .full.pdf URL."""
        pdf_url = self._convert_to_pdf_url(url)
        if not pdf_url:
            logger.error(f"Could not convert to PDF URL: {url}")
            return None

        logger.info(f"Downloading bioRxiv/medRxiv PDF from {pdf_url}")
        return super()._download_pdf(pdf_url)

    def _download_text(self, url: str) -> Optional[bytes]:
        """Get UTF-8 text content from bioRxiv/medRxiv.

        Tries the abstract/metadata scraped from the HTML page first,
        then falls back to extracting text from the downloaded PDF.
        """
        text = self._fetch_abstract_from_page(url)
        if text:
            return text.encode("utf-8")

        pdf_content = self._download_pdf(url)
        if pdf_content:
            extracted_text = self.extract_text_from_pdf(pdf_content)
            if extracted_text:
                return extracted_text.encode("utf-8")

        return None

    def _convert_to_pdf_url(self, url: str) -> Optional[str]:
        """Convert a bioRxiv/medRxiv article URL to its PDF URL.

        Example:
            https://www.biorxiv.org/content/10.1101/2024.01.01.123456v1
            -> https://www.biorxiv.org/content/10.1101/2024.01.01.123456v1.full.pdf
        """
        # Normalize away any existing .full / .full.pdf suffix first.
        base_url = re.sub(r"\.full(\.pdf)?$", "", url)

        # Already a direct PDF link (e.g. ends in something.pdf).
        if base_url.endswith(".pdf"):
            return base_url

        pdf_url = base_url.rstrip("/") + ".full.pdf"

        # Early-access URLs use /content/early/; the canonical PDF lives
        # under /content/.
        return pdf_url.replace("/content/early/", "/content/")

    def _fetch_abstract_from_page(self, url: str) -> Optional[str]:
        """Scrape title, authors, and abstract from the article page.

        Reads the Dublin Core <meta> tags with plain regex (deliberately
        avoiding a BeautifulSoup dependency). Returns a formatted text
        blob, or None when nothing useful was found or the request failed.
        """
        try:
            response = self.session.get(url, timeout=10)
            if response.status_code != 200:
                return None

            page = response.text
            text_parts = []

            # First DC.Title / DC.Creator meta tag only (pages emit one
            # DC.Creator per author; the original behavior keeps the first).
            for label, meta_name in (
                ("Title", "DC.Title"),
                ("Authors", "DC.Creator"),
            ):
                match = re.search(
                    rf'<meta\s+name="{re.escape(meta_name)}"\s+content="([^"]+)"',
                    page,
                )
                if match:
                    text_parts.append(f"{label}: {match.group(1)}")

            abstract_match = re.search(
                r'<meta\s+name="DC\.Description"\s+content="([^"]+)"',
                page,
                re.DOTALL,
            )
            if abstract_match:
                # Meta content is HTML-escaped; decode entities
                # (&lt; &gt; &quot; &#39; &amp; and the rest) in one pass.
                abstract = html.unescape(abstract_match.group(1))
                text_parts.append(f"\nAbstract:\n{abstract}")

            if text_parts:
                logger.info(
                    "Retrieved text content from bioRxiv/medRxiv page"
                )
                return "\n".join(text_parts)

        except Exception as e:
            logger.debug(f"Failed to fetch abstract from bioRxiv/medRxiv: {e}")

        return None