Coverage for src / local_deep_research / research_library / downloaders / arxiv.py: 96%

107 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1""" 

2arXiv PDF and Text Downloader 

3""" 

4 

5import re 

6from typing import Dict, Optional 

7from urllib.parse import urlparse 

8from loguru import logger 

9 

10from .base import BaseDownloader, ContentType, DownloadResult, USER_AGENT 

11 

12 

class ArxivDownloader(BaseDownloader):
    """Downloader for arXiv papers with PDF and abstract/text support."""

    def can_handle(self, url: str) -> bool:
        """Check if URL is from arXiv (arxiv.org or any of its subdomains)."""
        try:
            hostname = urlparse(url).hostname
            return bool(
                hostname
                and (hostname == "arxiv.org" or hostname.endswith(".arxiv.org"))
            )
        except Exception:
            # Malformed URLs are simply not handled by this downloader.
            return False

    def download(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> Optional[bytes]:
        """Download content from arXiv.

        Args:
            url: An arXiv abstract or PDF URL.
            content_type: ``ContentType.PDF`` for raw PDF bytes,
                ``ContentType.TEXT`` for extracted full text (with API
                metadata prepended when available).

        Returns:
            The downloaded content, or ``None`` on failure.
        """
        if content_type == ContentType.TEXT:
            return self._download_text(url)
        return self._download_pdf(url)

    def download_with_result(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> DownloadResult:
        """Download content and return detailed result with skip reason."""
        # Extract arXiv ID
        arxiv_id = self._extract_arxiv_id(url)
        if not arxiv_id:
            return DownloadResult(
                skip_reason="Invalid arXiv URL - could not extract article ID"
            )

        if content_type == ContentType.TEXT:
            # ArXiv API only provides abstracts, not full text.
            # We need to download the PDF and extract full text.
            logger.info(
                f"Downloading arXiv PDF for full text extraction: {arxiv_id}"
            )
            full_text = self._build_full_text(url, arxiv_id)
            if full_text is not None:
                return DownloadResult(content=full_text, is_success=True)
            return DownloadResult(
                skip_reason=f"Could not retrieve full text for arXiv:{arxiv_id}"
            )

        # PDF path. Route through _download_pdf so the identifying headers
        # are applied consistently (the original bypassed them here by
        # calling the base downloader directly) and the download is logged
        # exactly once.
        pdf_content = self._download_pdf(url)
        if pdf_content:
            return DownloadResult(content=pdf_content, is_success=True)
        return DownloadResult(
            skip_reason=f"Failed to download PDF for arXiv:{arxiv_id} - server may be unavailable"
        )

    def _download_pdf(
        self, url: str, headers: Optional[Dict[str, str]] = None
    ) -> Optional[bytes]:
        """Download PDF from arXiv.

        Args:
            url: Any arXiv URL containing an extractable article ID.
            headers: Optional extra HTTP headers; entries override the
                defaults below. (Previously this argument was accepted
                but silently ignored.)

        Returns:
            The PDF bytes, or ``None`` if the ID could not be extracted
            or the download failed.
        """
        # Extract arXiv ID
        arxiv_id = self._extract_arxiv_id(url)
        if not arxiv_id:
            logger.error(f"Could not extract arXiv ID from {url}")
            return None

        # Construct PDF URL
        pdf_url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"

        logger.info(f"Downloading arXiv PDF: {arxiv_id}")

        # Use honest user agent - arXiv supports academic tools with proper identification
        enhanced_headers = {
            "User-Agent": USER_AGENT,
            "Accept": "application/pdf,application/octet-stream,*/*",
            "Accept-Language": "en-US,en;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "Connection": "keep-alive",
        }
        if headers:
            # Caller-supplied headers take precedence over the defaults.
            enhanced_headers.update(headers)

        return super()._download_pdf(pdf_url, headers=enhanced_headers)

    def _download_text(self, url: str) -> Optional[bytes]:
        """Get full text content from arXiv PDF (with metadata from API)."""
        # Extract arXiv ID
        arxiv_id = self._extract_arxiv_id(url)
        if not arxiv_id:
            return None

        logger.info(f"Downloading arXiv PDF for full text: {arxiv_id}")
        return self._build_full_text(url, arxiv_id)

    def _build_full_text(self, url: str, arxiv_id: str) -> Optional[bytes]:
        """Download the PDF, extract its text, and prepend API metadata.

        Shared by the TEXT paths of download_with_result() and
        _download_text(), which previously duplicated this logic.

        Returns:
            UTF-8 encoded text (metadata header + full paper text when
            metadata is available), or ``None`` if the PDF could not be
            downloaded or yielded no extractable text.
        """
        pdf_content = self._download_pdf(url)
        if not pdf_content:
            return None

        extracted_text = self.extract_text_from_pdf(pdf_content)
        if not extracted_text:
            return None

        # Optionally prepend metadata from API
        metadata = self._fetch_from_arxiv_api(arxiv_id)
        if metadata:
            # Combine metadata with full text
            full_text = f"{metadata}\n\n{'=' * 80}\nFULL PAPER TEXT\n{'=' * 80}\n\n{extracted_text}"
            return full_text.encode("utf-8", errors="ignore")
        # Just return the extracted text
        return extracted_text.encode("utf-8", errors="ignore")

    def _extract_arxiv_id(self, url: str) -> Optional[str]:
        """Extract arXiv ID from URL.

        Returns the bare ID (version suffix stripped), or ``None`` if the
        URL does not match any known arXiv URL format.
        """
        # Handle different arXiv URL formats
        patterns = [
            r"arxiv\.org/abs/(\d+\.\d+)(?:v\d+)?",  # New format: 2301.12345 or 2301.12345v2
            r"arxiv\.org/pdf/(\d+\.\d+)(?:v\d+)?",  # PDF URL with optional version
            r"arxiv\.org/abs/([a-z-]+/\d+)(?:v\d+)?",  # Old format: cond-mat/0501234
            r"arxiv\.org/pdf/([a-z-]+/\d+)(?:v\d+)?",  # Old PDF URL with optional version
        ]

        for pattern in patterns:
            match = re.search(pattern, url)
            if match:
                return match.group(1)

        return None

    def _fetch_from_arxiv_api(self, arxiv_id: str) -> Optional[str]:
        """Fetch abstract and metadata from arXiv API.

        Returns a plain-text block (title, authors, abstract, categories)
        or ``None`` if the API call fails or yields no usable fields.
        """
        try:
            # Old-style IDs (e.g. "cond-mat/0501234") must keep their
            # slash: the API's id_list parameter expects the ID verbatim.
            # (The previous code stripped the slash, producing an ID the
            # API does not recognize.)
            api_url = f"https://export.arxiv.org/api/query?id_list={arxiv_id}"
            response = self.session.get(api_url, timeout=10)

            if response.status_code != 200:
                return None

            # Parse the Atom feed response
            # Use defusedxml to prevent XXE attacks
            from defusedxml import ElementTree as ET

            root = ET.fromstring(response.text)

            # Define namespaces (URIs are identifiers, not URLs to fetch)
            ns = {
                "atom": "http://www.w3.org/2005/Atom",  # DevSkim: ignore DS137138
                "arxiv": "http://arxiv.org/schemas/atom",  # DevSkim: ignore DS137138
            }

            # Find the entry
            entry = root.find("atom:entry", ns)
            if entry is None:
                return None

            text_parts = []

            # Title
            title = entry.find("atom:title", ns)
            if title is not None and title.text:
                text_parts.append(f"Title: {title.text.strip()}")

            # Authors
            author_names = []
            for author in entry.findall("atom:author", ns):
                name = author.find("atom:name", ns)
                if name is not None and name.text:
                    author_names.append(name.text.strip())
            if author_names:
                text_parts.append(f"Authors: {', '.join(author_names)}")

            # Abstract
            summary = entry.find("atom:summary", ns)
            if summary is not None and summary.text:
                text_parts.append(f"\nAbstract:\n{summary.text.strip()}")

            # Categories
            cat_terms = [
                cat.get("term")
                for cat in entry.findall("atom:category", ns)
                if cat.get("term")
            ]
            if cat_terms:
                text_parts.append(f"\nCategories: {', '.join(cat_terms)}")

            if text_parts:
                logger.info(
                    f"Retrieved text content from arXiv API for {arxiv_id}"
                )
                return "\n".join(text_parts)

        except Exception as e:
            logger.debug(f"Failed to fetch from arXiv API: {e}")

        return None