Coverage for src/local_deep_research/research_library/downloaders/arxiv.py: 69%

107 statements  

coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

1""" 

2arXiv PDF and Text Downloader 

3""" 

4 

5import re 

6from typing import Optional 

7from urllib.parse import urlparse 

8from loguru import logger 

9 

10from .base import BaseDownloader, ContentType, DownloadResult, USER_AGENT 

11 

12 

13class ArxivDownloader(BaseDownloader): 

14 """Downloader for arXiv papers with PDF and abstract/text support.""" 

15 

16 def can_handle(self, url: str) -> bool: 

17 """Check if URL is from arXiv.""" 

18 try: 

19 hostname = urlparse(url).hostname 

20 return bool( 

21 hostname 

22 and (hostname == "arxiv.org" or hostname.endswith(".arxiv.org")) 

23 ) 

24 except Exception: 

25 return False 

26 
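As a quick illustration of the hostname rule that can_handle applies, here is a small self-contained sketch; the example URLs are made up for demonstration and are not part of the module above:

from urllib.parse import urlparse

def looks_like_arxiv(url: str) -> bool:
    # Same check as can_handle: exact arxiv.org host or any *.arxiv.org subdomain.
    hostname = urlparse(url).hostname
    return bool(
        hostname and (hostname == "arxiv.org" or hostname.endswith(".arxiv.org"))
    )

assert looks_like_arxiv("https://arxiv.org/abs/2301.12345")                  # canonical host
assert looks_like_arxiv("https://export.arxiv.org/pdf/2301.12345")           # subdomain
assert not looks_like_arxiv("https://example.org/arxiv.org/abs/2301.12345")  # arxiv.org only in the path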

 27      def download(
 28          self, url: str, content_type: ContentType = ContentType.PDF
 29      ) -> Optional[bytes]:
 30          """Download content from arXiv."""
 31          if content_type == ContentType.TEXT:    31 ↛ 32: line 31 didn't jump to line 32 because the condition on line 31 was never true
 32              return self._download_text(url)
 33          else:
 34              return self._download_pdf(url)
 35
 36      def download_with_result(
 37          self, url: str, content_type: ContentType = ContentType.PDF
 38      ) -> DownloadResult:
 39          """Download content and return detailed result with skip reason."""
 40          # Extract arXiv ID
 41          arxiv_id = self._extract_arxiv_id(url)
 42          if not arxiv_id:
 43              return DownloadResult(
 44                  skip_reason="Invalid arXiv URL - could not extract article ID"
 45              )
 46
 47          if content_type == ContentType.TEXT:
 48              # ArXiv API only provides abstracts, not full text
 49              # We need to download the PDF and extract full text
 50              logger.info(
 51                  f"Downloading arXiv PDF for full text extraction: {arxiv_id}"
 52              )
 53
 54              pdf_content = self._download_pdf(url)
 55              if pdf_content:    55 ↛ 76: line 55 didn't jump to line 76 because the condition on line 55 was always true
 56                  extracted_text = self.extract_text_from_pdf(pdf_content)
 57                  if extracted_text:    57 ↛ 59: line 57 didn't jump to line 59 because the condition on line 57 was never true
 58                      # Optionally prepend metadata from API
 59                      metadata = self._fetch_from_arxiv_api(arxiv_id)
 60                      if metadata:
 61                          # Combine metadata with full text
 62                          full_text = f"{metadata}\n\n{'=' * 80}\nFULL PAPER TEXT\n{'=' * 80}\n\n{extracted_text}"
 63                          return DownloadResult(
 64                              content=full_text.encode("utf-8", errors="ignore"),
 65                              is_success=True,
 66                          )
 67                      else:
 68                          # Just return the extracted text
 69                          return DownloadResult(
 70                              content=extracted_text.encode(
 71                                  "utf-8", errors="ignore"
 72                              ),
 73                              is_success=True,
 74                          )
 75
 76              return DownloadResult(
 77                  skip_reason=f"Could not retrieve full text for arXiv:{arxiv_id}"
 78              )
 79          else:
 80              # Download PDF
 81              pdf_url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
 82              logger.info(f"Downloading arXiv PDF: {arxiv_id}")
 83
 84              pdf_content = super()._download_pdf(pdf_url)
 85              if pdf_content:
 86                  return DownloadResult(content=pdf_content, is_success=True)
 87              else:
 88                  return DownloadResult(
 89                      skip_reason=f"Failed to download PDF for arXiv:{arxiv_id} - server may be unavailable"
 90                  )
 91
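A hedged usage sketch of the result-returning path above. The constructor call and import paths are assumptions (the real constructor lives in BaseDownloader and is not shown in this file); the DownloadResult fields content, is_success, and skip_reason are the ones used in the listing:

# Illustrative only; assumes ArxivDownloader() takes no constructor arguments and
# that the module path matches the file path shown in the report header.
from local_deep_research.research_library.downloaders.arxiv import ArxivDownloader
from local_deep_research.research_library.downloaders.base import ContentType

downloader = ArxivDownloader()
result = downloader.download_with_result(
    "https://arxiv.org/abs/2301.12345", content_type=ContentType.PDF
)
if result.is_success:
    with open("paper.pdf", "wb") as fh:
        fh.write(result.content)
else:
    print(f"Skipped: {result.skip_reason}")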

 92      def _download_pdf(self, url: str) -> Optional[bytes]:
 93          """Download PDF from arXiv."""
 94          # Extract arXiv ID
 95          arxiv_id = self._extract_arxiv_id(url)
 96          if not arxiv_id:
 97              logger.error(f"Could not extract arXiv ID from {url}")
 98              return None
 99
100          # Construct PDF URL
101          pdf_url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
102
103          logger.info(f"Downloading arXiv PDF: {arxiv_id}")
104
105          # Use honest user agent - arXiv supports academic tools with proper identification
106          enhanced_headers = {
107              "User-Agent": USER_AGENT,
108              "Accept": "application/pdf,application/octet-stream,*/*",
109              "Accept-Language": "en-US,en;q=0.9",
110              "Accept-Encoding": "gzip, deflate, br",
111              "Connection": "keep-alive",
112          }
113
114          return super()._download_pdf(pdf_url, headers=enhanced_headers)
115

116      def _download_text(self, url: str) -> Optional[bytes]:
117          """Get full text content from arXiv PDF (with metadata from API)."""
118          # Extract arXiv ID
119          arxiv_id = self._extract_arxiv_id(url)
120          if not arxiv_id:
121              return None
122
123          # Download PDF for full text extraction
124          logger.info(f"Downloading arXiv PDF for full text: {arxiv_id}")
125          pdf_content = self._download_pdf(url)
126          if pdf_content:
127              extracted_text = self.extract_text_from_pdf(pdf_content)
128              if extracted_text:
129                  # Get metadata from API to prepend
130                  metadata = self._fetch_from_arxiv_api(arxiv_id)
131                  if metadata:
132                      # Combine metadata with full text
133                      full_text = f"{metadata}\n\n{'=' * 80}\nFULL PAPER TEXT\n{'=' * 80}\n\n{extracted_text}"
134                      return full_text.encode("utf-8", errors="ignore")
135                  else:
136                      return extracted_text.encode("utf-8", errors="ignore")
137
138          return None
139

140      def _extract_arxiv_id(self, url: str) -> Optional[str]:
141          """Extract arXiv ID from URL."""
142          # Handle different arXiv URL formats
143          patterns = [
144              r"arxiv\.org/abs/(\d+\.\d+)(?:v\d+)?",  # New format: 2301.12345 or 2301.12345v2
145              r"arxiv\.org/pdf/(\d+\.\d+)(?:v\d+)?",  # PDF URL with optional version
146              r"arxiv\.org/abs/([a-z-]+/\d+)(?:v\d+)?",  # Old format: cond-mat/0501234
147              r"arxiv\.org/pdf/([a-z-]+/\d+)(?:v\d+)?",  # Old PDF URL with optional version
148          ]
149
150          for pattern in patterns:
151              match = re.search(pattern, url)
152              if match:
153                  return match.group(1)
154
155          return None
156
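To make the URL handling concrete, a small self-contained sketch that exercises the same four patterns on illustrative URLs (the URLs are examples, not taken from the report):

import re

patterns = [
    r"arxiv\.org/abs/(\d+\.\d+)(?:v\d+)?",
    r"arxiv\.org/pdf/(\d+\.\d+)(?:v\d+)?",
    r"arxiv\.org/abs/([a-z-]+/\d+)(?:v\d+)?",
    r"arxiv\.org/pdf/([a-z-]+/\d+)(?:v\d+)?",
]

def extract(url):
    # Same first-match-wins loop as _extract_arxiv_id.
    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    return None

assert extract("https://arxiv.org/abs/2301.12345v2") == "2301.12345"            # new format, version dropped
assert extract("https://arxiv.org/pdf/2301.12345.pdf") == "2301.12345"          # PDF URL
assert extract("https://arxiv.org/abs/cond-mat/0501234") == "cond-mat/0501234"  # old format
assert extract("https://example.org/paper.pdf") is None                         # not an arXiv URL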

157      def _fetch_from_arxiv_api(self, arxiv_id: str) -> Optional[str]:
158          """Fetch abstract and metadata from arXiv API."""
159          try:
160              # Clean the ID for API query
161              clean_id = arxiv_id.replace("/", "")
162
163              # Query arXiv API
164              api_url = f"https://export.arxiv.org/api/query?id_list={clean_id}"
165              response = self.session.get(api_url, timeout=10)
166
167              if response.status_code == 200:
168                  # Parse the Atom feed response
169                  import xml.etree.ElementTree as ET
170
171                  root = ET.fromstring(response.text)
172
173                  # Define namespaces (URIs are identifiers, not URLs to fetch)
174                  ns = {
175                      "atom": "http://www.w3.org/2005/Atom",  # DevSkim: ignore DS137138
176                      "arxiv": "http://arxiv.org/schemas/atom",  # DevSkim: ignore DS137138
177                  }
178
179                  # Find the entry
180                  entry = root.find("atom:entry", ns)
181                  if entry is not None:    181 ↛ 232: line 181 didn't jump to line 232 because the condition on line 181 was always true
182                      # Extract text content
183                      text_parts = []
184
185                      # Title
186                      title = entry.find("atom:title", ns)
187                      if title is not None and title.text:    187 ↛ 191: line 187 didn't jump to line 191 because the condition on line 187 was always true
188                          text_parts.append(f"Title: {title.text.strip()}")
189
190                      # Authors
191                      authors = entry.findall("atom:author", ns)
192                      if authors:    192 ↛ 204: line 192 didn't jump to line 204 because the condition on line 192 was always true
193                          author_names = []
194                          for author in authors:
195                              name = author.find("atom:name", ns)
196                              if name is not None and name.text:    196 ↛ 194: line 196 didn't jump to line 194 because the condition on line 196 was always true
197                                  author_names.append(name.text.strip())
198                          if author_names:    198 ↛ 204: line 198 didn't jump to line 204 because the condition on line 198 was always true
199                              text_parts.append(
200                                  f"Authors: {', '.join(author_names)}"
201                              )
202
203                      # Abstract
204                      summary = entry.find("atom:summary", ns)
205                      if summary is not None and summary.text:    205 ↛ 211: line 205 didn't jump to line 211 because the condition on line 205 was always true
206                          text_parts.append(
207                              f"\nAbstract:\n{summary.text.strip()}"
208                          )
209
210                      # Categories
211                      categories = entry.findall("atom:category", ns)
212                      if categories:    212 ↛ 213: line 212 didn't jump to line 213 because the condition on line 212 was never true
213                          cat_terms = [
214                              cat.get("term")
215                              for cat in categories
216                              if cat.get("term")
217                          ]
218                          if cat_terms:
219                              text_parts.append(
220                                  f"\nCategories: {', '.join(cat_terms)}"
221                              )
222
223                      if text_parts:    223 ↛ 232: line 223 didn't jump to line 232 because the condition on line 223 was always true
224                          logger.info(
225                              f"Retrieved text content from arXiv API for {arxiv_id}"
226                          )
227                          return "\n".join(text_parts)
228
229          except Exception as e:
230              logger.debug(f"Failed to fetch from arXiv API: {e}")
231
232          return None
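To make the parsing step in _fetch_from_arxiv_api concrete, here is a small self-contained sketch of the same ElementTree lookups run against a hand-written Atom snippet; the feed content below is a placeholder, not real arXiv API output:

import xml.etree.ElementTree as ET

# Minimal Atom feed in the shape the method expects (title, author/name, summary).
sample = """<?xml version="1.0"?>
<feed xmlns="http://www.w3.org/2005/Atom">
  <entry>
    <title>Example Title</title>
    <author><name>A. Author</name></author>
    <summary>Example abstract text.</summary>
  </entry>
</feed>"""

ns = {"atom": "http://www.w3.org/2005/Atom"}
entry = ET.fromstring(sample).find("atom:entry", ns)

title = entry.find("atom:title", ns).text                                   # "Example Title"
authors = [a.find("atom:name", ns).text for a in entry.findall("atom:author", ns)]
summary = entry.find("atom:summary", ns).text                               # "Example abstract text."
print(title, authors, summary)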