Coverage for src / local_deep_research / research_library / downloaders / arxiv.py: 96%

107 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1""" 

2arXiv PDF and Text Downloader 

3""" 

4 

5import re 

6from typing import Dict, Optional 

7from urllib.parse import urlparse 

8from loguru import logger 

9 

10from .base import BaseDownloader, ContentType, DownloadResult, USER_AGENT 

11 

12 

class ArxivDownloader(BaseDownloader):
    """Downloader for arXiv papers with PDF and abstract/text support."""

    def can_handle(self, url: str) -> bool:
        """Check if URL is from arXiv (arxiv.org or any of its subdomains)."""
        try:
            hostname = urlparse(url).hostname
            return bool(
                hostname
                and (hostname == "arxiv.org" or hostname.endswith(".arxiv.org"))
            )
        except Exception:
            # Malformed URLs are simply not handled by this downloader.
            return False

    def download(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> Optional[bytes]:
        """Download content from arXiv.

        Args:
            url: An arXiv abstract or PDF URL.
            content_type: ``ContentType.PDF`` for raw PDF bytes,
                ``ContentType.TEXT`` for extracted full text (with API
                metadata prepended when available).

        Returns:
            The downloaded content, or ``None`` on failure.
        """
        if content_type == ContentType.TEXT:
            return self._download_text(url)
        return self._download_pdf(url)

    def download_with_result(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> DownloadResult:
        """Download content and return detailed result with skip reason."""
        # Extract arXiv ID
        arxiv_id = self._extract_arxiv_id(url)
        if not arxiv_id:
            return DownloadResult(
                skip_reason="Invalid arXiv URL - could not extract article ID"
            )

        if content_type == ContentType.TEXT:
            # ArXiv API only provides abstracts, not full text.
            # We need to download the PDF and extract full text.
            logger.info(
                f"Downloading arXiv PDF for full text extraction: {arxiv_id}"
            )
            full_text = self._build_full_text(url, arxiv_id)
            if full_text is not None:
                return DownloadResult(content=full_text, is_success=True)
            return DownloadResult(
                skip_reason=f"Could not retrieve full text for arXiv:{arxiv_id}"
            )

        # PDF path. Route through _download_pdf so the identifying headers
        # are applied consistently (the original bypassed them here by
        # calling the base downloader directly) and the download is logged
        # exactly once.
        pdf_content = self._download_pdf(url)
        if pdf_content:
            return DownloadResult(content=pdf_content, is_success=True)
        return DownloadResult(
            skip_reason=f"Failed to download PDF for arXiv:{arxiv_id} - server may be unavailable"
        )

    def _download_pdf(
        self, url: str, headers: Optional[Dict[str, str]] = None
    ) -> Optional[bytes]:
        """Download PDF from arXiv.

        Args:
            url: Any arXiv URL containing an extractable article ID.
            headers: Optional extra HTTP headers; entries override the
                defaults below. (Previously this argument was accepted
                but silently ignored.)

        Returns:
            The PDF bytes, or ``None`` if the ID could not be extracted
            or the download failed.
        """
        # Extract arXiv ID
        arxiv_id = self._extract_arxiv_id(url)
        if not arxiv_id:
            logger.error(f"Could not extract arXiv ID from {url}")
            return None

        # Construct PDF URL
        pdf_url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"

        logger.info(f"Downloading arXiv PDF: {arxiv_id}")

        # Use honest user agent - arXiv supports academic tools with proper identification
        enhanced_headers = {
            "User-Agent": USER_AGENT,
            "Accept": "application/pdf,application/octet-stream,*/*",
            "Accept-Language": "en-US,en;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "Connection": "keep-alive",
        }
        if headers:
            # Caller-supplied headers take precedence over the defaults.
            enhanced_headers.update(headers)

        return super()._download_pdf(pdf_url, headers=enhanced_headers)

    def _download_text(self, url: str) -> Optional[bytes]:
        """Get full text content from arXiv PDF (with metadata from API)."""
        # Extract arXiv ID
        arxiv_id = self._extract_arxiv_id(url)
        if not arxiv_id:
            return None

        logger.info(f"Downloading arXiv PDF for full text: {arxiv_id}")
        return self._build_full_text(url, arxiv_id)

    def _build_full_text(self, url: str, arxiv_id: str) -> Optional[bytes]:
        """Download the PDF, extract its text, and prepend API metadata.

        Shared by the TEXT paths of download_with_result() and
        _download_text(), which previously duplicated this logic.

        Returns:
            UTF-8 encoded text (metadata header + full paper text when
            metadata is available), or ``None`` if the PDF could not be
            downloaded or yielded no extractable text.
        """
        pdf_content = self._download_pdf(url)
        if not pdf_content:
            return None

        extracted_text = self.extract_text_from_pdf(pdf_content)
        if not extracted_text:
            return None

        # Optionally prepend metadata from API
        metadata = self._fetch_from_arxiv_api(arxiv_id)
        if metadata:
            # Combine metadata with full text
            full_text = f"{metadata}\n\n{'=' * 80}\nFULL PAPER TEXT\n{'=' * 80}\n\n{extracted_text}"
            return full_text.encode("utf-8", errors="ignore")
        # Just return the extracted text
        return extracted_text.encode("utf-8", errors="ignore")

    def _extract_arxiv_id(self, url: str) -> Optional[str]:
        """Extract arXiv ID from URL.

        Returns the bare ID (version suffix stripped), or ``None`` if the
        URL does not match any known arXiv URL format.
        """
        # Handle different arXiv URL formats
        patterns = [
            r"arxiv\.org/abs/(\d+\.\d+)(?:v\d+)?",  # New format: 2301.12345 or 2301.12345v2
            r"arxiv\.org/pdf/(\d+\.\d+)(?:v\d+)?",  # PDF URL with optional version
            r"arxiv\.org/abs/([a-z-]+/\d+)(?:v\d+)?",  # Old format: cond-mat/0501234
            r"arxiv\.org/pdf/([a-z-]+/\d+)(?:v\d+)?",  # Old PDF URL with optional version
        ]

        for pattern in patterns:
            match = re.search(pattern, url)
            if match:
                return match.group(1)

        return None

    def _fetch_from_arxiv_api(self, arxiv_id: str) -> Optional[str]:
        """Fetch abstract and metadata from arXiv API.

        Returns a plain-text block (title, authors, abstract, categories)
        or ``None`` if the API call fails or yields no usable fields.
        """
        try:
            # Old-style IDs (e.g. "cond-mat/0501234") must keep their
            # slash: the API's id_list parameter expects the ID verbatim.
            # (The previous code stripped the slash, producing an ID the
            # API does not recognize.)
            api_url = f"https://export.arxiv.org/api/query?id_list={arxiv_id}"
            response = self.session.get(api_url, timeout=10)

            if response.status_code != 200:
                return None

            # Parse the Atom feed response
            # Use defusedxml to prevent XXE attacks
            from defusedxml import ElementTree as ET

            root = ET.fromstring(response.text)

            # Define namespaces (URIs are identifiers, not URLs to fetch)
            ns = {
                "atom": "http://www.w3.org/2005/Atom",  # DevSkim: ignore DS137138
                "arxiv": "http://arxiv.org/schemas/atom",  # DevSkim: ignore DS137138
            }

            # Find the entry
            entry = root.find("atom:entry", ns)
            if entry is None:
                return None

            text_parts = []

            # Title
            title = entry.find("atom:title", ns)
            if title is not None and title.text:
                text_parts.append(f"Title: {title.text.strip()}")

            # Authors
            author_names = []
            for author in entry.findall("atom:author", ns):
                name = author.find("atom:name", ns)
                if name is not None and name.text:
                    author_names.append(name.text.strip())
            if author_names:
                text_parts.append(f"Authors: {', '.join(author_names)}")

            # Abstract
            summary = entry.find("atom:summary", ns)
            if summary is not None and summary.text:
                text_parts.append(f"\nAbstract:\n{summary.text.strip()}")

            # Categories
            cat_terms = [
                cat.get("term")
                for cat in entry.findall("atom:category", ns)
                if cat.get("term")
            ]
            if cat_terms:
                text_parts.append(f"\nCategories: {', '.join(cat_terms)}")

            if text_parts:
                logger.info(
                    f"Retrieved text content from arXiv API for {arxiv_id}"
                )
                return "\n".join(text_parts)

        except Exception as e:
            logger.debug(f"Failed to fetch from arXiv API: {e}")

        return None