Coverage for src/local_deep_research/research_library/downloaders/arxiv.py: 69%
107 statements
coverage.py v7.12.0, created at 2026-01-11 00:51 +0000
1"""
2arXiv PDF and Text Downloader
3"""
5import re
6from typing import Optional
7from urllib.parse import urlparse
8from loguru import logger
10from .base import BaseDownloader, ContentType, DownloadResult, USER_AGENT


 13  class ArxivDownloader(BaseDownloader):
 14      """Downloader for arXiv papers with PDF and abstract/text support."""

 16      def can_handle(self, url: str) -> bool:
 17          """Check if URL is from arXiv."""
 18          try:
 19              hostname = urlparse(url).hostname
 20              return bool(
 21                  hostname
 22                  and (hostname == "arxiv.org" or hostname.endswith(".arxiv.org"))
 23              )
 24          except Exception:
 25              return False
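
     # Illustrative behaviour (not part of the covered source): can_handle()
     # accepts the bare domain and any arxiv.org subdomain, and rejects
     # lookalike hosts:
     #     can_handle("https://arxiv.org/abs/2301.12345")         -> True
     #     can_handle("https://export.arxiv.org/abs/2301.12345")  -> True
     #     can_handle("https://notarxiv.org/abs/2301.12345")      -> False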

 27      def download(
 28          self, url: str, content_type: ContentType = ContentType.PDF
 29      ) -> Optional[bytes]:
 30          """Download content from arXiv."""
 31          if content_type == ContentType.TEXT:  # coverage: 31 ↛ 32 (condition never true)
 32              return self._download_text(url)
 33          else:
 34              return self._download_pdf(url)

 36      def download_with_result(
 37          self, url: str, content_type: ContentType = ContentType.PDF
 38      ) -> DownloadResult:
 39          """Download content and return detailed result with skip reason."""
 40          # Extract arXiv ID
 41          arxiv_id = self._extract_arxiv_id(url)
 42          if not arxiv_id:
 43              return DownloadResult(
 44                  skip_reason="Invalid arXiv URL - could not extract article ID"
 45              )

 47          if content_type == ContentType.TEXT:
 48              # ArXiv API only provides abstracts, not full text
 49              # We need to download the PDF and extract full text
 50              logger.info(
 51                  f"Downloading arXiv PDF for full text extraction: {arxiv_id}"
 52              )

 54              pdf_content = self._download_pdf(url)
 55              if pdf_content:  # coverage: 55 ↛ 76 (condition always true)
 56                  extracted_text = self.extract_text_from_pdf(pdf_content)
 57                  if extracted_text:  # coverage: 57 ↛ 59 (condition never true)
 58                      # Optionally prepend metadata from API
 59                      metadata = self._fetch_from_arxiv_api(arxiv_id)
 60                      if metadata:
 61                          # Combine metadata with full text
 62                          full_text = f"{metadata}\n\n{'=' * 80}\nFULL PAPER TEXT\n{'=' * 80}\n\n{extracted_text}"
 63                          return DownloadResult(
 64                              content=full_text.encode("utf-8", errors="ignore"),
 65                              is_success=True,
 66                          )
 67                      else:
 68                          # Just return the extracted text
 69                          return DownloadResult(
 70                              content=extracted_text.encode(
 71                                  "utf-8", errors="ignore"
 72                              ),
 73                              is_success=True,
 74                          )

 76              return DownloadResult(
 77                  skip_reason=f"Could not retrieve full text for arXiv:{arxiv_id}"
 78              )
 79          else:
 80              # Download PDF
 81              pdf_url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
 82              logger.info(f"Downloading arXiv PDF: {arxiv_id}")

 84              pdf_content = super()._download_pdf(pdf_url)
 85              if pdf_content:
 86                  return DownloadResult(content=pdf_content, is_success=True)
 87              else:
 88                  return DownloadResult(
 89                      skip_reason=f"Failed to download PDF for arXiv:{arxiv_id} - server may be unavailable"
 90                  )
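
     # Usage sketch (not part of the covered source; assumes the inherited
     # BaseDownloader constructor requires no arguments):
     #     downloader = ArxivDownloader()
     #     result = downloader.download_with_result(
     #         "https://arxiv.org/abs/2301.12345", ContentType.PDF
     #     )
     #     if result.is_success:
     #         pdf_bytes = result.content
     #     else:
     #         print(result.skip_reason)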

 92      def _download_pdf(self, url: str) -> Optional[bytes]:
 93          """Download PDF from arXiv."""
 94          # Extract arXiv ID
 95          arxiv_id = self._extract_arxiv_id(url)
 96          if not arxiv_id:
 97              logger.error(f"Could not extract arXiv ID from {url}")
 98              return None

100          # Construct PDF URL
101          pdf_url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"

103          logger.info(f"Downloading arXiv PDF: {arxiv_id}")

105          # Use honest user agent - arXiv supports academic tools with proper identification
106          enhanced_headers = {
107              "User-Agent": USER_AGENT,
108              "Accept": "application/pdf,application/octet-stream,*/*",
109              "Accept-Language": "en-US,en;q=0.9",
110              "Accept-Encoding": "gzip, deflate, br",
111              "Connection": "keep-alive",
112          }

114          return super()._download_pdf(pdf_url, headers=enhanced_headers)

116      def _download_text(self, url: str) -> Optional[bytes]:
117          """Get full text content from arXiv PDF (with metadata from API)."""
118          # Extract arXiv ID
119          arxiv_id = self._extract_arxiv_id(url)
120          if not arxiv_id:
121              return None

123          # Download PDF for full text extraction
124          logger.info(f"Downloading arXiv PDF for full text: {arxiv_id}")
125          pdf_content = self._download_pdf(url)
126          if pdf_content:
127              extracted_text = self.extract_text_from_pdf(pdf_content)
128              if extracted_text:
129                  # Get metadata from API to prepend
130                  metadata = self._fetch_from_arxiv_api(arxiv_id)
131                  if metadata:
132                      # Combine metadata with full text
133                      full_text = f"{metadata}\n\n{'=' * 80}\nFULL PAPER TEXT\n{'=' * 80}\n\n{extracted_text}"
134                      return full_text.encode("utf-8", errors="ignore")
135                  else:
136                      return extracted_text.encode("utf-8", errors="ignore")

138          return None

140      def _extract_arxiv_id(self, url: str) -> Optional[str]:
141          """Extract arXiv ID from URL."""
142          # Handle different arXiv URL formats
143          patterns = [
144              r"arxiv\.org/abs/(\d+\.\d+)(?:v\d+)?",  # New format: 2301.12345 or 2301.12345v2
145              r"arxiv\.org/pdf/(\d+\.\d+)(?:v\d+)?",  # PDF URL with optional version
146              r"arxiv\.org/abs/([a-z-]+/\d+)(?:v\d+)?",  # Old format: cond-mat/0501234
147              r"arxiv\.org/pdf/([a-z-]+/\d+)(?:v\d+)?",  # Old PDF URL with optional version
148          ]

150          for pattern in patterns:
151              match = re.search(pattern, url)
152              if match:
153                  return match.group(1)

155          return None
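
     # Illustrative mappings for the patterns above (not part of the covered source):
     #     _extract_arxiv_id("https://arxiv.org/abs/2301.12345")        -> "2301.12345"
     #     _extract_arxiv_id("https://arxiv.org/pdf/2301.12345v2.pdf")  -> "2301.12345"
     #     _extract_arxiv_id("https://arxiv.org/abs/cond-mat/0501234")  -> "cond-mat/0501234"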

157      def _fetch_from_arxiv_api(self, arxiv_id: str) -> Optional[str]:
158          """Fetch abstract and metadata from arXiv API."""
159          try:
160              # Clean the ID for API query
161              clean_id = arxiv_id.replace("/", "")

163              # Query arXiv API
164              api_url = f"https://export.arxiv.org/api/query?id_list={clean_id}"
165              response = self.session.get(api_url, timeout=10)

167              if response.status_code == 200:
168                  # Parse the Atom feed response
169                  import xml.etree.ElementTree as ET

171                  root = ET.fromstring(response.text)

173                  # Define namespaces (URIs are identifiers, not URLs to fetch)
174                  ns = {
175                      "atom": "http://www.w3.org/2005/Atom",  # DevSkim: ignore DS137138
176                      "arxiv": "http://arxiv.org/schemas/atom",  # DevSkim: ignore DS137138
177                  }

179                  # Find the entry
180                  entry = root.find("atom:entry", ns)
181                  if entry is not None:  # coverage: 181 ↛ 232 (condition always true)
182                      # Extract text content
183                      text_parts = []

185                      # Title
186                      title = entry.find("atom:title", ns)
187                      if title is not None and title.text:  # coverage: 187 ↛ 191 (condition always true)
188                          text_parts.append(f"Title: {title.text.strip()}")

190                      # Authors
191                      authors = entry.findall("atom:author", ns)
192                      if authors:  # coverage: 192 ↛ 204 (condition always true)
193                          author_names = []
194                          for author in authors:
195                              name = author.find("atom:name", ns)
196                              if name is not None and name.text:  # coverage: 196 ↛ 194 (condition always true)
197                                  author_names.append(name.text.strip())
198                          if author_names:  # coverage: 198 ↛ 204 (condition always true)
199                              text_parts.append(
200                                  f"Authors: {', '.join(author_names)}"
201                              )

203                      # Abstract
204                      summary = entry.find("atom:summary", ns)
205                      if summary is not None and summary.text:  # coverage: 205 ↛ 211 (condition always true)
206                          text_parts.append(
207                              f"\nAbstract:\n{summary.text.strip()}"
208                          )

210                      # Categories
211                      categories = entry.findall("atom:category", ns)
212                      if categories:  # coverage: 212 ↛ 213 (condition never true)
213                          cat_terms = [
214                              cat.get("term")
215                              for cat in categories
216                              if cat.get("term")
217                          ]
218                          if cat_terms:
219                              text_parts.append(
220                                  f"\nCategories: {', '.join(cat_terms)}"
221                              )

223                      if text_parts:  # coverage: 223 ↛ 232 (condition always true)
224                          logger.info(
225                              f"Retrieved text content from arXiv API for {arxiv_id}"
226                          )
227                          return "\n".join(text_parts)

229          except Exception as e:
230              logger.debug(f"Failed to fetch from arXiv API: {e}")

232          return None
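
The metadata parsing above depends on the Atom namespaces that arXiv's export API uses. Below is a minimal, self-contained sketch of the same ElementTree lookups, run against an illustrative feed entry (not real API output), to show how the namespaced find/findall calls resolve:

import xml.etree.ElementTree as ET

# Illustrative Atom entry in the shape _fetch_from_arxiv_api expects; not real API output.
SAMPLE_FEED = """<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
  <entry>
    <title>An Example Paper Title</title>
    <author><name>A. Author</name></author>
    <author><name>B. Author</name></author>
    <summary>A short illustrative abstract.</summary>
    <category term="cs.LG"/>
  </entry>
</feed>"""

ns = {"atom": "http://www.w3.org/2005/Atom"}
entry = ET.fromstring(SAMPLE_FEED).find("atom:entry", ns)
title = entry.find("atom:title", ns).text.strip()
authors = [a.find("atom:name", ns).text.strip() for a in entry.findall("atom:author", ns)]
print(f"Title: {title}")
print(f"Authors: {', '.join(authors)}")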