Coverage for src / local_deep_research / research_library / downloaders / arxiv.py: 96%
107 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
1"""
2arXiv PDF and Text Downloader
3"""
5import re
6from typing import Dict, Optional
7from urllib.parse import urlparse
8from loguru import logger
10from .base import BaseDownloader, ContentType, DownloadResult, USER_AGENT
class ArxivDownloader(BaseDownloader):
    """Downloader for arXiv papers with PDF and abstract/text support."""

    def can_handle(self, url: str) -> bool:
        """Return True if *url* is hosted on arxiv.org or one of its subdomains."""
        try:
            hostname = urlparse(url).hostname
            return bool(
                hostname
                and (hostname == "arxiv.org" or hostname.endswith(".arxiv.org"))
            )
        except Exception:
            # Malformed URLs simply aren't handled by this downloader.
            return False

    def download(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> Optional[bytes]:
        """Download content from arXiv.

        Returns raw PDF bytes (or UTF-8-encoded text for ContentType.TEXT),
        or None on failure.
        """
        if content_type == ContentType.TEXT:
            return self._download_text(url)
        return self._download_pdf(url)

    def download_with_result(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> DownloadResult:
        """Download content and return a detailed result with a skip reason on failure."""
        arxiv_id = self._extract_arxiv_id(url)
        if not arxiv_id:
            return DownloadResult(
                skip_reason="Invalid arXiv URL - could not extract article ID"
            )

        if content_type == ContentType.TEXT:
            # The arXiv API only provides abstracts, not full text, so we
            # download the PDF and extract the text from it.
            logger.info(
                f"Downloading arXiv PDF for full text extraction: {arxiv_id}"
            )
            full_text = self._build_full_text(url, arxiv_id)
            if full_text is not None:
                return DownloadResult(
                    content=full_text.encode("utf-8", errors="ignore"),
                    is_success=True,
                )
            return DownloadResult(
                skip_reason=f"Could not retrieve full text for arXiv:{arxiv_id}"
            )

        # PDF path. Route through _download_pdf so the request carries the
        # same honest User-Agent headers as every other arXiv PDF download
        # (previously this path bypassed them via super()._download_pdf).
        pdf_content = self._download_pdf(url)
        if pdf_content:
            return DownloadResult(content=pdf_content, is_success=True)
        return DownloadResult(
            skip_reason=f"Failed to download PDF for arXiv:{arxiv_id} - server may be unavailable"
        )

    def _build_full_text(self, url: str, arxiv_id: str) -> Optional[str]:
        """Download the paper's PDF and extract its text, prepending API metadata.

        Shared by download_with_result() and _download_text(), which
        previously duplicated this logic. Returns None when either the PDF
        download or the text extraction fails.
        """
        pdf_content = self._download_pdf(url)
        if not pdf_content:
            return None
        extracted_text = self.extract_text_from_pdf(pdf_content)
        if not extracted_text:
            return None
        # Metadata (title/authors/abstract/categories) is best-effort; the
        # extracted text alone is still a useful result.
        metadata = self._fetch_from_arxiv_api(arxiv_id)
        if metadata:
            return f"{metadata}\n\n{'=' * 80}\nFULL PAPER TEXT\n{'=' * 80}\n\n{extracted_text}"
        return extracted_text

    def _download_text(self, url: str) -> Optional[bytes]:
        """Get full text content from arXiv PDF (with metadata from API)."""
        arxiv_id = self._extract_arxiv_id(url)
        if not arxiv_id:
            return None

        logger.info(f"Downloading arXiv PDF for full text: {arxiv_id}")
        full_text = self._build_full_text(url, arxiv_id)
        if full_text is not None:
            return full_text.encode("utf-8", errors="ignore")
        return None

    def _download_pdf(
        self, url: str, headers: Optional[Dict[str, str]] = None
    ) -> Optional[bytes]:
        """Download the PDF for the arXiv article referenced by *url*.

        The *headers* parameter is accepted for interface compatibility with
        the base class but is superseded by the arXiv-specific headers below.
        """
        arxiv_id = self._extract_arxiv_id(url)
        if not arxiv_id:
            logger.error(f"Could not extract arXiv ID from {url}")
            return None

        pdf_url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
        logger.info(f"Downloading arXiv PDF: {arxiv_id}")

        # Use honest user agent - arXiv supports academic tools with proper identification
        enhanced_headers = {
            "User-Agent": USER_AGENT,
            "Accept": "application/pdf,application/octet-stream,*/*",
            "Accept-Language": "en-US,en;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "Connection": "keep-alive",
        }
        return super()._download_pdf(pdf_url, headers=enhanced_headers)

    def _extract_arxiv_id(self, url: str) -> Optional[str]:
        """Extract the arXiv identifier from an abs/ or pdf/ URL.

        Handles new-style IDs (2301.12345, optionally versioned) and
        old-style IDs (cond-mat/0501234). Version suffixes are not part of
        the captured group, so a versioned URL yields the bare ID.
        """
        patterns = [
            r"arxiv\.org/(?:abs|pdf)/(\d+\.\d+)",  # new format: 2301.12345[vN]
            r"arxiv\.org/(?:abs|pdf)/([a-z-]+/\d+)",  # old format: cond-mat/0501234
        ]
        for pattern in patterns:
            match = re.search(pattern, url)
            if match:
                return match.group(1)
        return None

    def _fetch_from_arxiv_api(self, arxiv_id: str) -> Optional[str]:
        """Fetch title, authors, abstract and categories from the arXiv API.

        Returns a formatted plain-text metadata block, or None when the API
        is unreachable or returns no entry. Failures are logged at debug
        level only - callers treat metadata as best-effort.
        """
        try:
            # Query the export API with the ID unchanged. Old-style IDs
            # (e.g. cond-mat/0501234) must keep their slash - stripping it
            # (as a previous version did) makes the lookup fail silently.
            api_url = f"https://export.arxiv.org/api/query?id_list={arxiv_id}"
            response = self.session.get(api_url, timeout=10)

            if response.status_code == 200:
                # Parse the Atom feed response.
                # Use defusedxml to prevent XXE attacks.
                from defusedxml import ElementTree as ET

                root = ET.fromstring(response.text)

                # Namespace URIs are identifiers, not URLs to fetch.
                ns = {
                    "atom": "http://www.w3.org/2005/Atom",  # DevSkim: ignore DS137138
                    "arxiv": "http://arxiv.org/schemas/atom",  # DevSkim: ignore DS137138
                }

                entry = root.find("atom:entry", ns)
                if entry is not None:
                    text_parts = []

                    # Title
                    title = entry.find("atom:title", ns)
                    if title is not None and title.text:
                        text_parts.append(f"Title: {title.text.strip()}")

                    # Authors
                    authors = entry.findall("atom:author", ns)
                    if authors:
                        author_names = []
                        for author in authors:
                            name = author.find("atom:name", ns)
                            if name is not None and name.text:
                                author_names.append(name.text.strip())
                        if author_names:
                            text_parts.append(
                                f"Authors: {', '.join(author_names)}"
                            )

                    # Abstract
                    summary = entry.find("atom:summary", ns)
                    if summary is not None and summary.text:
                        text_parts.append(
                            f"\nAbstract:\n{summary.text.strip()}"
                        )

                    # Categories
                    categories = entry.findall("atom:category", ns)
                    if categories:
                        cat_terms = [
                            cat.get("term")
                            for cat in categories
                            if cat.get("term")
                        ]
                        if cat_terms:
                            text_parts.append(
                                f"\nCategories: {', '.join(cat_terms)}"
                            )

                    if text_parts:
                        logger.info(
                            f"Retrieved text content from arXiv API for {arxiv_id}"
                        )
                        return "\n".join(text_parts)

        except Exception as e:
            logger.debug(f"Failed to fetch from arXiv API: {e}")

        return None