Coverage for src / local_deep_research / research_library / downloaders / direct_pdf.py: 98%
59 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
1"""
2Direct PDF Link Downloader
3"""
from typing import Dict, Optional, Tuple
from urllib.parse import urlparse

from loguru import logger

from .base import BaseDownloader, ContentType, DownloadResult
class DirectPDFDownloader(BaseDownloader):
    """Downloader for URLs that point directly at a PDF file."""

    def can_handle(self, url: str) -> bool:
        """Return True if *url* looks like a direct PDF link.

        Matches a ``.pdf`` path extension, a ``/pdf/`` path segment, or a
        ``type=pdf`` / ``format=pdf`` query parameter.
        """
        try:
            # Lowercase once; the raw-string check below reuses it too.
            lowered = url.lower()
            parsed = urlparse(lowered)
            path = parsed.path or ""
            query = parsed.query or ""

            # Path ends in .pdf (urlparse already strips the query string,
            # so "file.pdf?x=1" is covered here as well).
            if path.endswith(".pdf"):
                return True

            # Defensive fallback for oddly formed URLs where the extension
            # does not survive into the parsed path (e.g. fragment quirks).
            if ".pdf?" in lowered:
                return True

            # A /pdf/ path segment (common on publisher sites).
            if "/pdf/" in path:
                return True

            # Query parameters that request PDF output.
            if "type=pdf" in query or "format=pdf" in query:
                return True

            return False
        except Exception:
            # urlparse rarely raises, but a malformed URL must never crash
            # downloader routing. logger.exception keeps the traceback,
            # which logger.warning silently dropped.
            logger.exception(f"Error parsing URL {url}")
            return False

    def download(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> Optional[bytes]:
        """Download the PDF at *url*.

        For ``ContentType.TEXT`` the PDF is downloaded and its extracted
        text returned as UTF-8 bytes; otherwise the raw PDF bytes are
        returned. Returns None on any failure.
        """
        if content_type == ContentType.TEXT:
            text_bytes, _skip_reason = self._download_text(url)
            return text_bytes
        return self._download_pdf(url)

    def download_with_result(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> DownloadResult:
        """Download content and return a detailed result with skip reason."""
        if content_type == ContentType.TEXT:
            text_bytes, skip_reason = self._download_text(url)
            if text_bytes is not None:
                return DownloadResult(content=text_bytes, is_success=True)
            return DownloadResult(skip_reason=skip_reason)

        # Try to download PDF directly. Uses self._download_pdf (not the
        # super() call the original had) so this path logs consistently
        # with download() and the TEXT branch.
        logger.info(f"Attempting direct PDF download from {url}")
        pdf_content = self._download_pdf(url)

        if pdf_content:
            logger.info(f"Successfully downloaded PDF directly from {url}")
            return DownloadResult(content=pdf_content, is_success=True)

        return self._diagnose_failure(url)

    def _download_text(self, url: str) -> Tuple[Optional[bytes], Optional[str]]:
        """Download a PDF and extract its text.

        Returns ``(utf8_bytes, None)`` on success, or ``(None, skip_reason)``
        describing which step failed. Shared by download() and
        download_with_result() so the TEXT logic lives in one place.
        """
        pdf_content = self._download_pdf(url)
        if not pdf_content:
            return None, "Could not download PDF from direct link"
        text = self.extract_text_from_pdf(pdf_content)
        if not text:
            return None, "PDF downloaded but text extraction failed"
        return text.encode("utf-8"), None

    def _diagnose_failure(self, url: str) -> DownloadResult:
        """Probe *url* with a HEAD request to explain a failed download."""
        try:
            response = self.session.head(url, timeout=5, allow_redirects=True)
            status = response.status_code
            if status == 404:
                return DownloadResult(
                    skip_reason="PDF file not found (404) - link may be broken",
                    status_code=404,
                )
            if status == 403:
                return DownloadResult(
                    skip_reason="Access denied (403) - PDF requires authentication",
                    status_code=403,
                )
            if status >= 500:
                return DownloadResult(
                    skip_reason=f"Server error ({status}) - try again later",
                    status_code=status,
                )
            return DownloadResult(
                skip_reason=f"Could not download PDF - server returned status {status}",
                status_code=status,
            )
        except Exception:
            # The probe itself failed (network error, timeout, ...); fall
            # back to a generic reason rather than raising.
            return DownloadResult(
                skip_reason="Failed to download PDF from direct link"
            )

    def _download_pdf(
        self, url: str, headers: Optional[Dict[str, str]] = None
    ) -> Optional[bytes]:
        """Download PDF directly from URL, logging the attempt.

        NOTE(review): *headers* is accepted for interface compatibility but
        not forwarded to the base implementation — presumably intentional;
        confirm against BaseDownloader._download_pdf.
        """
        logger.info(f"Downloading PDF directly from: {url}")
        return super()._download_pdf(url)