Coverage for src / local_deep_research / research_library / downloaders / direct_pdf.py: 49%
59 statements
« prev ^ index » next coverage.py v7.12.0, created at 2026-01-11 00:51 +0000
« prev ^ index » next coverage.py v7.12.0, created at 2026-01-11 00:51 +0000
1"""
2Direct PDF Link Downloader
3"""
5from typing import Optional
6from urllib.parse import urlparse
7from loguru import logger
9from .base import BaseDownloader, ContentType, DownloadResult
class DirectPDFDownloader(BaseDownloader):
    """Downloader for URLs that point directly at a PDF file.

    The actual HTTP fetch (``_download_pdf``), the text extraction
    (``extract_text_from_pdf``) and the HTTP ``session`` are not defined in
    this class; they are presumably provided by ``BaseDownloader``.
    """

    def can_handle(self, url: str) -> bool:
        """Return True when *url* looks like a direct PDF link.

        The URL is lower-cased and parsed, then matched against four
        heuristics: a ``.pdf`` path suffix, a ``.pdf?`` substring anywhere in
        the URL, a ``/pdf/`` path segment, or a ``type=pdf`` / ``format=pdf``
        fragment in the query string.

        Any error raised while inspecting the URL is logged as a warning and
        reported as "cannot handle" rather than propagated.
        """
        try:
            lowered = url.lower()
            parsed = urlparse(lowered)
            path = parsed.path or ""
            query = parsed.query or ""

            # Path ends with the .pdf extension.
            if path.endswith(".pdf"):
                return True

            # ".pdf?" anywhere in the raw URL (not only in the parsed path).
            if ".pdf?" in lowered:
                return True

            # A /pdf/ segment somewhere in the path.
            if "/pdf/" in path:
                return True

            # Query string asks for PDF output.
            if "type=pdf" in query or "format=pdf" in query:
                return True

            return False

        except Exception as e:
            # Deliberately broad: this predicate must never raise.
            logger.warning(f"Error parsing URL {url}: {e}")
            return False

    def download(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> Optional[bytes]:
        """Download the PDF at *url*.

        For ``ContentType.TEXT`` the PDF is downloaded, its text extracted and
        returned UTF-8 encoded; ``None`` is returned when either the download
        or the extraction yields nothing. For any other content type the
        result of ``_download_pdf`` is returned as-is.
        """
        if content_type != ContentType.TEXT:
            return self._download_pdf(url)

        pdf_content = self._download_pdf(url)
        if not pdf_content:
            return None

        text = self.extract_text_from_pdf(pdf_content)
        return text.encode("utf-8") if text else None

    def download_with_result(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> DownloadResult:
        """Download content and return a result carrying a skip reason on failure.

        For ``ContentType.TEXT`` the PDF is downloaded and converted to UTF-8
        encoded text. For any other content type the PDF bytes are returned;
        if that download fails, a HEAD request is issued to turn the HTTP
        status into a more specific skip reason.
        """
        if content_type == ContentType.TEXT:
            pdf_content = self._download_pdf(url)
            if not pdf_content:
                return DownloadResult(
                    skip_reason="Could not download PDF from direct link"
                )

            text = self.extract_text_from_pdf(pdf_content)
            if not text:
                return DownloadResult(
                    skip_reason="PDF downloaded but text extraction failed"
                )
            return DownloadResult(
                content=text.encode("utf-8"), is_success=True
            )

        # Calls the parent implementation directly, so the extra log line in
        # this class's _download_pdf override is not emitted on this path.
        logger.info(f"Attempting direct PDF download from {url}")
        pdf_content = super()._download_pdf(url)

        if pdf_content:
            logger.info(f"Successfully downloaded PDF directly from {url}")
            return DownloadResult(content=pdf_content, is_success=True)

        # Download failed: probe with HEAD to find a specific reason.
        try:
            response = self.session.head(
                url, timeout=5, allow_redirects=True
            )
            status = response.status_code
            if status == 404:
                return DownloadResult(
                    skip_reason="PDF file not found (404) - link may be broken"
                )
            if status == 403:
                return DownloadResult(
                    skip_reason="Access denied (403) - PDF requires authentication"
                )
            if status >= 500:
                return DownloadResult(
                    skip_reason=f"Server error ({status}) - try again later"
                )
            return DownloadResult(
                skip_reason=f"Could not download PDF - server returned status {status}"
            )
        except Exception as e:
            # The probe is best-effort; record why it failed instead of
            # discarding the exception silently, then fall back to the
            # generic reason.
            logger.debug(f"HEAD probe failed for {url}: {e}")
            return DownloadResult(
                skip_reason="Failed to download PDF from direct link"
            )

    def _download_pdf(self, url: str) -> Optional[bytes]:
        """Log the attempt, then delegate the download to the parent class."""
        logger.info(f"Downloading PDF directly from: {url}")
        return super()._download_pdf(url)