Coverage for src / local_deep_research / research_library / downloaders / generic.py: 80%
86 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-25 01:07 +0000
"""
Generic PDF Downloader for unspecified sources
"""

from typing import Optional
import requests
from urllib.parse import urlparse
from loguru import logger

from .base import BaseDownloader, ContentType, DownloadResult
class GenericDownloader(BaseDownloader):
    """Generic downloader for any URL - attempts basic PDF download.

    Acts as the fallback downloader: ``can_handle`` accepts every URL.
    Downloads are attempted directly first, then retried with a ``.pdf``
    suffix appended to the path.
    """

    def can_handle(self, url: str) -> bool:
        """Generic downloader can handle any URL as a fallback."""
        return True

    def download(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> Optional[bytes]:
        """Attempt to download content from any URL.

        Args:
            url: Location of the article/document.
            content_type: TEXT extracts text from the downloaded PDF;
                anything else returns the raw PDF bytes.

        Returns:
            The requested content as bytes, or None on failure.
        """
        if content_type == ContentType.TEXT:
            # For generic sources, we can only extract text from PDF
            pdf_content = self._download_pdf(url)
            if pdf_content:
                text = self.extract_text_from_pdf(pdf_content)
                if text:
                    return text.encode("utf-8")
            return None
        # Try to download as PDF
        return self._download_pdf(url)

    def download_with_result(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> DownloadResult:
        """Download content and return detailed result with skip reason.

        When the PDF cannot be fetched, the URL is probed once more with
        a lightweight streamed GET so the HTTP status can be turned into
        a human-readable ``skip_reason``.
        """
        if content_type == ContentType.TEXT:
            # For generic sources, we can only extract text from PDF
            pdf_content = self._download_pdf(url)
            if pdf_content:
                text = self.extract_text_from_pdf(pdf_content)
                if text:
                    return DownloadResult(
                        content=text.encode("utf-8"), is_success=True
                    )
                return DownloadResult(
                    skip_reason="PDF downloaded but text extraction failed"
                )
            return DownloadResult(skip_reason="Could not download PDF from URL")

        # Try to download as PDF: direct first, then with a .pdf suffix
        logger.info(f"Attempting generic download from {url}")
        pdf_content, fetched_url = self._fetch_pdf(url)
        if pdf_content:
            logger.info(f"Successfully downloaded PDF from {fetched_url}")
            return DownloadResult(content=pdf_content, is_success=True)

        # Both attempts failed - probe the URL for a specific skip reason
        return self._diagnose_failure(url)

    def _fetch_pdf(self, url: str):
        """Try a direct PDF download, then retry with '.pdf' appended.

        Returns:
            Tuple ``(pdf_bytes_or_None, url_actually_fetched)``.
        """
        pdf_content = super()._download_pdf(url)
        if pdf_content:
            return pdf_content, url

        # If the URL doesn't end with .pdf, try adding it
        try:
            parsed = urlparse(url)
            if not parsed.path.endswith(".pdf"):
                pdf_url = url.rstrip("/") + ".pdf"
                logger.debug(f"Trying with .pdf extension: {pdf_url}")
                return super()._download_pdf(pdf_url), pdf_url
        except (ValueError, AttributeError):
            # urlparse can raise ValueError for malformed URLs
            pass
        return None, url

    def _diagnose_failure(self, url: str) -> DownloadResult:
        """Probe *url* and map the HTTP outcome to a skip reason."""
        try:
            # stream=True avoids pulling the body down; the context
            # manager closes the connection (previously leaked)
            with self.session.get(
                url, timeout=5, allow_redirects=True, stream=True
            ) as response:
                status = response.status_code
                if status == 200:
                    # 200 with no PDF is usually an HTML landing/login
                    # page; report the observed content type otherwise
                    header_type = response.headers.get(
                        "content-type", ""
                    ).lower()
                    if "text/html" in header_type:
                        return DownloadResult(
                            skip_reason="Article page requires login or subscription - no direct PDF link available"
                        )
                    return DownloadResult(
                        skip_reason=f"Unexpected content type: {header_type} - expected PDF"
                    )
                if status == 404:
                    return DownloadResult(
                        skip_reason="Article not found (404) - may have been removed or URL is incorrect"
                    )
                if status == 403:
                    return DownloadResult(
                        skip_reason="Access denied (403) - article requires subscription or special permissions"
                    )
                if status == 401:
                    return DownloadResult(
                        skip_reason="Authentication required - please login to access this article"
                    )
                if status >= 500:
                    return DownloadResult(
                        skip_reason=f"Server error ({status}) - website is experiencing technical issues"
                    )
                return DownloadResult(
                    skip_reason=f"Unable to access article - server returned error code {status}"
                )
        except requests.exceptions.Timeout:
            return DownloadResult(
                skip_reason="Connection timed out - server took too long to respond"
            )
        except requests.exceptions.ConnectionError:
            return DownloadResult(
                skip_reason="Could not connect to server - website may be down"
            )
        except requests.RequestException:
            # f-string here: loguru formats with {} braces, so the old
            # %-style lazy args were never interpolated into the message
            logger.warning(f"Unexpected error checking URL: {url}")
            return DownloadResult(
                skip_reason="Network error - could not reach the website"
            )

    def _download_pdf(self, url: str) -> Optional[bytes]:
        """Attempt to download PDF from URL (direct, then .pdf suffix)."""
        logger.info(f"Attempting generic download from {url}")
        pdf_content, fetched_url = self._fetch_pdf(url)
        if pdf_content:
            logger.info(f"Successfully downloaded PDF from {fetched_url}")
            return pdf_content
        logger.warning(f"Failed to download PDF from {url}")
        return None