Coverage for src / local_deep_research / research_library / downloaders / generic.py: 94%
86 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
1"""
2Generic PDF Downloader for unspecified sources
3"""
5from typing import Dict, Optional
6import requests
7from urllib.parse import urlparse
8from loguru import logger
10from .base import BaseDownloader, ContentType, DownloadResult
class GenericDownloader(BaseDownloader):
    """Generic downloader for any URL - attempts basic PDF download.

    Fallback in the downloader chain: ``can_handle`` accepts every URL,
    so this class is used when no source-specific downloader matches.
    """

    def can_handle(self, url: str) -> bool:
        """Generic downloader can handle any URL as a fallback."""
        return True

    def download(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> Optional[bytes]:
        """Attempt to download content from any URL.

        Args:
            url: The URL to download from.
            content_type: ``ContentType.PDF`` returns raw PDF bytes;
                ``ContentType.TEXT`` downloads a PDF and returns its
                extracted text encoded as UTF-8.

        Returns:
            The requested content as bytes, or ``None`` on failure.
        """
        if content_type == ContentType.TEXT:
            # For generic sources, we can only extract text from PDF.
            pdf_content = self._download_pdf(url)
            if pdf_content:
                text = self.extract_text_from_pdf(pdf_content)
                if text:
                    return text.encode("utf-8")
            return None
        # Try to download as PDF
        return self._download_pdf(url)

    def download_with_result(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> DownloadResult:
        """Download content and return detailed result with skip reason.

        Unlike :meth:`download`, a failure yields a ``DownloadResult``
        whose ``skip_reason`` explains WHY the download failed (404,
        paywall, timeout, ...), determined by a lightweight diagnostic
        request.

        Args:
            url: The URL to download from.
            content_type: Desired content type (PDF bytes or extracted
                text).

        Returns:
            ``DownloadResult`` with ``content`` set and
            ``is_success=True`` on success, otherwise a human-readable
            ``skip_reason`` (and ``status_code`` when a response was
            received).
        """
        if content_type == ContentType.TEXT:
            # For generic sources, we can only extract text from PDF.
            pdf_content = self._download_pdf(url)
            if pdf_content:
                text = self.extract_text_from_pdf(pdf_content)
                if text:
                    return DownloadResult(
                        content=text.encode("utf-8"), is_success=True
                    )
                return DownloadResult(
                    skip_reason="PDF downloaded but text extraction failed"
                )
            return DownloadResult(skip_reason="Could not download PDF from URL")

        # PDF path: reuse the shared retry logic (direct attempt, then a
        # ".pdf"-suffixed retry) instead of duplicating it inline here.
        pdf_content = self._download_pdf(url)
        if pdf_content:
            return DownloadResult(content=pdf_content, is_success=True)

        # Both attempts failed - find out why so callers get a useful
        # skip_reason instead of a bare None.
        return self._diagnose_failure(url)

    def _diagnose_failure(self, url: str) -> DownloadResult:
        """Issue a diagnostic request to determine WHY a download failed.

        IMPORTANT: stream=True is intentional here. DO NOT remove it.
        This method only inspects response.status_code and headers
        to determine why a download failed (404, 403, paywall, etc.).
        Without stream=True, the full response body would be downloaded
        into memory. Since GenericDownloader.can_handle() returns True
        for ALL URLs, this could mean downloading multi-GB files just
        to check a status code.

        The context manager (with ... as response) ensures the streamed
        connection is properly closed on all code paths, preventing
        file descriptor leaks (each unclosed stream=True response
        holds an open socket FD).

        Args:
            url: The URL whose download just failed.

        Returns:
            ``DownloadResult`` carrying a human-readable ``skip_reason``
            and, when a response was received, its HTTP ``status_code``.
        """
        try:
            with self.session.get(
                url, timeout=5, allow_redirects=True, stream=True
            ) as response:
                # Check status code
                if response.status_code == 200:
                    # 200 but download failed: usually an HTML landing
                    # page (login/paywall) instead of the PDF itself.
                    response_content_type = response.headers.get(
                        "content-type", ""
                    ).lower()
                    if "text/html" in response_content_type:
                        return DownloadResult(
                            skip_reason="Article page requires login or subscription - no direct PDF link available",
                            status_code=response.status_code,
                        )
                    return DownloadResult(
                        skip_reason=f"Unexpected content type: {response_content_type} - expected PDF",
                        status_code=response.status_code,
                    )
                if response.status_code == 404:
                    return DownloadResult(
                        skip_reason="Article not found (404) - may have been removed or URL is incorrect",
                        status_code=404,
                    )
                if response.status_code == 403:
                    return DownloadResult(
                        skip_reason="Access denied (403) - article requires subscription or special permissions",
                        status_code=403,
                    )
                if response.status_code == 401:
                    return DownloadResult(
                        skip_reason="Authentication required - please login to access this article",
                        status_code=401,
                    )
                if response.status_code >= 500:
                    return DownloadResult(
                        skip_reason=f"Server error ({response.status_code}) - website is experiencing technical issues",
                        status_code=response.status_code,
                    )
                return DownloadResult(
                    skip_reason=f"Unable to access article - server returned error code {response.status_code}",
                    status_code=response.status_code,
                )
        except requests.exceptions.Timeout:
            return DownloadResult(
                skip_reason="Connection timed out - server took too long to respond"
            )
        except requests.exceptions.ConnectionError:
            return DownloadResult(
                skip_reason="Could not connect to server - website may be down"
            )
        except requests.RequestException:
            # Catch-all for other requests errors (Timeout/ConnectionError
            # are subclasses and must be listed first, as above).
            logger.warning("Unexpected error checking URL: {}", url)
            return DownloadResult(
                skip_reason="Network error - could not reach the website"
            )

    def _download_pdf(
        self, url: str, headers: Optional[Dict[str, str]] = None
    ) -> Optional[bytes]:
        """Attempt to download PDF from URL.

        Tries the URL as-is first; if that fails and the URL path does
        not already end in ".pdf", retries once with ".pdf" appended.

        Args:
            url: The URL to download from.
            headers: NOTE(review): accepted but currently unused - it is
                never forwarded to the base downloader. TODO confirm
                whether the base ``_download_pdf`` should receive it.

        Returns:
            The PDF bytes, or ``None`` if all attempts failed.
        """
        logger.info(f"Attempting generic download from {url}")

        # Try direct download
        pdf_content = super()._download_pdf(url)

        if pdf_content:
            logger.info(f"Successfully downloaded PDF from {url}")
            return pdf_content

        # If the URL doesn't end with .pdf, try adding it
        try:
            parsed = urlparse(url)
            if not parsed.path.endswith(".pdf"):
                pdf_url = url.rstrip("/") + ".pdf"
                logger.debug(f"Trying with .pdf extension: {pdf_url}")
                pdf_content = super()._download_pdf(pdf_url)
            else:
                pdf_content = None
        except (ValueError, AttributeError):
            # urlparse can raise ValueError for malformed URLs
            pdf_content = None

        if pdf_content:
            # pdf_url is always bound here: pdf_content can only be
            # truthy on the suffixed retry path, which set it.
            logger.info(f"Successfully downloaded PDF from {pdf_url}")
            return pdf_content

        logger.warning(f"Failed to download PDF from {url}")
        return None