# src/local_deep_research/research_library/downloaders/generic.py
1"""
2Generic PDF Downloader for unspecified sources
3"""
5from typing import Optional
6import requests
7from urllib.parse import urlparse
8from loguru import logger
10from .base import BaseDownloader, ContentType, DownloadResult


class GenericDownloader(BaseDownloader):
    """Generic downloader for any URL - attempts basic PDF download."""

    def can_handle(self, url: str) -> bool:
        """Generic downloader can handle any URL as a fallback."""
        return True
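
    # Hedged sketch of the assumed caller pattern (a registry is not defined
    # in this module): more specific downloaders are tried first, and because
    # can_handle() always returns True, GenericDownloader serves as the final
    # fallback, e.g.:
    #
    #     for downloader in (*specific_downloaders, generic_downloader):
    #         if downloader.can_handle(url):
    #             return downloader.download(url)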

    def download(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> Optional[bytes]:
        """Attempt to download content from any URL."""
        if content_type == ContentType.TEXT:
            # For generic sources, we can only extract text from PDF
            pdf_content = self._download_pdf(url)
            if pdf_content:
                text = self.extract_text_from_pdf(pdf_content)
                if text:
                    return text.encode("utf-8")
            return None
        else:
            # Try to download as PDF
            return self._download_pdf(url)
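
    # Illustrative call shapes (hypothetical URL; the semantics follow the
    # branches above):
    #
    #     pdf_bytes = downloader.download("https://example.org/paper")
    #     text_bytes = downloader.download(
    #         "https://example.org/paper", ContentType.TEXT
    #     )  # UTF-8 bytes of extracted text, or None on failure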

    def download_with_result(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> DownloadResult:
        """Download content and return detailed result with skip reason."""
        if content_type == ContentType.TEXT:
            # For generic sources, we can only extract text from PDF
            pdf_content = self._download_pdf(url)
            if pdf_content:
                text = self.extract_text_from_pdf(pdf_content)
                if text:
                    return DownloadResult(
                        content=text.encode("utf-8"), is_success=True
                    )
                else:
                    return DownloadResult(
                        skip_reason="PDF downloaded but text extraction failed"
                    )
            return DownloadResult(skip_reason="Could not download PDF from URL")
        else:
            # Try to download as PDF
            logger.info(f"Attempting generic download from {url}")

            # Try direct download
            pdf_content = super()._download_pdf(url)

            if pdf_content:
                logger.info(f"Successfully downloaded PDF from {url}")
                return DownloadResult(content=pdf_content, is_success=True)

            # If the URL doesn't end with .pdf, try adding it
            pdf_url = None
            try:
                parsed = urlparse(url)
                if not parsed.path.endswith(".pdf"):
                    pdf_url = url.rstrip("/") + ".pdf"
                    logger.debug(f"Trying with .pdf extension: {pdf_url}")
                    pdf_content = super()._download_pdf(pdf_url)
                else:
                    pdf_content = None
            except Exception:
                # Avoid a bare except, which would also swallow
                # KeyboardInterrupt and SystemExit
                pdf_content = None

            if pdf_content:
                logger.info(f"Successfully downloaded PDF from {pdf_url}")
                return DownloadResult(content=pdf_content, is_success=True)

            # Try to determine a more specific skip reason
            try:
                response = self.session.get(
                    url, timeout=5, allow_redirects=True, stream=True
                )

                # Check status code
                if response.status_code == 200:
                    # Check if it's HTML instead of PDF; use a distinct name
                    # to avoid shadowing the content_type parameter
                    header_content_type = response.headers.get(
                        "content-type", ""
                    ).lower()
                    if "text/html" in header_content_type:
                        return DownloadResult(
                            skip_reason="Article page requires login or subscription - no direct PDF link available"
                        )
                    else:
                        return DownloadResult(
                            skip_reason=f"Unexpected content type: {header_content_type} - expected PDF"
                        )
                elif response.status_code == 404:
                    return DownloadResult(
                        skip_reason="Article not found (404) - may have been removed or URL is incorrect"
                    )
                elif response.status_code == 403:
                    return DownloadResult(
                        skip_reason="Access denied (403) - article requires subscription or special permissions"
                    )
                elif response.status_code == 401:
                    return DownloadResult(
                        skip_reason="Authentication required - please login to access this article"
                    )
                elif response.status_code >= 500:
                    return DownloadResult(
                        skip_reason=f"Server error ({response.status_code}) - website is experiencing technical issues"
                    )
                else:
                    return DownloadResult(
                        skip_reason=f"Unable to access article - server returned error code {response.status_code}"
                    )
            except requests.exceptions.Timeout:
                return DownloadResult(
                    skip_reason="Connection timed out - server took too long to respond"
                )
            except requests.exceptions.ConnectionError:
                return DownloadResult(
                    skip_reason="Could not connect to server - website may be down"
                )
            except Exception:
                # Catch-all kept deliberately broad, but not bare
                return DownloadResult(
                    skip_reason="Network error - could not reach the website"
                )

    def _download_pdf(self, url: str) -> Optional[bytes]:
        """Attempt to download PDF from URL."""
        logger.info(f"Attempting generic download from {url}")

        # Try direct download
        pdf_content = super()._download_pdf(url)

        if pdf_content:
            logger.info(f"Successfully downloaded PDF from {url}")
            return pdf_content

        # If the URL doesn't end with .pdf, try adding it
        pdf_url = None
        try:
            parsed = urlparse(url)
            if not parsed.path.endswith(".pdf"):
                pdf_url = url.rstrip("/") + ".pdf"
                logger.debug(f"Trying with .pdf extension: {pdf_url}")
                pdf_content = super()._download_pdf(pdf_url)
            else:
                pdf_content = None
        except Exception:
            pdf_content = None

        if pdf_content:
            logger.info(f"Successfully downloaded PDF from {pdf_url}")
            return pdf_content

        logger.warning(f"Failed to download PDF from {url}")
        return None
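

# Minimal usage sketch (assumes BaseDownloader is constructible without
# arguments, which this module does not show); run as a module so the
# relative import resolves, e.g.:
#     python -m local_deep_research.research_library.downloaders.generic
# The URL below is a placeholder, not a real endpoint.
if __name__ == "__main__":
    downloader = GenericDownloader()
    result = downloader.download_with_result("https://example.org/article.pdf")
    if result.is_success:
        print(f"Downloaded {len(result.content)} bytes")
    else:
        print(f"Skipped: {result.skip_reason}")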