Coverage for src / local_deep_research / research_library / downloaders / generic.py: 94%

86 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1""" 

2Generic PDF Downloader for unspecified sources 

3""" 

4 

5from typing import Dict, Optional 

6import requests 

7from urllib.parse import urlparse 

8from loguru import logger 

9 

10from .base import BaseDownloader, ContentType, DownloadResult 

11 

12 

class GenericDownloader(BaseDownloader):
    """Generic fallback downloader - attempts a basic PDF download from any URL.

    Because ``can_handle`` returns True unconditionally, this class is the
    last-resort downloader tried after all source-specific downloaders fail.
    """

    def can_handle(self, url: str) -> bool:
        """Generic downloader can handle any URL as a fallback."""
        return True

    def download(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> Optional[bytes]:
        """Attempt to download content from any URL.

        Args:
            url: The URL to download from.
            content_type: Desired content type. For TEXT, the only option
                for generic sources is to fetch the PDF and extract text.

        Returns:
            The raw PDF bytes (or UTF-8 encoded extracted text for TEXT
            requests), or None when the download/extraction fails.
        """
        if content_type == ContentType.TEXT:
            # For generic sources, we can only extract text from PDF
            pdf_content = self._download_pdf(url)
            if pdf_content:
                text = self.extract_text_from_pdf(pdf_content)
                if text:
                    return text.encode("utf-8")
            return None
        # Try to download as PDF
        return self._download_pdf(url)

    def download_with_result(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> DownloadResult:
        """Download content and return detailed result with skip reason.

        On failure of the PDF path, a lightweight diagnostic HTTP request is
        made to classify WHY the download failed (404, 403, paywall, ...) so
        the caller gets an actionable ``skip_reason``.

        Args:
            url: The URL to download from.
            content_type: Desired content type (PDF or TEXT).

        Returns:
            A DownloadResult carrying either the content (``is_success=True``)
            or a human-readable ``skip_reason`` and, when known, the HTTP
            ``status_code``.
        """
        if content_type == ContentType.TEXT:
            # For generic sources, we can only extract text from PDF
            pdf_content = self._download_pdf(url)
            if pdf_content:
                text = self.extract_text_from_pdf(pdf_content)
                if text:
                    return DownloadResult(
                        content=text.encode("utf-8"), is_success=True
                    )
                return DownloadResult(
                    skip_reason="PDF downloaded but text extraction failed"
                )
            return DownloadResult(skip_reason="Could not download PDF from URL")

        # PDF path: delegate to _download_pdf, which already performs the
        # direct attempt plus the ".pdf"-suffix retry (and the associated
        # logging), instead of duplicating that logic inline here.
        pdf_content = self._download_pdf(url)
        if pdf_content:
            return DownloadResult(content=pdf_content, is_success=True)

        # Diagnostic request: determine WHY the download failed.
        #
        # IMPORTANT: stream=True is intentional here. DO NOT remove it.
        # This block only inspects response.status_code and headers
        # to determine why a download failed (404, 403, paywall, etc.).
        # Without stream=True, the full response body would be downloaded
        # into memory. Since GenericDownloader.can_handle() returns True
        # for ALL URLs, this could mean downloading multi-GB files just
        # to check a status code.
        #
        # The context manager (with ... as response) ensures the streamed
        # connection is properly closed on all code paths, preventing
        # file descriptor leaks (each unclosed stream=True response
        # holds an open socket FD).
        try:
            with self.session.get(
                url, timeout=5, allow_redirects=True, stream=True
            ) as response:
                # Check status code
                if response.status_code == 200:
                    # Check if it's HTML instead of PDF
                    response_content_type = response.headers.get(
                        "content-type", ""
                    ).lower()
                    if "text/html" in response_content_type:
                        return DownloadResult(
                            skip_reason="Article page requires login or subscription - no direct PDF link available",
                            status_code=response.status_code,
                        )
                    return DownloadResult(
                        skip_reason=f"Unexpected content type: {response_content_type} - expected PDF",
                        status_code=response.status_code,
                    )
                if response.status_code == 404:
                    return DownloadResult(
                        skip_reason="Article not found (404) - may have been removed or URL is incorrect",
                        status_code=404,
                    )
                if response.status_code == 403:
                    return DownloadResult(
                        skip_reason="Access denied (403) - article requires subscription or special permissions",
                        status_code=403,
                    )
                if response.status_code == 401:
                    return DownloadResult(
                        skip_reason="Authentication required - please login to access this article",
                        status_code=401,
                    )
                if response.status_code >= 500:
                    return DownloadResult(
                        skip_reason=f"Server error ({response.status_code}) - website is experiencing technical issues",
                        status_code=response.status_code,
                    )
                return DownloadResult(
                    skip_reason=f"Unable to access article - server returned error code {response.status_code}",
                    status_code=response.status_code,
                )
        except requests.exceptions.Timeout:
            return DownloadResult(
                skip_reason="Connection timed out - server took too long to respond"
            )
        except requests.exceptions.ConnectionError:
            return DownloadResult(
                skip_reason="Could not connect to server - website may be down"
            )
        except requests.RequestException:
            # Catch-all for other requests-level failures (must come after
            # the more specific Timeout/ConnectionError handlers above).
            logger.warning("Unexpected error checking URL: {}", url)
            return DownloadResult(
                skip_reason="Network error - could not reach the website"
            )

    def _download_pdf(
        self, url: str, headers: Optional[Dict[str, str]] = None
    ) -> Optional[bytes]:
        """Attempt to download PDF from URL.

        First tries the URL as-is; if that fails and the URL path does not
        already end in ".pdf", retries with ".pdf" appended.

        Args:
            url: The URL to download from.
            headers: Optional extra HTTP headers (currently unused here;
                kept for signature compatibility with BaseDownloader).

        Returns:
            The PDF bytes, or None if both attempts fail.
        """
        logger.info(f"Attempting generic download from {url}")

        # Try direct download
        pdf_content = super()._download_pdf(url)

        if pdf_content:
            logger.info(f"Successfully downloaded PDF from {url}")
            return pdf_content

        # If the URL doesn't end with .pdf, try adding it.
        # NOTE(review): appending after rstrip("/") puts ".pdf" after any
        # query string (?a=b.pdf) — presumably rare for article URLs, but
        # worth confirming against real inputs.
        try:
            parsed = urlparse(url)
            if not parsed.path.endswith(".pdf"):
                pdf_url = url.rstrip("/") + ".pdf"
                logger.debug(f"Trying with .pdf extension: {pdf_url}")
                pdf_content = super()._download_pdf(pdf_url)
            else:
                pdf_content = None
        except (ValueError, AttributeError):
            # urlparse can raise ValueError for malformed URLs
            pdf_content = None

        if pdf_content:
            logger.info(f"Successfully downloaded PDF from {pdf_url}")
            return pdf_content

        logger.warning(f"Failed to download PDF from {url}")
        return None