Coverage for src / local_deep_research / research_library / downloaders / generic.py: 80%

86 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-02-25 01:07 +0000

1""" 

2Generic PDF Downloader for unspecified sources 

3""" 

4 

5from typing import Optional 

6import requests 

7from urllib.parse import urlparse 

8from loguru import logger 

9 

10from .base import BaseDownloader, ContentType, DownloadResult 

11 

12 

class GenericDownloader(BaseDownloader):
    """Generic downloader for any URL - attempts basic PDF download.

    Used as the fallback when no source-specific downloader claims a URL.
    It tries a direct PDF fetch, then retries with a ``.pdf`` suffix, and
    (for the detailed API) probes the URL to produce a human-readable
    skip reason when the download fails.
    """

    def can_handle(self, url: str) -> bool:
        """Generic downloader can handle any URL as a fallback."""
        return True

    def download(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> Optional[bytes]:
        """Attempt to download content from any URL.

        Args:
            url: The URL to fetch.
            content_type: PDF returns the raw PDF bytes; TEXT downloads
                the PDF and returns its extracted text as UTF-8 bytes.

        Returns:
            The requested content, or None when the download (or, for
            TEXT, the text extraction) fails.
        """
        pdf_content = self._download_pdf(url)
        if content_type != ContentType.TEXT:
            return pdf_content
        # For generic sources, text can only be extracted from a PDF.
        if pdf_content:
            text = self.extract_text_from_pdf(pdf_content)
            if text:
                return text.encode("utf-8")
        return None

    def download_with_result(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> DownloadResult:
        """Download content and return detailed result with skip reason.

        Args:
            url: The URL to fetch.
            content_type: PDF or TEXT (see :meth:`download`).

        Returns:
            A DownloadResult with content on success, otherwise a
            human-readable ``skip_reason`` describing why it failed.
        """
        if content_type == ContentType.TEXT:
            # For generic sources, we can only extract text from PDF.
            pdf_content = self._download_pdf(url)
            if pdf_content:
                text = self.extract_text_from_pdf(pdf_content)
                if text:
                    return DownloadResult(
                        content=text.encode("utf-8"), is_success=True
                    )
                return DownloadResult(
                    skip_reason="PDF downloaded but text extraction failed"
                )
            return DownloadResult(skip_reason="Could not download PDF from URL")

        # PDF path: reuse the shared helper instead of duplicating the
        # direct-download + ".pdf suffix" retry logic inline.
        pdf_content = self._download_pdf(url)
        if pdf_content:
            return DownloadResult(content=pdf_content, is_success=True)

        # Download failed outright - probe the URL for a specific reason.
        return self._diagnose_failure(url)

    def _diagnose_failure(self, url: str) -> DownloadResult:
        """Probe a URL that failed to download and classify the failure.

        Issues a lightweight streamed GET and maps the HTTP status /
        content type / network error onto a user-facing skip reason.
        """
        try:
            # stream=True avoids pulling the body; the context manager
            # guarantees the streamed connection is released.
            with self.session.get(
                url, timeout=5, allow_redirects=True, stream=True
            ) as response:
                if response.status_code == 200:
                    # 200 but no PDF: check whether we got an HTML page
                    # (typically a login/paywall page) instead.
                    # Renamed from `content_type` to avoid shadowing the
                    # ContentType parameter used elsewhere in this class.
                    header_type = response.headers.get(
                        "content-type", ""
                    ).lower()
                    if "text/html" in header_type:
                        return DownloadResult(
                            skip_reason="Article page requires login or subscription - no direct PDF link available"
                        )
                    return DownloadResult(
                        skip_reason=f"Unexpected content type: {header_type} - expected PDF"
                    )
                if response.status_code == 404:
                    return DownloadResult(
                        skip_reason="Article not found (404) - may have been removed or URL is incorrect"
                    )
                if response.status_code == 403:
                    return DownloadResult(
                        skip_reason="Access denied (403) - article requires subscription or special permissions"
                    )
                if response.status_code == 401:
                    return DownloadResult(
                        skip_reason="Authentication required - please login to access this article"
                    )
                if response.status_code >= 500:
                    return DownloadResult(
                        skip_reason=f"Server error ({response.status_code}) - website is experiencing technical issues"
                    )
                return DownloadResult(
                    skip_reason=f"Unable to access article - server returned error code {response.status_code}"
                )
        except requests.exceptions.Timeout:
            return DownloadResult(
                skip_reason="Connection timed out - server took too long to respond"
            )
        except requests.exceptions.ConnectionError:
            return DownloadResult(
                skip_reason="Could not connect to server - website may be down"
            )
        except requests.RequestException:
            # loguru uses brace-style formatting, not printf-style; the
            # original "%s" placeholder dropped the URL from the log line.
            logger.warning(f"Unexpected error checking URL: {url}")
            return DownloadResult(
                skip_reason="Network error - could not reach the website"
            )

    def _download_pdf(self, url: str) -> Optional[bytes]:
        """Attempt to download PDF from URL.

        Tries the URL as-is first; if that fails and the path does not
        already end in ".pdf", retries with a ".pdf" suffix appended.

        Returns:
            The PDF bytes, or None if both attempts fail.
        """
        logger.info(f"Attempting generic download from {url}")

        # Try direct download first.
        pdf_content = super()._download_pdf(url)
        if pdf_content:
            logger.info(f"Successfully downloaded PDF from {url}")
            return pdf_content

        # If the URL doesn't end with .pdf, try adding it.
        try:
            parsed = urlparse(url)
            if not parsed.path.endswith(".pdf"):
                pdf_url = url.rstrip("/") + ".pdf"
                logger.debug(f"Trying with .pdf extension: {pdf_url}")
                pdf_content = super()._download_pdf(pdf_url)
            else:
                pdf_content = None
        except (ValueError, AttributeError):
            # urlparse can raise ValueError for malformed URLs.
            pdf_content = None

        if pdf_content:
            logger.info(f"Successfully downloaded PDF from {pdf_url}")
            return pdf_content

        logger.warning(f"Failed to download PDF from {url}")
        return None