Coverage for src/local_deep_research/research_library/downloaders/generic.py: 80%

85 statements  

coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

1""" 

2Generic PDF Downloader for unspecified sources 

3""" 

4 

5from typing import Optional 

6import requests 

7from urllib.parse import urlparse 

8from loguru import logger 

9 

10from .base import BaseDownloader, ContentType, DownloadResult 

11 

12 

13class GenericDownloader(BaseDownloader): 

14 """Generic downloader for any URL - attempts basic PDF download.""" 

15 

16 def can_handle(self, url: str) -> bool: 

17 """Generic downloader can handle any URL as a fallback.""" 

18 return True 


    def download(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> Optional[bytes]:
        """Attempt to download content from any URL."""
        if content_type == ContentType.TEXT:
            # For generic sources, text is only available by extracting it
            # from a downloaded PDF.
            pdf_content = self._download_pdf(url)
            if pdf_content:
                text = self.extract_text_from_pdf(pdf_content)
                if text:
                    return text.encode("utf-8")
            return None
        else:
            # Try to download as PDF
            return self._download_pdf(url)

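    # Illustrative usage sketch for download() (assumes GenericDownloader()
    # takes no constructor arguments; the base class is not shown here):
    #
    #     downloader = GenericDownloader()
    #     raw = downloader.download("https://example.org/article", ContentType.TEXT)
    #     text = raw.decode("utf-8") if raw else None
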

    def download_with_result(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> DownloadResult:
        """Download content and return a detailed result with a skip reason."""
        if content_type == ContentType.TEXT:
            # For generic sources, text is only available by extracting it
            # from a downloaded PDF.
            pdf_content = self._download_pdf(url)
            if pdf_content:
                text = self.extract_text_from_pdf(pdf_content)
                if text:
                    return DownloadResult(
                        content=text.encode("utf-8"), is_success=True
                    )
                else:
                    return DownloadResult(
                        skip_reason="PDF downloaded but text extraction failed"
                    )
            return DownloadResult(skip_reason="Could not download PDF from URL")
        else:
            # Try to download as PDF
            logger.info(f"Attempting generic download from {url}")

            # Try direct download
            pdf_content = super()._download_pdf(url)

            if pdf_content:
                logger.info(f"Successfully downloaded PDF from {url}")
                return DownloadResult(content=pdf_content, is_success=True)

            # If the URL doesn't end with .pdf, try adding it
            try:
                parsed = urlparse(url)
                if not parsed.path.endswith(".pdf"):
                    pdf_url = url.rstrip("/") + ".pdf"
                    logger.debug(f"Trying with .pdf extension: {pdf_url}")
                    pdf_content = super()._download_pdf(pdf_url)
                else:
                    pdf_content = None
            except Exception:
                pdf_content = None

            if pdf_content:
                logger.info(f"Successfully downloaded PDF from {pdf_url}")
                return DownloadResult(content=pdf_content, is_success=True)

            # Probe the URL to report a more specific skip reason.
            try:
                response = self.session.get(
                    url, timeout=5, allow_redirects=True, stream=True
                )

                # Check status code
                if response.status_code == 200:
                    # Check whether the server returned HTML instead of a PDF.
                    response_content_type = response.headers.get(
                        "content-type", ""
                    ).lower()
                    if "text/html" in response_content_type:
                        return DownloadResult(
                            skip_reason="Article page requires login or subscription - no direct PDF link available"
                        )
                    else:
                        return DownloadResult(
                            skip_reason=f"Unexpected content type: {response_content_type} - expected PDF"
                        )
                elif response.status_code == 404:
                    return DownloadResult(
                        skip_reason="Article not found (404) - may have been removed or URL is incorrect"
                    )
                elif response.status_code == 403:
                    return DownloadResult(
                        skip_reason="Access denied (403) - article requires subscription or special permissions"
                    )
                elif response.status_code == 401:
                    return DownloadResult(
                        skip_reason="Authentication required - please login to access this article"
                    )
                elif response.status_code >= 500:
                    return DownloadResult(
                        skip_reason=f"Server error ({response.status_code}) - website is experiencing technical issues"
                    )
                else:
                    return DownloadResult(
                        skip_reason=f"Unable to access article - server returned error code {response.status_code}"
                    )
            except requests.exceptions.Timeout:
                return DownloadResult(
                    skip_reason="Connection timed out - server took too long to respond"
                )
            except requests.exceptions.ConnectionError:
                return DownloadResult(
                    skip_reason="Could not connect to server - website may be down"
                )
            except Exception:
                return DownloadResult(
                    skip_reason="Network error - could not reach the website"
                )

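    # Caller-side sketch for the result API above. Only the fields actually
    # used in this file (content, is_success, skip_reason) are assumed to
    # exist on DownloadResult:
    #
    #     result = downloader.download_with_result(url)
    #     if result.is_success:
    #         pdf_bytes = result.content
    #     else:
    #         logger.warning(f"Skipped {url}: {result.skip_reason}")
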

    def _download_pdf(self, url: str) -> Optional[bytes]:
        """Attempt to download a PDF from the URL."""
        logger.info(f"Attempting generic download from {url}")

        # Try direct download
        pdf_content = super()._download_pdf(url)

        if pdf_content:
            logger.info(f"Successfully downloaded PDF from {url}")
            return pdf_content

        # If the URL doesn't end with .pdf, try adding it
        try:
            parsed = urlparse(url)
            if not parsed.path.endswith(".pdf"):
                pdf_url = url.rstrip("/") + ".pdf"
                logger.debug(f"Trying with .pdf extension: {pdf_url}")
                pdf_content = super()._download_pdf(pdf_url)
            else:
                pdf_content = None
        except Exception:
            pdf_content = None

        if pdf_content:
            logger.info(f"Successfully downloaded PDF from {pdf_url}")
            return pdf_content

        logger.warning(f"Failed to download PDF from {url}")
        return None
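

# A minimal smoke-test sketch. Assumptions: GenericDownloader() can be built
# with no arguments, and BaseDownloader supplies `session`, `_download_pdf`,
# and `extract_text_from_pdf` (inferred from the calls above, not verified
# against base.py).
if __name__ == "__main__":
    downloader = GenericDownloader()
    result = downloader.download_with_result("https://example.org/sample.pdf")
    if result.is_success:
        print(f"Downloaded {len(result.content)} bytes")
    else:
        print(f"Skipped: {result.skip_reason}")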