Coverage for src / local_deep_research / research_library / downloaders / generic.py: 94%

86 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1""" 

2Generic PDF Downloader for unspecified sources 

3""" 

4 

5from typing import Dict, Optional 

6import requests 

7from urllib.parse import urlparse 

8from loguru import logger 

9 

10from .base import BaseDownloader, ContentType, DownloadResult 

11 

12 

class GenericDownloader(BaseDownloader):
    """Generic fallback downloader - attempts a basic PDF download from any URL.

    Because ``can_handle`` returns True unconditionally, this class is the
    last-resort downloader tried after all source-specific downloaders fail.
    """

    def can_handle(self, url: str) -> bool:
        """Generic downloader can handle any URL as a fallback."""
        return True

    def download(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> Optional[bytes]:
        """Attempt to download content from any URL.

        Args:
            url: The URL to download from.
            content_type: Desired content type. For TEXT, the only option
                for generic sources is to fetch the PDF and extract text.

        Returns:
            The raw PDF bytes (or UTF-8 encoded extracted text for TEXT
            requests), or None when the download/extraction fails.
        """
        if content_type == ContentType.TEXT:
            # For generic sources, we can only extract text from PDF
            pdf_content = self._download_pdf(url)
            if pdf_content:
                text = self.extract_text_from_pdf(pdf_content)
                if text:
                    return text.encode("utf-8")
            return None
        # Try to download as PDF
        return self._download_pdf(url)

    def download_with_result(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> DownloadResult:
        """Download content and return detailed result with skip reason.

        On failure of the PDF path, a lightweight diagnostic HTTP request is
        made to classify WHY the download failed (404, 403, paywall, ...) so
        the caller gets an actionable ``skip_reason``.

        Args:
            url: The URL to download from.
            content_type: Desired content type (PDF or TEXT).

        Returns:
            A DownloadResult carrying either the content (``is_success=True``)
            or a human-readable ``skip_reason`` and, when known, the HTTP
            ``status_code``.
        """
        if content_type == ContentType.TEXT:
            # For generic sources, we can only extract text from PDF
            pdf_content = self._download_pdf(url)
            if pdf_content:
                text = self.extract_text_from_pdf(pdf_content)
                if text:
                    return DownloadResult(
                        content=text.encode("utf-8"), is_success=True
                    )
                return DownloadResult(
                    skip_reason="PDF downloaded but text extraction failed"
                )
            return DownloadResult(skip_reason="Could not download PDF from URL")

        # PDF path: delegate to _download_pdf, which already performs the
        # direct attempt plus the ".pdf"-suffix retry (and the associated
        # logging), instead of duplicating that logic inline here.
        pdf_content = self._download_pdf(url)
        if pdf_content:
            return DownloadResult(content=pdf_content, is_success=True)

        # Diagnostic request: determine WHY the download failed.
        #
        # IMPORTANT: stream=True is intentional here. DO NOT remove it.
        # This block only inspects response.status_code and headers
        # to determine why a download failed (404, 403, paywall, etc.).
        # Without stream=True, the full response body would be downloaded
        # into memory. Since GenericDownloader.can_handle() returns True
        # for ALL URLs, this could mean downloading multi-GB files just
        # to check a status code.
        #
        # The context manager (with ... as response) ensures the streamed
        # connection is properly closed on all code paths, preventing
        # file descriptor leaks (each unclosed stream=True response
        # holds an open socket FD).
        try:
            with self.session.get(
                url, timeout=5, allow_redirects=True, stream=True
            ) as response:
                # Check status code
                if response.status_code == 200:
                    # Check if it's HTML instead of PDF
                    response_content_type = response.headers.get(
                        "content-type", ""
                    ).lower()
                    if "text/html" in response_content_type:
                        return DownloadResult(
                            skip_reason="Article page requires login or subscription - no direct PDF link available",
                            status_code=response.status_code,
                        )
                    return DownloadResult(
                        skip_reason=f"Unexpected content type: {response_content_type} - expected PDF",
                        status_code=response.status_code,
                    )
                if response.status_code == 404:
                    return DownloadResult(
                        skip_reason="Article not found (404) - may have been removed or URL is incorrect",
                        status_code=404,
                    )
                if response.status_code == 403:
                    return DownloadResult(
                        skip_reason="Access denied (403) - article requires subscription or special permissions",
                        status_code=403,
                    )
                if response.status_code == 401:
                    return DownloadResult(
                        skip_reason="Authentication required - please login to access this article",
                        status_code=401,
                    )
                if response.status_code >= 500:
                    return DownloadResult(
                        skip_reason=f"Server error ({response.status_code}) - website is experiencing technical issues",
                        status_code=response.status_code,
                    )
                return DownloadResult(
                    skip_reason=f"Unable to access article - server returned error code {response.status_code}",
                    status_code=response.status_code,
                )
        except requests.exceptions.Timeout:
            return DownloadResult(
                skip_reason="Connection timed out - server took too long to respond"
            )
        except requests.exceptions.ConnectionError:
            return DownloadResult(
                skip_reason="Could not connect to server - website may be down"
            )
        except requests.RequestException:
            # Catch-all for other requests-level failures (must come after
            # the more specific Timeout/ConnectionError handlers above).
            logger.warning("Unexpected error checking URL: {}", url)
            return DownloadResult(
                skip_reason="Network error - could not reach the website"
            )

    def _download_pdf(
        self, url: str, headers: Optional[Dict[str, str]] = None
    ) -> Optional[bytes]:
        """Attempt to download PDF from URL.

        First tries the URL as-is; if that fails and the URL path does not
        already end in ".pdf", retries with ".pdf" appended.

        Args:
            url: The URL to download from.
            headers: Optional extra HTTP headers (currently unused here;
                kept for signature compatibility with BaseDownloader).

        Returns:
            The PDF bytes, or None if both attempts fail.
        """
        logger.info(f"Attempting generic download from {url}")

        # Try direct download
        pdf_content = super()._download_pdf(url)

        if pdf_content:
            logger.info(f"Successfully downloaded PDF from {url}")
            return pdf_content

        # If the URL doesn't end with .pdf, try adding it.
        # NOTE(review): appending after rstrip("/") puts ".pdf" after any
        # query string (?a=b.pdf) — presumably rare for article URLs, but
        # worth confirming against real inputs.
        try:
            parsed = urlparse(url)
            if not parsed.path.endswith(".pdf"):
                pdf_url = url.rstrip("/") + ".pdf"
                logger.debug(f"Trying with .pdf extension: {pdf_url}")
                pdf_content = super()._download_pdf(pdf_url)
            else:
                pdf_content = None
        except (ValueError, AttributeError):
            # urlparse can raise ValueError for malformed URLs
            pdf_content = None

        if pdf_content:
            logger.info(f"Successfully downloaded PDF from {pdf_url}")
            return pdf_content

        logger.warning(f"Failed to download PDF from {url}")
        return None