Coverage for src / local_deep_research / research_library / downloaders / direct_pdf.py: 98%

59 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1""" 

2Direct PDF Link Downloader 

3""" 

4 

5from typing import Dict, Optional 

6from urllib.parse import urlparse 

7from loguru import logger 

8 

9from .base import BaseDownloader, ContentType, DownloadResult 

10 

11 

class DirectPDFDownloader(BaseDownloader):
    """Downloader for direct PDF links.

    Handles URLs that point straight at a PDF resource (``*.pdf`` paths,
    ``/pdf/`` path segments, or ``type=pdf`` / ``format=pdf`` query flags)
    and delegates the actual HTTP fetch to ``BaseDownloader._download_pdf``.
    """

    def can_handle(self, url: str) -> bool:
        """Return True if *url* looks like a direct link to a PDF.

        The check is purely syntactic — no network request is made. The
        URL is parsed with :func:`urllib.parse.urlparse` and its path and
        query components are inspected for common PDF markers.

        Args:
            url: The URL to classify.

        Returns:
            True if this downloader should attempt the URL, else False.
        """
        try:
            parsed = urlparse(url.lower())
            path = parsed.path or ""
            query = parsed.query or ""

            # Path ends with a .pdf extension. urlparse already splits the
            # query string off, so "…/doc.pdf?dl=1" matches here too.
            if path.endswith(".pdf"):
                return True

            # Fallback raw-string check for ".pdf?" in spans urlparse does
            # not classify as path (e.g. inside a fragment). Coverage shows
            # this is almost never reached — kept only for exotic URLs.
            if ".pdf?" in url.lower():
                return True

            # URLs that route through a /pdf/ path segment.
            if "/pdf/" in path:
                return True

            # Explicit PDF format requested via query parameters.
            if "type=pdf" in query or "format=pdf" in query:
                return True

            return False

        except Exception as e:
            # urlparse rarely raises, but be defensive: an unparseable URL
            # is simply not handled by this downloader. Include the
            # exception so the failure cause is not lost in the log.
            logger.warning(f"Error parsing URL {url}: {e}")
            return False

    def download(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> Optional[bytes]:
        """Download the PDF at *url*.

        Args:
            url: Direct link to the PDF.
            content_type: ``ContentType.TEXT`` to fetch the PDF and return
                its extracted text UTF-8 encoded; any other value returns
                the raw PDF bytes.

        Returns:
            The requested bytes, or ``None`` when the download (or, for
            TEXT, the text extraction) fails.
        """
        if content_type == ContentType.TEXT:
            # Fetch the PDF, then extract plain text from it.
            pdf_content = self._download_pdf(url)
            if pdf_content:
                text = self.extract_text_from_pdf(pdf_content)
                if text:
                    return text.encode("utf-8")
            return None
        return self._download_pdf(url)

    def download_with_result(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> DownloadResult:
        """Download content and return a detailed result with skip reason.

        Unlike :meth:`download`, failures are reported via a
        ``DownloadResult`` carrying a human-readable ``skip_reason`` and,
        when it could be determined with a follow-up HEAD request, the
        HTTP ``status_code``.

        Args:
            url: Direct link to the PDF.
            content_type: ``ContentType.TEXT`` for extracted text,
                otherwise the raw PDF bytes.

        Returns:
            A ``DownloadResult`` with either ``content``/``is_success``
            set, or a ``skip_reason`` explaining the failure.
        """
        if content_type == ContentType.TEXT:
            # Fetch the PDF and extract text from it.
            pdf_content = self._download_pdf(url)
            if pdf_content:
                text = self.extract_text_from_pdf(pdf_content)
                if text:
                    return DownloadResult(
                        content=text.encode("utf-8"), is_success=True
                    )
                return DownloadResult(
                    skip_reason="PDF downloaded but text extraction failed"
                )
            return DownloadResult(
                skip_reason="Could not download PDF from direct link"
            )

        # Try to download PDF directly.
        logger.info(f"Attempting direct PDF download from {url}")
        pdf_content = super()._download_pdf(url)

        if pdf_content:
            logger.info(f"Successfully downloaded PDF directly from {url}")
            return DownloadResult(content=pdf_content, is_success=True)

        # The download failed: probe with a cheap HEAD request to report a
        # more specific reason (broken link, auth wall, server error).
        try:
            response = self.session.head(url, timeout=5, allow_redirects=True)
            if response.status_code == 404:
                return DownloadResult(
                    skip_reason="PDF file not found (404) - link may be broken",
                    status_code=404,
                )
            if response.status_code == 403:
                return DownloadResult(
                    skip_reason="Access denied (403) - PDF requires authentication",
                    status_code=403,
                )
            if response.status_code >= 500:
                return DownloadResult(
                    skip_reason=f"Server error ({response.status_code}) - try again later",
                    status_code=response.status_code,
                )
            return DownloadResult(
                skip_reason=f"Could not download PDF - server returned status {response.status_code}",
                status_code=response.status_code,
            )
        except Exception:
            # Even the HEAD probe failed (network error, timeout, …):
            # fall back to a generic reason rather than raising.
            return DownloadResult(
                skip_reason="Failed to download PDF from direct link"
            )

    def _download_pdf(
        self, url: str, headers: Optional[Dict[str, str]] = None
    ) -> Optional[bytes]:
        """Download PDF directly from URL, logging the attempt.

        NOTE(review): ``headers`` is accepted but currently ignored —
        the call delegates to the base class with only ``url``. Confirm
        whether ``BaseDownloader._download_pdf`` can accept custom
        headers before forwarding them.

        Args:
            url: Direct link to the PDF.
            headers: Optional extra HTTP headers (currently unused).

        Returns:
            The PDF bytes, or ``None`` on failure.
        """
        logger.info(f"Downloading PDF directly from: {url}")
        return super()._download_pdf(url)