Coverage for src / local_deep_research / research_library / downloaders / direct_pdf.py: 49%

59 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

1""" 

2Direct PDF Link Downloader 

3""" 

4 

5from typing import Optional 

6from urllib.parse import urlparse 

7from loguru import logger 

8 

9from .base import BaseDownloader, ContentType, DownloadResult 

10 

11 

class DirectPDFDownloader(BaseDownloader):
    """Downloader for URLs that point straight at a PDF document.

    ``can_handle`` recognises such URLs heuristically; ``download`` and
    ``download_with_result`` fetch the document and, when text is requested,
    convert the downloaded PDF to UTF-8 encoded text.
    """

    def can_handle(self, url: str) -> bool:
        """Return True when ``url`` looks like a direct link to a PDF.

        The URL is lowercased and parsed, then accepted when any of these
        holds:

        * the path ends with ``.pdf``;
        * the URL contains ``.pdf?`` anywhere;
        * the path contains a ``/pdf/`` segment;
        * the query string contains ``type=pdf`` or ``format=pdf``.

        Any exception raised while inspecting the URL is logged as a warning
        and treated as "cannot handle".
        """
        try:
            # Lowercase once so every check below is case-insensitive.
            lowered = url.lower()
            parsed = urlparse(lowered)
            path = parsed.path or ""
            query = parsed.query or ""

            return (
                path.endswith(".pdf")
                or ".pdf?" in lowered
                or "/pdf/" in path
                or "type=pdf" in query
                or "format=pdf" in query
            )
        except Exception as e:
            logger.warning(f"Error parsing URL {url}: {e}")
            return False

    def download(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> Optional[bytes]:
        """Download the PDF at ``url``.

        Args:
            url: Location of the PDF.
            content_type: ``ContentType.TEXT`` to get the extracted text
                encoded as UTF-8; any other value returns the raw download.

        Returns:
            The downloaded bytes (or UTF-8 encoded extracted text), or
            ``None`` when the download or the text extraction yields nothing.
        """
        pdf_content = self._download_pdf(url)
        if not content_type == ContentType.TEXT:
            return pdf_content

        if not pdf_content:
            return None
        text = self.extract_text_from_pdf(pdf_content)
        return text.encode("utf-8") if text else None

    def download_with_result(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> DownloadResult:
        """Download content and report why it was skipped on failure.

        Args:
            url: Location of the PDF.
            content_type: ``ContentType.TEXT`` to return extracted text;
                any other value returns the raw PDF bytes.

        Returns:
            A ``DownloadResult`` carrying the content with ``is_success=True``
            on success, or a ``skip_reason`` describing the failure.
        """
        if content_type == ContentType.TEXT:
            pdf_content = self._download_pdf(url)
            if not pdf_content:
                return DownloadResult(
                    skip_reason="Could not download PDF from direct link"
                )
            text = self.extract_text_from_pdf(pdf_content)
            if not text:
                return DownloadResult(
                    skip_reason="PDF downloaded but text extraction failed"
                )
            return DownloadResult(
                content=text.encode("utf-8"), is_success=True
            )

        logger.info(f"Attempting direct PDF download from {url}")
        # Call the parent implementation directly: this branch already logs
        # the attempt, and the override in this class would log it again.
        pdf_content = super()._download_pdf(url)
        if pdf_content:
            logger.info(f"Successfully downloaded PDF directly from {url}")
            return DownloadResult(content=pdf_content, is_success=True)

        return self._explain_failure(url)

    def _explain_failure(self, url: str) -> DownloadResult:
        """Probe ``url`` with a HEAD request to build a specific skip reason.

        The probe is best-effort: if it fails for any reason, the failure is
        logged at debug level and a generic skip reason is returned.
        """
        try:
            # NOTE(review): self.session is presumably a requests-style
            # session provided by BaseDownloader - confirm.
            response = self.session.head(
                url, timeout=5, allow_redirects=True
            )
            status = response.status_code
            if status == 404:
                reason = "PDF file not found (404) - link may be broken"
            elif status == 403:
                reason = "Access denied (403) - PDF requires authentication"
            elif status >= 500:
                reason = f"Server error ({status}) - try again later"
            else:
                reason = (
                    f"Could not download PDF - server returned status {status}"
                )
        except Exception as e:
            # Keep the probe non-fatal, but do not lose the cause.
            logger.debug(f"HEAD probe failed for {url}: {e}")
            reason = "Failed to download PDF from direct link"

        return DownloadResult(skip_reason=reason)

    def _download_pdf(self, url: str) -> Optional[bytes]:
        """Log the attempt, then delegate the download to the base class."""
        logger.info(f"Downloading PDF directly from: {url}")
        return super()._download_pdf(url)