Coverage for src / local_deep_research / research_library / downloaders / direct_pdf.py: 98%

59 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1""" 

2Direct PDF Link Downloader 

3""" 

4 

5from typing import Dict, Optional 

6from urllib.parse import urlparse 

7from loguru import logger 

8 

9from .base import BaseDownloader, ContentType, DownloadResult 

10 

11 

class DirectPDFDownloader(BaseDownloader):
    """Downloader for direct PDF links.

    Handles URLs that point straight at a PDF resource (``*.pdf`` paths,
    ``/pdf/`` path segments, or ``type=pdf`` / ``format=pdf`` query flags)
    and delegates the actual HTTP fetch to ``BaseDownloader._download_pdf``.
    """

    def can_handle(self, url: str) -> bool:
        """Return True if *url* looks like a direct link to a PDF.

        The check is purely syntactic — no network request is made. The
        URL is parsed with :func:`urllib.parse.urlparse` and its path and
        query components are inspected for common PDF markers.

        Args:
            url: The URL to classify.

        Returns:
            True if this downloader should attempt the URL, else False.
        """
        try:
            parsed = urlparse(url.lower())
            path = parsed.path or ""
            query = parsed.query or ""

            # Path ends with a .pdf extension. urlparse already splits the
            # query string off, so "…/doc.pdf?dl=1" matches here too.
            if path.endswith(".pdf"):
                return True

            # Fallback raw-string check for ".pdf?" in spans urlparse does
            # not classify as path (e.g. inside a fragment). Coverage shows
            # this is almost never reached — kept only for exotic URLs.
            if ".pdf?" in url.lower():
                return True

            # URLs that route through a /pdf/ path segment.
            if "/pdf/" in path:
                return True

            # Explicit PDF format requested via query parameters.
            if "type=pdf" in query or "format=pdf" in query:
                return True

            return False

        except Exception as e:
            # urlparse rarely raises, but be defensive: an unparseable URL
            # is simply not handled by this downloader. Include the
            # exception so the failure cause is not lost in the log.
            logger.warning(f"Error parsing URL {url}: {e}")
            return False

    def download(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> Optional[bytes]:
        """Download the PDF at *url*.

        Args:
            url: Direct link to the PDF.
            content_type: ``ContentType.TEXT`` to fetch the PDF and return
                its extracted text UTF-8 encoded; any other value returns
                the raw PDF bytes.

        Returns:
            The requested bytes, or ``None`` when the download (or, for
            TEXT, the text extraction) fails.
        """
        if content_type == ContentType.TEXT:
            # Fetch the PDF, then extract plain text from it.
            pdf_content = self._download_pdf(url)
            if pdf_content:
                text = self.extract_text_from_pdf(pdf_content)
                if text:
                    return text.encode("utf-8")
            return None
        return self._download_pdf(url)

    def download_with_result(
        self, url: str, content_type: ContentType = ContentType.PDF
    ) -> DownloadResult:
        """Download content and return a detailed result with skip reason.

        Unlike :meth:`download`, failures are reported via a
        ``DownloadResult`` carrying a human-readable ``skip_reason`` and,
        when it could be determined with a follow-up HEAD request, the
        HTTP ``status_code``.

        Args:
            url: Direct link to the PDF.
            content_type: ``ContentType.TEXT`` for extracted text,
                otherwise the raw PDF bytes.

        Returns:
            A ``DownloadResult`` with either ``content``/``is_success``
            set, or a ``skip_reason`` explaining the failure.
        """
        if content_type == ContentType.TEXT:
            # Fetch the PDF and extract text from it.
            pdf_content = self._download_pdf(url)
            if pdf_content:
                text = self.extract_text_from_pdf(pdf_content)
                if text:
                    return DownloadResult(
                        content=text.encode("utf-8"), is_success=True
                    )
                return DownloadResult(
                    skip_reason="PDF downloaded but text extraction failed"
                )
            return DownloadResult(
                skip_reason="Could not download PDF from direct link"
            )

        # Try to download PDF directly.
        logger.info(f"Attempting direct PDF download from {url}")
        pdf_content = super()._download_pdf(url)

        if pdf_content:
            logger.info(f"Successfully downloaded PDF directly from {url}")
            return DownloadResult(content=pdf_content, is_success=True)

        # The download failed: probe with a cheap HEAD request to report a
        # more specific reason (broken link, auth wall, server error).
        try:
            response = self.session.head(url, timeout=5, allow_redirects=True)
            if response.status_code == 404:
                return DownloadResult(
                    skip_reason="PDF file not found (404) - link may be broken",
                    status_code=404,
                )
            if response.status_code == 403:
                return DownloadResult(
                    skip_reason="Access denied (403) - PDF requires authentication",
                    status_code=403,
                )
            if response.status_code >= 500:
                return DownloadResult(
                    skip_reason=f"Server error ({response.status_code}) - try again later",
                    status_code=response.status_code,
                )
            return DownloadResult(
                skip_reason=f"Could not download PDF - server returned status {response.status_code}",
                status_code=response.status_code,
            )
        except Exception:
            # Even the HEAD probe failed (network error, timeout, …):
            # fall back to a generic reason rather than raising.
            return DownloadResult(
                skip_reason="Failed to download PDF from direct link"
            )

    def _download_pdf(
        self, url: str, headers: Optional[Dict[str, str]] = None
    ) -> Optional[bytes]:
        """Download PDF directly from URL, logging the attempt.

        NOTE(review): ``headers`` is accepted but currently ignored —
        the call delegates to the base class with only ``url``. Confirm
        whether ``BaseDownloader._download_pdf`` can accept custom
        headers before forwarding them.

        Args:
            url: Direct link to the PDF.
            headers: Optional extra HTTP headers (currently unused).

        Returns:
            The PDF bytes, or ``None`` on failure.
        """
        logger.info(f"Downloading PDF directly from: {url}")
        return super()._download_pdf(url)