Coverage for src / local_deep_research / web / services / pdf_extraction_service.py: 97%

49 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

"""
PDF text extraction service.

Provides efficient PDF text extraction with single-pass processing.
Complements pdf_service.py which handles PDF generation.
"""

7 

8import io 

9from typing import Any, Dict, List 

10 

11import pdfplumber 

12from loguru import logger 

13 

14from local_deep_research.security.filename_sanitizer import sanitize_filename 

15 

16 

class PDFExtractionService:
    """Service for extracting text and metadata from PDF files."""

    @staticmethod
    def extract_text_and_metadata(
        pdf_content: bytes, filename: str
    ) -> Dict[str, Any]:
        """
        Extract text and metadata from PDF in a single pass.

        The PDF is opened exactly once; the page count and the text of
        every page are gathered from that single open.

        Args:
            pdf_content: Raw PDF file bytes
            filename: Original filename (used for logging and echoed back,
                after sanitization, in the result)

        Returns:
            Dictionary with keys:
                - 'text': Extracted text with surrounding whitespace
                  stripped ("" on failure)
                - 'pages': Number of pages (0 if opening/extraction raised)
                - 'size': len(pdf_content), or 0 if it has no length
                - 'filename': Sanitized filename ("unnamed.pdf" if
                  sanitization itself fails)
                - 'success': Boolean indicating success
                - 'error': Error message if failed (None if successful)

        Raises:
            No exceptions - errors are captured in return dict
        """
        # Defense in depth: re-sanitize even though callers should
        # have sanitized already — guards against new callers that forget
        try:
            filename = sanitize_filename(filename)
        except Exception:
            filename = "unnamed.pdf"

        # Measure the payload once, outside the extraction try-block.
        # Calling len() inside the error handler could itself raise for
        # content without a length (e.g. None) and break the documented
        # "no exceptions" contract.
        try:
            size = len(pdf_content)
        except TypeError:
            size = 0

        try:
            with pdfplumber.open(io.BytesIO(pdf_content)) as pdf:
                # Get pages list once
                pages = list(pdf.pages)
                page_count = len(pages)

                # Extract text from all pages in a single pass, skipping
                # pages that yield no text (None or "").
                page_texts = (page.extract_text() for page in pages)
                # Strip once so the logged character count matches the
                # text that is actually returned.
                full_text = "\n".join(
                    text for text in page_texts if text
                ).strip()

            if not full_text:
                logger.warning(f"No extractable text found in {filename}")
                return {
                    "text": "",
                    "pages": page_count,
                    "size": size,
                    "filename": filename,
                    "success": False,
                    "error": "No extractable text found",
                }

            logger.info(
                f"Successfully extracted text from {filename} "
                f"({len(full_text)} chars, {page_count} pages)"
            )

            return {
                "text": full_text,
                "pages": page_count,
                "size": size,
                "filename": filename,
                "success": True,
                "error": None,
            }

        except Exception:
            # Log full exception details server-side for debugging
            logger.exception(f"Error extracting text from {filename}")
            # Return generic error message to avoid exposing internal details
            return {
                "text": "",
                "pages": 0,
                "size": size,
                "filename": filename,
                "success": False,
                "error": "Failed to extract text from PDF",
            }

    @staticmethod
    def extract_batch(files_data: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Extract text from multiple PDF files.

        Each entry is processed independently with
        ``extract_text_and_metadata``; a failed file is recorded and does
        not stop the remaining files from being processed.

        Args:
            files_data: List of dicts with 'content' (bytes) and 'filename' (str)

        Returns:
            Dictionary with:
                - 'results': List of extraction results (input order)
                - 'total_files': Total number of files processed
                - 'successful': Number of successfully processed files
                - 'failed': Number of failed files
                - 'errors': List of "<filename>: <error>" messages, one
                  per failed file

        Raises:
            KeyError: If an entry lacks the 'content' or 'filename' key.
        """
        results: list[Dict[str, Any]] = []
        errors: list[str] = []

        for file_data in files_data:
            result = PDFExtractionService.extract_text_and_metadata(
                file_data["content"], file_data["filename"]
            )
            results.append(result)

            if not result["success"]:
                # Report the sanitized name from the result rather than the
                # raw caller-supplied one, so error messages never carry an
                # unsanitized filename and always match results[i]['filename'].
                errors.append(f"{result['filename']}: {result['error']}")

        # Exactly one error message is recorded per failed file.
        failed = len(errors)

        return {
            "results": results,
            "total_files": len(files_data),
            "successful": len(results) - failed,
            "failed": failed,
            "errors": errors,
        }

150 

151 

# Module-level cache holding the one shared service instance
# (created lazily on first request).
_pdf_extraction_service = None


def get_pdf_extraction_service() -> PDFExtractionService:
    """Return the shared PDFExtractionService, creating it on first use."""
    global _pdf_extraction_service
    service = _pdf_extraction_service
    if service is None:
        # First call: build the instance and remember it for later calls.
        service = PDFExtractionService()
        _pdf_extraction_service = service
    return service