Coverage for src / local_deep_research / web / services / pdf_extraction_service.py: 100%

44 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

"""
PDF text extraction service.

Provides efficient PDF text extraction with single-pass processing.
Complements pdf_service.py which handles PDF generation.
"""

7 

import io
from typing import Any, Dict, List, Optional

import pdfplumber
from loguru import logger

13 

14 

class PDFExtractionService:
    """Service for extracting text and metadata from PDF files.

    Every method is static; the class only groups the related helpers.
    """

    @staticmethod
    def _build_result(
        filename: str,
        size: int,
        text: str = "",
        pages: int = 0,
        error: Optional[str] = None,
    ) -> Dict[str, Any]:
        """Assemble the result dict shared by every extraction outcome.

        ``success`` is derived from ``error``: it is True only when no
        error message was supplied.
        """
        return {
            "text": text,
            "pages": pages,
            "size": size,
            "filename": filename,
            "success": error is None,
            "error": error,
        }

    @staticmethod
    def extract_text_and_metadata(
        pdf_content: bytes, filename: str
    ) -> Dict[str, Any]:
        """
        Extract text and metadata from PDF in a single pass.

        The PDF is opened only once; both the text content and the page
        count are collected during that single open.

        Args:
            pdf_content: Raw PDF file bytes
            filename: Original filename (used for logging and echoed back)

        Returns:
            Dictionary with keys:
            - 'text': Extracted text, stripped ('' on failure)
            - 'pages': Number of pages (0 if the PDF could not be opened)
            - 'size': File size in bytes (0 if the input has no length)
            - 'filename': Original filename
            - 'success': Boolean indicating success
            - 'error': Error message if failed (None if successful)

        Raises:
            No exceptions - errors are captured in return dict
        """
        # Measure the payload before the main try block and do it
        # defensively: the error path below reports the size too, and it
        # must not raise when the caller passes None or an unsized object.
        try:
            size = len(pdf_content)
        except TypeError:
            size = 0

        try:
            with pdfplumber.open(io.BytesIO(pdf_content)) as pdf:
                # Materialise the page sequence once so that counting and
                # iterating share the same objects.
                pages = list(pdf.pages)
                page_count = len(pages)

                # Pages without extractable text yield a falsy value and
                # are skipped.
                extracted = (page.extract_text() for page in pages)
                text_parts = [part for part in extracted if part]

            # Strip once; the stripped text is what gets both logged and
            # returned, so the reported length matches the payload.
            full_text = "\n".join(text_parts).strip()

            if not full_text:
                logger.warning(f"No extractable text found in {filename}")
                return PDFExtractionService._build_result(
                    filename,
                    size,
                    pages=page_count,
                    error="No extractable text found",
                )

            logger.info(
                f"Successfully extracted text from {filename} "
                f"({len(full_text)} chars, {page_count} pages)"
            )
            return PDFExtractionService._build_result(
                filename, size, text=full_text, pages=page_count
            )

        except Exception:
            # Log full exception details server-side for debugging
            logger.exception(f"Error extracting text from {filename}")
            # Return generic error message to avoid exposing internal details
            return PDFExtractionService._build_result(
                filename, size, error="Failed to extract text from PDF"
            )

    @staticmethod
    def extract_batch(files_data: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Extract text from multiple PDF files.

        Args:
            files_data: Iterable of dicts with 'content' (bytes) and
                'filename' (str). A list is expected, but any iterable
                (including a generator) is accepted.

        Returns:
            Dictionary with:
            - 'results': List of extraction results, in input order
            - 'total_files': Total number of files processed
            - 'successful': Number of successfully processed files
            - 'failed': Number of failed files
            - 'errors': List of "<filename>: <error>" messages
        """
        results = [
            PDFExtractionService.extract_text_and_metadata(
                file_data["content"], file_data["filename"]
            )
            for file_data in files_data
        ]

        # Each result echoes the filename it was called with, so the error
        # summary can be built from the results alone.
        errors = [
            f"{result['filename']}: {result['error']}"
            for result in results
            if not result["success"]
        ]
        failed = len(errors)

        return {
            "results": results,
            # Count what was actually processed rather than calling len()
            # on the input, which would fail for generators.
            "total_files": len(results),
            "successful": len(results) - failed,
            "failed": failed,
            "errors": errors,
        }

141 

142 

# Module-wide service instance; stays None until the accessor below is
# first called.
_pdf_extraction_service = None


def get_pdf_extraction_service() -> PDFExtractionService:
    """Return the shared PDF extraction service, creating it on first use."""
    global _pdf_extraction_service
    if _pdf_extraction_service is not None:
        return _pdf_extraction_service
    _pdf_extraction_service = PDFExtractionService()
    return _pdf_extraction_service