Coverage for src/local_deep_research/web/services/pdf_extraction_service.py

1"""

2PDF text extraction service.

4Provides efficient PDF text extraction with single-pass processing.

5Complements pdf_service.py which handles PDF generation.

6"""

import io
from typing import Any, Dict, List

import pdfplumber
from loguru import logger

class PDFExtractionService:
    """Service for extracting text and metadata from PDF files."""

    @staticmethod
    def _build_result(
        text: str, pages: int, size: int, filename: str, error=None
    ) -> Dict[str, Any]:
        """Assemble the result dict shared by every extraction outcome.

        'success' is derived from ``error``: it is True only when ``error``
        is None.
        """
        return {
            "text": text,
            "pages": pages,
            "size": size,
            "filename": filename,
            "success": error is None,
            "error": error,
        }

    @staticmethod
    def extract_text_and_metadata(
        pdf_content: bytes, filename: str
    ) -> Dict[str, Any]:
        """
        Extract text and metadata from a PDF in a single pass.

        The PDF is opened once (from an in-memory buffer) and both the page
        count and the text of every page are read from that same handle.

        Args:
            pdf_content: Raw PDF file bytes
            filename: Original filename (used in log messages and echoed
                back in the result)

        Returns:
            Dictionary with keys:
                - 'text': Extracted text, stripped of leading/trailing
                  whitespace ("" on failure)
                - 'pages': Number of pages (0 if the PDF could not be read)
                - 'size': File size in bytes
                - 'filename': Original filename
                - 'success': Boolean indicating success
                - 'error': Error message if failed (None if successful)

        Errors raised while opening or reading the PDF are caught, logged,
        and reported through the returned dict instead of being propagated.
        """
        try:
            with pdfplumber.open(io.BytesIO(pdf_content)) as pdf:
                pages = list(pdf.pages)
                page_count = len(pages)

                # Pages that yield no text (None or "") are skipped so they
                # do not contribute empty lines to the joined output.
                extracted = (page.extract_text() for page in pages)
                # Strip once, so the emptiness check, the logged character
                # count and the returned text all refer to the same string.
                full_text = "\n".join(
                    page_text for page_text in extracted if page_text
                ).strip()

                if not full_text:
                    logger.warning(f"No extractable text found in {filename}")
                    return PDFExtractionService._build_result(
                        "",
                        page_count,
                        len(pdf_content),
                        filename,
                        error="No extractable text found",
                    )

                logger.info(
                    f"Successfully extracted text from {filename} "
                    f"({len(full_text)} chars, {page_count} pages)"
                )

                return PDFExtractionService._build_result(
                    full_text, page_count, len(pdf_content), filename
                )

        except Exception:
            # Log full exception details server-side for debugging
            logger.exception(f"Error extracting text from {filename}")
            # Return generic error message to avoid exposing internal details
            return PDFExtractionService._build_result(
                "",
                0,
                len(pdf_content),
                filename,
                error="Failed to extract text from PDF",
            )

    @staticmethod
    def extract_batch(files_data: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Extract text from multiple PDF files.

        Each entry is processed independently with
        ``extract_text_and_metadata``; a failure in one file is recorded
        and does not stop the remaining files from being processed.

        Args:
            files_data: List of dicts with 'content' (bytes) and 'filename' (str)

        Returns:
            Dictionary with:
                - 'results': List of extraction results, in input order
                - 'total_files': Total number of files processed
                - 'successful': Number of successfully processed files
                - 'failed': Number of failed files
                - 'errors': List of "<filename>: <error>" messages, one per
                  failed file
        """
        results = []
        successful = 0
        failed = 0
        errors = []

        for file_data in files_data:
            result = PDFExtractionService.extract_text_and_metadata(
                file_data["content"], file_data["filename"]
            )
            results.append(result)

            if result["success"]:
                successful += 1
            else:
                failed += 1
                errors.append(f"{file_data['filename']}: {result['error']}")

        return {
            "results": results,
            "total_files": len(files_data),
            "successful": successful,
            "failed": failed,
            "errors": errors,
        }

141

142

# Module-wide service instance; stays None until the accessor below is
# called for the first time.
_pdf_extraction_service = None


def get_pdf_extraction_service() -> PDFExtractionService:
    """Return the shared PDFExtractionService, creating it on first use."""
    global _pdf_extraction_service
    if _pdf_extraction_service is not None:
        return _pdf_extraction_service
    _pdf_extraction_service = PDFExtractionService()
    return _pdf_extraction_service

Coverage for src / local_deep_research / web / services / pdf_extraction_service.py: 100%

44 statements