Coverage for src / local_deep_research / web / services / pdf_extraction_service.py: 100%
44 statements
« prev ^ index » next coverage.py v7.12.0, created at 2026-01-11 00:51 +0000
« prev ^ index » next coverage.py v7.12.0, created at 2026-01-11 00:51 +0000
"""
PDF text extraction service.

Provides efficient PDF text extraction with single-pass processing.
Complements pdf_service.py which handles PDF generation.
"""
import io
from typing import Any, Dict, List, Optional

import pdfplumber
from loguru import logger
class PDFExtractionService:
    """Service for extracting text and metadata from PDF files."""

    @staticmethod
    def _build_result(
        filename: str,
        size: int,
        text: str = "",
        pages: int = 0,
        error: Optional[str] = None,
    ) -> Dict[str, Any]:
        """Assemble the result dict shared by every extraction outcome.

        'success' is derived from ``error``: it is True only when ``error``
        is None.
        """
        return {
            "text": text,
            "pages": pages,
            "size": size,
            "filename": filename,
            "success": error is None,
            "error": error,
        }

    @staticmethod
    def extract_text_and_metadata(
        pdf_content: bytes, filename: str
    ) -> Dict[str, Any]:
        """
        Extract text and metadata from PDF in a single pass.

        The PDF is opened once; the page count and the text of every page
        are collected during that single open. Pages that yield no text are
        skipped, the remaining page texts are joined with newlines, and the
        combined text is stripped of leading/trailing whitespace.

        Args:
            pdf_content: Raw PDF file bytes
            filename: Original filename (used in log messages and echoed
                back in the result)

        Returns:
            Dictionary with keys:
                - 'text': Extracted text content ("" on failure)
                - 'pages': Number of pages (0 if the PDF could not be read)
                - 'size': File size in bytes (0 if it cannot be measured)
                - 'filename': Original filename
                - 'success': Boolean indicating success
                - 'error': Error message if failed (None if successful)

        Raises:
            No exceptions - errors are captured in return dict
        """
        # Measure the payload up front so the error path below cannot raise
        # on input that has no length (e.g. None).
        try:
            size = len(pdf_content)
        except TypeError:
            size = 0

        try:
            with pdfplumber.open(io.BytesIO(pdf_content)) as pdf:
                # Get pages list once
                pages = list(pdf.pages)
                page_count = len(pages)

                # Extract text from all pages in a single pass, dropping
                # pages for which nothing was extracted.
                page_texts = (page.extract_text() for page in pages)
                full_text = "\n".join(
                    page_text for page_text in page_texts if page_text
                ).strip()
        except Exception:
            # Log full exception details server-side for debugging
            logger.exception(f"Error extracting text from {filename}")
            # Return generic error message to avoid exposing internal details
            return PDFExtractionService._build_result(
                filename,
                size,
                error="Failed to extract text from PDF",
            )

        # Check if any text was extracted
        if not full_text:
            logger.warning(f"No extractable text found in {filename}")
            return PDFExtractionService._build_result(
                filename,
                size,
                pages=page_count,
                error="No extractable text found",
            )

        # The logged length is that of the stripped text that is returned.
        logger.info(
            f"Successfully extracted text from {filename} "
            f"({len(full_text)} chars, {page_count} pages)"
        )
        return PDFExtractionService._build_result(
            filename,
            size,
            text=full_text,
            pages=page_count,
        )

    @staticmethod
    def extract_batch(files_data: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Extract text from multiple PDF files.

        Each file is processed independently with
        ``extract_text_and_metadata``; a failure in one file is recorded and
        does not stop the remaining files from being processed.

        Args:
            files_data: Iterable of dicts with 'content' (bytes) and
                'filename' (str)

        Returns:
            Dictionary with:
                - 'results': List of extraction results, in input order
                - 'total_files': Total number of files processed
                - 'successful': Number of successfully processed files
                - 'failed': Number of failed files
                - 'errors': List of "<filename>: <error>" messages, one per
                  failed file

        Raises:
            KeyError: If an entry lacks the 'content' or 'filename' key.
        """
        results = [
            PDFExtractionService.extract_text_and_metadata(
                file_data["content"], file_data["filename"]
            )
            for file_data in files_data
        ]

        # Each result echoes the filename it was given, so the error
        # messages can be built from the results alone.
        errors = [
            f"{result['filename']}: {result['error']}"
            for result in results
            if not result["success"]
        ]
        failed = len(errors)

        return {
            "results": results,
            # Counted from the results so any iterable input is accepted.
            "total_files": len(results),
            "successful": len(results) - failed,
            "failed": failed,
            "errors": errors,
        }
# Singleton pattern for service: holds the shared instance once created.
_pdf_extraction_service = None


def get_pdf_extraction_service() -> PDFExtractionService:
    """Get the singleton PDF extraction service instance.

    The instance is created on the first call and stored in the module-level
    ``_pdf_extraction_service`` variable; later calls return that same object.
    """
    global _pdf_extraction_service
    if _pdf_extraction_service is None:
        _pdf_extraction_service = PDFExtractionService()
    return _pdf_extraction_service