Coverage for src / local_deep_research / web / services / pdf_extraction_service.py: 97%
49 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
1"""
2PDF text extraction service.
4Provides efficient PDF text extraction with single-pass processing.
5Complements pdf_service.py which handles PDF generation.
6"""
8import io
9from typing import Any, Dict, List
11import pdfplumber
12from loguru import logger
14from local_deep_research.security.filename_sanitizer import sanitize_filename
class PDFExtractionService:
    """Service for extracting text and metadata from PDF files."""

    @staticmethod
    def _build_result(
        text: str,
        pages: int,
        size: int,
        filename: str,
        error: Any = None,
    ) -> Dict[str, Any]:
        """Assemble the result dict shared by every extraction outcome.

        'success' is derived from 'error': a result is successful exactly
        when no error message was supplied.
        """
        return {
            "text": text,
            "pages": pages,
            "size": size,
            "filename": filename,
            "success": error is None,
            "error": error,
        }

    @staticmethod
    def extract_text_and_metadata(
        pdf_content: bytes, filename: str
    ) -> Dict[str, Any]:
        """
        Extract text and metadata from PDF in a single pass.

        The PDF is opened only once; the page count and the text of every
        page are read from that single handle.

        Args:
            pdf_content: Raw PDF file bytes
            filename: Original filename (sanitized here, then used for
                logging and echoed back in the result)

        Returns:
            Dictionary with keys:
                - 'text': Extracted text, stripped ("" on any failure)
                - 'pages': Number of pages (0 if extraction raised)
                - 'size': File size in bytes (0 if the content has no length)
                - 'filename': Sanitized filename ("unnamed.pdf" if
                  sanitization itself failed)
                - 'success': Boolean indicating success
                - 'error': Error message if failed (None if successful)

        Raises:
            No exceptions - errors are captured in return dict
        """
        # Defense in depth: re-sanitize even though callers should
        # have sanitized already — guards against new callers that forget
        try:
            filename = sanitize_filename(filename)
        except Exception:
            filename = "unnamed.pdf"

        # Measure the payload once, up front and defensively. The error
        # path below reports the size too, and it must not raise when a
        # caller hands in something without a length (e.g. None) —
        # otherwise the "no exceptions" contract would be broken from
        # inside the exception handler.
        try:
            size = len(pdf_content)
        except TypeError:
            size = 0

        try:
            with pdfplumber.open(io.BytesIO(pdf_content)) as pdf:
                # Get pages list once
                pages = list(pdf.pages)
                page_count = len(pages)

                # Extract text from all pages in single pass, skipping
                # pages that yield no text
                page_texts = (page.extract_text() for page in pages)
                full_text = "\n".join(t for t in page_texts if t).strip()
        except Exception:
            # Log full exception details server-side for debugging
            logger.exception(f"Error extracting text from {filename}")
            # Return generic error message to avoid exposing internal details
            return PDFExtractionService._build_result(
                "", 0, size, filename, "Failed to extract text from PDF"
            )

        # A PDF that opens fine but contains no text is reported as a
        # failure, with the real page count preserved
        if not full_text:
            logger.warning(f"No extractable text found in {filename}")
            return PDFExtractionService._build_result(
                "", page_count, size, filename, "No extractable text found"
            )

        # The logged character count is that of the text actually returned
        logger.info(
            f"Successfully extracted text from {filename} "
            f"({len(full_text)} chars, {page_count} pages)"
        )
        return PDFExtractionService._build_result(
            full_text, page_count, size, filename
        )

    @staticmethod
    def extract_batch(files_data: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Extract text from multiple PDF files.

        Each file is processed independently with
        extract_text_and_metadata, so one unreadable PDF does not stop the
        others from being extracted.

        Args:
            files_data: List of dicts with 'content' (bytes) and 'filename' (str)

        Returns:
            Dictionary with:
                - 'results': List of extraction results, in input order
                - 'total_files': Total number of files processed
                - 'successful': Number of successfully processed files
                - 'failed': Number of failed files
                - 'errors': List of "<filename>: <error>" messages, one per
                  failed file

        Raises:
            KeyError: If an entry lacks the 'content' or 'filename' key
        """
        results: list[Dict[str, Any]] = [
            PDFExtractionService.extract_text_and_metadata(
                file_data["content"], file_data["filename"]
            )
            for file_data in files_data
        ]

        # Build messages from the filename stored in each result rather
        # than the raw input: the result carries the sanitized name, so the
        # error list stays consistent with 'results' and never echoes an
        # unsanitized filename back to the caller.
        errors: list[str] = [
            f"{result['filename']}: {result['error']}"
            for result in results
            if not result["success"]
        ]

        return {
            "results": results,
            "total_files": len(results),
            "successful": len(results) - len(errors),
            "failed": len(errors),
            "errors": errors,
        }
# Module-level slot for the shared service instance; stays None until the
# accessor below is called for the first time.
_pdf_extraction_service = None


def get_pdf_extraction_service() -> PDFExtractionService:
    """Return the shared PDFExtractionService, creating it on first use."""
    global _pdf_extraction_service
    if _pdf_extraction_service is not None:
        return _pdf_extraction_service

    # First call: build the instance and remember it for later callers.
    _pdf_extraction_service = PDFExtractionService()
    return _pdf_extraction_service