Coverage for src / local_deep_research / web / services / pdf_service.py: 100%
45 statements
« prev ^ index » next coverage.py v7.12.0, created at 2026-01-11 07:37 +0000
« prev ^ index » next coverage.py v7.12.0, created at 2026-01-11 07:37 +0000
1"""
2PDF generation service using WeasyPrint.
4Based on deep research findings, WeasyPrint is the optimal choice for
5production Flask applications due to:
6- Pure Python (no external binaries except Pango)
7- Modern CSS3 support
8- Active maintenance (v66.0 as of July 2025)
9- Good paged media features
10"""
12import io
13from typing import Optional, Dict, Any
14import markdown
15from weasyprint import HTML, CSS
16from loguru import logger
class PDFService:
    """Service for converting markdown to PDF using WeasyPrint.

    One default stylesheet (``minimal_css``) is parsed at construction time
    and applied to every conversion that does not supply its own CSS.
    """

    def __init__(self):
        """Initialize PDF service with minimal CSS for readability."""
        # Parsed once here so each conversion reuses the same CSS object.
        self.minimal_css = CSS(
            string="""
            @page {
                size: A4;
                margin: 1.5cm;
            }

            body {
                font-family: Arial, sans-serif;
                font-size: 10pt;
                line-height: 1.4;
            }

            table {
                border-collapse: collapse;
                width: 100%;
                margin: 0.5em 0;
            }

            th, td {
                border: 1px solid #ccc;
                padding: 6px;
                text-align: left;
            }

            th {
                background-color: #f0f0f0;
            }

            h1 { font-size: 16pt; margin: 0.5em 0; }
            h2 { font-size: 14pt; margin: 0.5em 0; }
            h3 { font-size: 12pt; margin: 0.5em 0; }
            h4 { font-size: 11pt; margin: 0.5em 0; font-weight: bold; }
            h5 { font-size: 10pt; margin: 0.5em 0; font-weight: bold; }
            h6 { font-size: 10pt; margin: 0.5em 0; }

            code {
                font-family: monospace;
                background-color: #f5f5f5;
                padding: 1px 3px;
            }

            pre {
                background-color: #f5f5f5;
                padding: 8px;
                overflow-x: auto;
            }

            a {
                color: #0066cc;
                text-decoration: none;
            }
            """
        )

    def markdown_to_pdf(
        self,
        markdown_content: str,
        title: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
        custom_css: Optional[str] = None,
    ) -> bytes:
        """
        Convert markdown content to PDF.

        Args:
            markdown_content: The markdown text to convert
            title: Optional title for the document (HTML-escaped before use)
            metadata: Optional metadata dict (author, date, etc.); each entry
                becomes an HTML-escaped ``<meta>`` tag
            custom_css: Optional CSS string. When given it is used *instead
                of* the built-in minimal stylesheet, not merged with it.

        Returns:
            PDF file as bytes

        Raises:
            Exception: Any error raised during conversion is logged and
                re-raised unchanged.

        Note:
            WeasyPrint memory usage can spike with large documents.
            Production deployments should implement:
            - Memory limits (ulimit)
            - Timeouts (30-60 seconds)
            - Worker recycling after 100 requests
        """
        try:
            # Convert markdown to a complete HTML document
            html_content = self._markdown_to_html(
                markdown_content, title, metadata
            )

            # Create HTML document with WeasyPrint
            html_doc = HTML(string=html_content)

            # Exactly one stylesheet is applied: the caller's or the default.
            stylesheet = (
                CSS(string=custom_css) if custom_css else self.minimal_css
            )

            # Render into an in-memory buffer; the context manager closes
            # the buffer even if rendering fails part-way through.
            with io.BytesIO() as pdf_buffer:
                html_doc.write_pdf(pdf_buffer, stylesheets=[stylesheet])
                pdf_bytes = pdf_buffer.getvalue()

            logger.info(f"Generated PDF, size: {len(pdf_bytes)} bytes")
            return pdf_bytes

        except Exception as e:
            logger.exception(f"Error generating PDF: {str(e)}")
            raise

    def _markdown_to_html(
        self,
        markdown_content: str,
        title: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> str:
        """
        Convert markdown to HTML with proper structure.

        Uses Python-Markdown with these extensions enabled:
        - tables
        - fenced_code
        - footnotes
        - toc
        - nl2br
        - sane_lists
        - meta

        Args:
            markdown_content: The markdown text to render
            title: Optional document title
            metadata: Optional mapping rendered as ``<meta>`` tags

        Returns:
            A complete HTML document as a string
        """
        # NOTE(review): Python-Markdown appears to pass raw HTML in the input
        # through unchanged; if markdown_content can come from an untrusted
        # source, confirm whether it needs sanitizing before rendering.
        md = markdown.Markdown(
            extensions=[
                "tables",
                "fenced_code",
                "footnotes",
                "toc",
                "nl2br",  # Convert newlines to <br>
                "sane_lists",
                "meta",
            ]
        )

        html_body = md.convert(markdown_content)

        return self._build_html_document(html_body, title, metadata)

    @staticmethod
    def _build_html_document(
        html_body: str,
        title: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> str:
        """
        Wrap an already-rendered HTML body in a full HTML document.

        Args:
            html_body: HTML fragment placed verbatim inside ``<body>``
            title: Optional text for the ``<title>`` element
            metadata: Optional mapping; each item becomes
                ``<meta name=KEY content=VALUE>``

        Returns:
            The assembled document, including the LDR attribution footer
        """
        # Local import keeps this block self-contained.
        from html import escape

        html_parts = ["<!DOCTYPE html><html><head>"]
        html_parts.append('<meta charset="utf-8">')

        # Title and metadata are caller-supplied text, not markup: escape
        # them so characters such as <, & and " cannot break out of the
        # element or attribute they are placed in.
        if title:
            html_parts.append(f"<title>{escape(str(title))}</title>")

        if metadata:
            html_parts.extend(
                f'<meta name="{escape(str(key))}" '
                f'content="{escape(str(value))}">'
                for key, value in metadata.items()
            )

        html_parts.append("</head><body>")

        # Add the markdown content directly without any extra title or metadata
        html_parts.append(html_body)

        # Add footer with LDR attribution
        html_parts.append("""
            <div style="margin-top: 2em; padding-top: 1em; border-top: 1px solid #ddd; font-size: 9pt; color: #666; text-align: center;">
                Generated by <a href="https://github.com/LearningCircuit/local-deep-research" style="color: #0066cc;">LDR - Local Deep Research</a> | Open Source AI Research Assistant
            </div>
        """)

        html_parts.append("</body></html>")

        return "".join(html_parts)
# Module-level cache holding the shared PDFService once it has been created.
_pdf_service: Optional["PDFService"] = None


def get_pdf_service() -> PDFService:
    """Return the shared PDFService instance, creating it on first call."""
    global _pdf_service
    # Fast path: an instance already exists, so hand it back.
    if _pdf_service is not None:
        return _pdf_service
    _pdf_service = PDFService()
    return _pdf_service