Coverage for src / local_deep_research / web / services / pdf_service.py: 100%
46 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
1"""
2PDF generation service using WeasyPrint.
4Based on deep research findings, WeasyPrint is the optimal choice for
5production Flask applications due to:
6- Pure Python (no external binaries except Pango)
7- Modern CSS3 support
8- Active maintenance (v66.0 as of July 2025)
9- Good paged media features
10"""
12import io
13from html import escape
14from typing import Optional, Dict, Any
15import markdown # type: ignore[import-untyped]
16from weasyprint import HTML, CSS
17from loguru import logger
class PDFService:
    """Service for converting markdown to PDF using WeasyPrint.

    A default stylesheet is built once in ``__init__`` and reused for every
    conversion that does not supply its own CSS.
    """

    # Python-Markdown extensions enabled for every conversion.
    _MARKDOWN_EXTENSIONS = (
        "tables",
        "fenced_code",
        "footnotes",
        "toc",
        "nl2br",  # Convert newlines to <br>
        "sane_lists",
        # NOTE(review): "meta" strips leading "Key: value" lines from the
        # document and stores them on the Markdown instance, which is never
        # read here -- confirm that dropping such lines is intended.
        "meta",
    )

    # Attribution footer appended after the rendered markdown body.
    _FOOTER_HTML = """
            <div style="margin-top: 2em; padding-top: 1em; border-top: 1px solid #ddd; font-size: 9pt; color: #666; text-align: center;">
                Generated by <a href="https://github.com/LearningCircuit/local-deep-research" style="color: #0066cc;">LDR - Local Deep Research</a> | Open Source AI Research Assistant
            </div>
        """

    def __init__(self) -> None:
        """Initialize PDF service with minimal CSS for readability."""
        self.minimal_css = CSS(
            string="""
            @page {
                size: A4;
                margin: 1.5cm;
            }

            body {
                font-family: Arial, sans-serif;
                font-size: 10pt;
                line-height: 1.4;
            }

            table {
                border-collapse: collapse;
                width: 100%;
                margin: 0.5em 0;
            }

            th, td {
                border: 1px solid #ccc;
                padding: 6px;
                text-align: left;
            }

            th {
                background-color: #f0f0f0;
            }

            h1 { font-size: 16pt; margin: 0.5em 0; }
            h2 { font-size: 14pt; margin: 0.5em 0; }
            h3 { font-size: 12pt; margin: 0.5em 0; }
            h4 { font-size: 11pt; margin: 0.5em 0; font-weight: bold; }
            h5 { font-size: 10pt; margin: 0.5em 0; font-weight: bold; }
            h6 { font-size: 10pt; margin: 0.5em 0; }

            code {
                font-family: monospace;
                background-color: #f5f5f5;
                padding: 1px 3px;
            }

            pre {
                background-color: #f5f5f5;
                padding: 8px;
                overflow-x: auto;
            }

            a {
                color: #0066cc;
                text-decoration: none;
            }
        """
        )

    def markdown_to_pdf(
        self,
        markdown_content: str,
        title: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
        custom_css: Optional[str] = None,
    ) -> bytes:
        """
        Convert markdown content to PDF.

        Args:
            markdown_content: The markdown text to convert
            title: Optional title for the document
            metadata: Optional metadata dict (author, date, etc.)
            custom_css: Optional CSS string. When non-empty it is used
                *instead of* the built-in minimal stylesheet (the two are
                not combined).

        Returns:
            PDF file as bytes

        Raises:
            Exception: Any error raised during conversion is logged and
                re-raised unchanged.

        Note:
            WeasyPrint memory usage can spike with large documents.
            Production deployments should implement:
            - Memory limits (ulimit)
            - Timeouts (30-60 seconds)
            - Worker recycling after 100 requests
        """
        try:
            # Convert markdown to a complete HTML document
            html_content = self._markdown_to_html(
                markdown_content, title, metadata
            )

            # Create HTML document with WeasyPrint
            html_doc = HTML(string=html_content)

            # Apply CSS (custom or minimal default)
            if custom_css:
                stylesheet = CSS(string=custom_css)
            else:
                stylesheet = self.minimal_css

            # Render into an in-memory buffer; the context manager makes
            # sure the buffer is released even if rendering fails.
            with io.BytesIO() as pdf_buffer:
                html_doc.write_pdf(pdf_buffer, stylesheets=[stylesheet])
                pdf_bytes = pdf_buffer.getvalue()

            logger.info(f"Generated PDF, size: {len(pdf_bytes)} bytes")
            return pdf_bytes

        except Exception:
            logger.exception("Error generating PDF")
            raise

    def _markdown_to_html(
        self,
        markdown_content: str,
        title: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> str:
        """
        Convert markdown to HTML with proper structure.

        Uses Python-Markdown with extensions for:
        - Tables
        - Fenced code blocks
        - Footnotes
        - Table of contents
        - Newline-to-<br> conversion
        - Sane lists
        - Meta-data

        Args:
            markdown_content: The markdown text to convert
            title: Optional text for the document's <title> element
            metadata: Optional mapping rendered as <meta> tags in <head>

        Returns:
            A complete HTML document as a string
        """
        # NOTE(review): the converted body is inserted as-is and then
        # rendered by WeasyPrint -- confirm markdown_content is trusted or
        # sanitized upstream, since raw HTML in it is presumably preserved.
        md = markdown.Markdown(extensions=list(self._MARKDOWN_EXTENSIONS))
        html_body = md.convert(markdown_content)

        return self._build_html_document(html_body, title, metadata)

    @staticmethod
    def _build_html_document(
        html_body: str,
        title: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> str:
        """Wrap an HTML body fragment in a full document with head and footer.

        ``title`` and every metadata key/value are HTML-escaped; ``html_body``
        is inserted unchanged.
        """
        head_parts = ['<meta charset="utf-8">']

        if title:
            head_parts.append(f"<title>{escape(title)}</title>")

        if metadata:
            head_parts.extend(
                f'<meta name="{escape(str(key))}" content="{escape(str(value))}">'
                for key, value in metadata.items()
            )

        return "".join(
            [
                "<!DOCTYPE html><html><head>",
                *head_parts,
                "</head><body>",
                # The markdown content goes in directly, without any extra
                # visible title or metadata
                html_body,
                # Footer with LDR attribution
                PDFService._FOOTER_HTML,
                "</body></html>",
            ]
        )
# Module-wide cache for the shared PDFService; stays None until first use.
_pdf_service = None


def get_pdf_service() -> PDFService:
    """Return the shared PDFService instance, building it on the first call."""
    global _pdf_service
    if _pdf_service is not None:
        return _pdf_service
    # First request: construct the service once and remember it.
    _pdf_service = PDFService()
    return _pdf_service