Coverage for src/local_deep_research/web/services/pdf_service.py: 86%
77 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
1"""
2PDF generation service using WeasyPrint.
4Based on deep research findings, WeasyPrint is the optimal choice for
5production Flask applications due to:
6- Pure Python (no external binaries except Pango)
7- Modern CSS3 support
8- Active maintenance (v66.0 as of July 2025)
9- Good paged media features
10"""
12import io
13import platform
14from html import escape
15from typing import Optional, Dict, Any
16import markdown # type: ignore[import-untyped]
17from loguru import logger
19try:
20 from weasyprint import HTML, CSS
21 from weasyprint.urls import URLFetcher
23 WEASYPRINT_AVAILABLE = True
24except (OSError, ImportError) as _weasyprint_err:
25 HTML = None # type: ignore[assignment,misc]
26 CSS = None # type: ignore[assignment,misc]
27 URLFetcher = None # type: ignore[assignment,misc]
28 WEASYPRINT_AVAILABLE = False
29 logger.warning("WeasyPrint not available — PDF export will be disabled")
31from ...security import validate_url
34_WEASYPRINT_DOCS_URL = (
35 "https://doc.courtbouillon.org/weasyprint/stable/first_steps.html"
36)
class UnsafePDFResourceURLError(ValueError):
    """Signal that a resource URL was rejected during PDF rendering.

    Deliberately a ``ValueError`` subclass so WeasyPrint skips the
    offending resource instead of aborting the whole render.
    """
# Shared module-level fetcher with redirects turned off, matching the
# allow_redirects=False posture that default_url_fetcher hard-coded.
# validate_url only inspects the initial URL, so following a 30x could
# land on a cloud metadata endpoint (see
# ssrf_validator.ALWAYS_BLOCKED_METADATA_IPS) and slip past the SSRF guard.
# When WeasyPrint could not be imported there is nothing to build.
if WEASYPRINT_AVAILABLE:
    _URL_FETCHER = URLFetcher(allow_redirects=False)
else:
    _URL_FETCHER = None
def _safe_url_fetcher(url):
    """Fetch ``url`` for WeasyPrint unless it is an SSRF target.

    Guards against GHSA-fj2m-qvh9-jq4q: the URL is checked with
    ``validate_url`` before being handed to the shared module-level fetcher.

    Args:
        url: Resource URL WeasyPrint wants to load.

    Returns:
        Whatever ``_URL_FETCHER.fetch(url)`` returns.

    Raises:
        UnsafePDFResourceURLError: If ``validate_url`` rejects the URL.
    """
    if validate_url(url):
        return _URL_FETCHER.fetch(url)

    # Rejected: record it, then raise a ValueError subclass for WeasyPrint.
    blocked_message = f"Blocked unsafe URL in PDF rendering: {url}"
    logger.warning(blocked_message)
    raise UnsafePDFResourceURLError(blocked_message)
class MissingPDFDependencyError(RuntimeError):
    """Signal that WeasyPrint's system libraries are not available.

    Kept as its own type (rather than a plain ``RuntimeError``) so the web
    layer can show this message to users without also exposing unrelated
    ``RuntimeError`` text, e.g. pandoc subprocess stderr from ODT export.
    """
def get_weasyprint_install_instructions() -> str:
    """Build install guidance for WeasyPrint's system dependencies.

    The text is selected by ``platform.system()``: macOS, Linux and Windows
    each get a message that links to their own section of the WeasyPrint
    docs; any other platform gets the generic message and the bare docs URL.

    Returns:
        A multi-line, human-readable instruction string.
    """
    generic_intro = (
        "PDF export requires WeasyPrint system libraries (Pango, Cairo, GLib).\n"
    )
    # platform name -> (text preceding the "See:" line, docs anchor)
    per_platform = {
        "Darwin": (
            generic_intro + "Install with: brew install weasyprint\n",
            "#macos",
        ),
        "Linux": (generic_intro, "#linux"),
        "Windows": ("PDF export requires Pango system libraries.\n", "#windows"),
    }
    intro, anchor = per_platform.get(platform.system(), (generic_intro, ""))
    return f"{intro}See: {_WEASYPRINT_DOCS_URL}{anchor}"
class PDFService:
    """Service for converting markdown to PDF using WeasyPrint."""

    def __init__(self):
        """Initialize PDF service with minimal CSS for readability."""
        # CJK families are listed as fallbacks so WeasyPrint substitutes a
        # glyph-bearing font when the primary stack lacks coverage. Without
        # this, Chinese/Japanese/Korean text disappears silently from the
        # PDF even though it renders fine in the HTML view (issue #4055).
        # Glyphs still require the corresponding system font (e.g.
        # fonts-noto-cjk) to actually be installed.
        self.minimal_css = CSS(
            string="""
            @page {
                size: A4;
                margin: 1.5cm;
            }

            body {
                font-family: Arial, "Noto Sans CJK SC", "Noto Sans CJK TC",
                    "Noto Sans CJK JP", "Noto Sans CJK KR", "Noto Sans SC",
                    "PingFang SC", "PingFang TC", "Hiragino Sans",
                    "Hiragino Kaku Gothic ProN", "Apple SD Gothic Neo",
                    "Microsoft YaHei", "Microsoft JhengHei",
                    "Yu Gothic", "Malgun Gothic", "SimSun", sans-serif;
                font-size: 10pt;
                line-height: 1.4;
            }

            table {
                border-collapse: collapse;
                width: 100%;
                margin: 0.5em 0;
            }

            th, td {
                border: 1px solid #ccc;
                padding: 6px;
                text-align: left;
            }

            th {
                background-color: #f0f0f0;
            }

            h1 { font-size: 16pt; margin: 0.5em 0; }
            h2 { font-size: 14pt; margin: 0.5em 0; }
            h3 { font-size: 12pt; margin: 0.5em 0; }
            h4 { font-size: 11pt; margin: 0.5em 0; font-weight: bold; }
            h5 { font-size: 10pt; margin: 0.5em 0; font-weight: bold; }
            h6 { font-size: 10pt; margin: 0.5em 0; }

            code, pre {
                font-family: monospace, "Noto Sans Mono CJK SC",
                    "Noto Sans Mono CJK TC", "Noto Sans Mono CJK JP",
                    "Noto Sans Mono CJK KR", "Noto Sans CJK SC",
                    "PingFang SC", "Hiragino Sans", "Apple SD Gothic Neo",
                    "Microsoft YaHei", "SimSun";
                background-color: #f5f5f5;
            }

            code {
                padding: 1px 3px;
            }

            pre {
                padding: 8px;
                overflow-x: auto;
            }

            a {
                color: #0066cc;
                text-decoration: none;
            }
            """
        )

    def markdown_to_pdf(
        self,
        markdown_content: str,
        title: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
        custom_css: Optional[str] = None,
    ) -> bytes:
        """
        Convert markdown content to PDF.

        Args:
            markdown_content: The markdown text to convert
            title: Optional title for the document
            metadata: Optional metadata dict (author, date, etc.)
            custom_css: Optional CSS string; when given it replaces the
                default stylesheet rather than being layered on top of it

        Returns:
            PDF file as bytes

        Raises:
            Exception: Any error raised while converting or rendering is
                logged and re-raised unchanged.

        Note:
            WeasyPrint memory usage can spike with large documents.
            Production deployments should implement:
            - Memory limits (ulimit)
            - Timeouts (30-60 seconds)
            - Worker recycling after 100 requests
        """
        try:
            # Convert markdown to HTML
            html_content = self._markdown_to_html(
                markdown_content, title, metadata
            )

            # url_fetcher blocks SSRF targets reachable via body/citation URLs.
            html_doc = HTML(string=html_content, url_fetcher=_safe_url_fetcher)

            # Apply CSS (custom or minimal default). A caller-supplied
            # stylesheet gets the same guarded fetcher as the document, so
            # resources it references (e.g. @import) are validated too
            # instead of going through WeasyPrint's default fetcher.
            if custom_css:
                stylesheet = CSS(
                    string=custom_css, url_fetcher=_safe_url_fetcher
                )
            else:
                stylesheet = self.minimal_css

            # Render into an in-memory buffer; the context manager releases
            # the buffer even when rendering raises.
            with io.BytesIO() as pdf_buffer:
                html_doc.write_pdf(pdf_buffer, stylesheets=[stylesheet])
                pdf_bytes = pdf_buffer.getvalue()

            logger.info(f"Generated PDF, size: {len(pdf_bytes)} bytes")
            return pdf_bytes

        except Exception:
            logger.exception("Error generating PDF")
            raise

    def _markdown_to_html(
        self,
        markdown_content: str,
        title: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> str:
        """
        Convert markdown to a complete HTML document.

        Uses Python-Markdown with these extensions enabled:
        - tables
        - fenced_code
        - footnotes
        - toc
        - nl2br (newlines become <br>)
        - sane_lists
        - meta

        Args:
            markdown_content: The markdown text to convert
            title: Optional text for the <title> element (HTML-escaped)
            metadata: Optional mapping emitted as <meta name=... content=...>
                tags; keys and values are stringified and HTML-escaped

        Returns:
            The full HTML document as a single string, ending with the
            LDR attribution footer.
        """
        # Parse markdown with extensions
        md = markdown.Markdown(
            extensions=[
                "tables",
                "fenced_code",
                "footnotes",
                "toc",
                "nl2br",  # Convert newlines to <br>
                "sane_lists",
                "meta",
            ]
        )

        html_body = md.convert(markdown_content)

        # Build complete HTML document
        html_parts = ["<!DOCTYPE html><html><head>"]
        html_parts.append('<meta charset="utf-8">')

        if title:
            html_parts.append(f"<title>{escape(title)}</title>")

        if metadata:
            for key, value in metadata.items():
                html_parts.append(
                    f'<meta name="{escape(str(key))}" content="{escape(str(value))}">'
                )

        html_parts.append("</head><body>")

        # Add the markdown content directly without any extra title or metadata
        html_parts.append(html_body)

        # Add footer with LDR attribution
        html_parts.append("""
            <div style="margin-top: 2em; padding-top: 1em; border-top: 1px solid #ddd; font-size: 9pt; color: #666; text-align: center;">
                Generated by <a href="https://github.com/LearningCircuit/local-deep-research" style="color: #0066cc;">LDR - Local Deep Research</a> | Open Source AI Research Assistant
            </div>
        """)

        html_parts.append("</body></html>")

        return "".join(html_parts)
# Lazily created, process-wide PDFService instance (None until first use).
_pdf_service = None


def get_pdf_service() -> PDFService:
    """Return the shared PDFService, creating it on first call.

    Raises:
        MissingPDFDependencyError: If WeasyPrint system libraries are not
            available; the message carries platform-specific installation
            instructions.
    """
    global _pdf_service
    if WEASYPRINT_AVAILABLE:
        if _pdf_service is None:
            _pdf_service = PDFService()
        return _pdf_service
    raise MissingPDFDependencyError(get_weasyprint_install_instructions())