Coverage for src/local_deep_research/web/services/pdf_service.py: 86%

77 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-03 23:15 +0000

1""" 

2PDF generation service using WeasyPrint. 

3 

4Based on deep research findings, WeasyPrint is the optimal choice for 

5production Flask applications due to: 

6- Pure Python (no external binaries except Pango) 

7- Modern CSS3 support 

8- Active maintenance (v66.0 as of July 2025) 

9- Good paged media features 

10""" 

11 

12import io 

13import platform 

14from html import escape 

15from typing import Optional, Dict, Any 

16import markdown # type: ignore[import-untyped] 

17from loguru import logger 

18 

# Guarded import: WeasyPrint is optional at import time. Either ImportError or
# OSError (presumably raised when its native libraries cannot be loaded —
# TODO confirm) disables PDF export instead of breaking module import.
try:
    from weasyprint import HTML, CSS
    from weasyprint.urls import URLFetcher

    WEASYPRINT_AVAILABLE = True
except (OSError, ImportError) as _weasyprint_err:
    # Placeholders keep the module-level names defined so later references
    # do not raise NameError; callers must check WEASYPRINT_AVAILABLE first.
    HTML = None  # type: ignore[assignment,misc]
    CSS = None  # type: ignore[assignment,misc]
    URLFetcher = None  # type: ignore[assignment,misc]
    WEASYPRINT_AVAILABLE = False
    logger.warning("WeasyPrint not available — PDF export will be disabled")
    # Record the underlying cause so operators can tell which dependency is
    # missing; kept at debug level so the warning above stays unchanged.
    logger.debug("WeasyPrint import failed: {!r}", _weasyprint_err)

30 

31from ...security import validate_url 

32 

33 

34_WEASYPRINT_DOCS_URL = ( 

35 "https://doc.courtbouillon.org/weasyprint/stable/first_steps.html" 

36) 

37 

38 

class UnsafePDFResourceURLError(ValueError):
    """Signal that a resource URL was rejected during PDF rendering.

    Deriving from ValueError is deliberate: it lets WeasyPrint skip the
    offending resource instead of aborting the whole render.
    """

41 

42 

# Shared fetcher used for every resource WeasyPrint loads while rendering.
# Redirects stay disabled (the same posture default_url_fetcher hard-coded):
# validate_url only inspects the URL it is handed, so following a 30x could
# land on a cloud metadata endpoint (see
# ssrf_validator.ALWAYS_BLOCKED_METADATA_IPS) without ever being checked.
# Remains None when WeasyPrint could not be imported.
_URL_FETCHER = None
if WEASYPRINT_AVAILABLE:
    _URL_FETCHER = URLFetcher(allow_redirects=False)

51 

52 

def _safe_url_fetcher(url):
    """Fetch ``url`` for WeasyPrint only if it passes the SSRF check.

    Mitigation for GHSA-fj2m-qvh9-jq4q.

    Args:
        url: Resource URL WeasyPrint wants to load while rendering.

    Returns:
        Whatever the module-level ``_URL_FETCHER.fetch`` returns for ``url``.

    Raises:
        UnsafePDFResourceURLError: If ``validate_url`` rejects ``url``.
    """
    if validate_url(url):
        return _URL_FETCHER.fetch(url)

    # Same text goes to the log and to the exception.
    message = f"Blocked unsafe URL in PDF rendering: {url}"
    logger.warning(message)
    raise UnsafePDFResourceURLError(message)

61 

62 

class MissingPDFDependencyError(RuntimeError):
    """WeasyPrint's system libraries are not available.

    This is a dedicated RuntimeError subclass so the web layer can show
    its message to users while keeping unrelated RuntimeErrors hidden
    (e.g., pandoc subprocess stderr from ODT export).
    """

70 

71 

def get_weasyprint_install_instructions() -> str:
    """Build install guidance for WeasyPrint's system dependencies.

    The text is chosen from ``platform.system()``: macOS, Linux and Windows
    each get a message linking to their section of the WeasyPrint docs; any
    other value gets a generic message linking to the top of the page.

    Returns:
        A multi-line, human-readable instruction string.
    """
    generic_requirement = (
        "PDF export requires WeasyPrint system libraries (Pango, Cairo, GLib).\n"
    )
    per_platform = {
        "Darwin": (
            generic_requirement
            + "Install with: brew install weasyprint\n"
            + f"See: {_WEASYPRINT_DOCS_URL}#macos"
        ),
        "Linux": generic_requirement + f"See: {_WEASYPRINT_DOCS_URL}#linux",
        "Windows": (
            "PDF export requires Pango system libraries.\n"
            f"See: {_WEASYPRINT_DOCS_URL}#windows"
        ),
    }
    fallback = generic_requirement + f"See: {_WEASYPRINT_DOCS_URL}"
    return per_platform.get(platform.system(), fallback)

95 

96 

class PDFService:
    """Service for converting markdown to PDF using WeasyPrint."""

    def __init__(self):
        """Build the default stylesheet used when no custom CSS is supplied."""
        # CJK families are listed as fallbacks so WeasyPrint substitutes a
        # glyph-bearing font when the primary stack lacks coverage. Without
        # this, Chinese/Japanese/Korean text disappears silently from the
        # PDF even though it renders fine in the HTML view (issue #4055).
        # Glyphs still require the corresponding system font (e.g.
        # fonts-noto-cjk) to actually be installed.
        self.minimal_css = CSS(
            string="""
            @page {
                size: A4;
                margin: 1.5cm;
            }

            body {
                font-family: Arial, "Noto Sans CJK SC", "Noto Sans CJK TC",
                    "Noto Sans CJK JP", "Noto Sans CJK KR", "Noto Sans SC",
                    "PingFang SC", "PingFang TC", "Hiragino Sans",
                    "Hiragino Kaku Gothic ProN", "Apple SD Gothic Neo",
                    "Microsoft YaHei", "Microsoft JhengHei",
                    "Yu Gothic", "Malgun Gothic", "SimSun", sans-serif;
                font-size: 10pt;
                line-height: 1.4;
            }

            table {
                border-collapse: collapse;
                width: 100%;
                margin: 0.5em 0;
            }

            th, td {
                border: 1px solid #ccc;
                padding: 6px;
                text-align: left;
            }

            th {
                background-color: #f0f0f0;
            }

            h1 { font-size: 16pt; margin: 0.5em 0; }
            h2 { font-size: 14pt; margin: 0.5em 0; }
            h3 { font-size: 12pt; margin: 0.5em 0; }
            h4 { font-size: 11pt; margin: 0.5em 0; font-weight: bold; }
            h5 { font-size: 10pt; margin: 0.5em 0; font-weight: bold; }
            h6 { font-size: 10pt; margin: 0.5em 0; }

            code, pre {
                font-family: monospace, "Noto Sans Mono CJK SC",
                    "Noto Sans Mono CJK TC", "Noto Sans Mono CJK JP",
                    "Noto Sans Mono CJK KR", "Noto Sans CJK SC",
                    "PingFang SC", "Hiragino Sans", "Apple SD Gothic Neo",
                    "Microsoft YaHei", "SimSun";
                background-color: #f5f5f5;
            }

            code {
                padding: 1px 3px;
            }

            pre {
                padding: 8px;
                overflow-x: auto;
            }

            a {
                color: #0066cc;
                text-decoration: none;
            }
            """
        )

    def markdown_to_pdf(
        self,
        markdown_content: str,
        title: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
        custom_css: Optional[str] = None,
    ) -> bytes:
        """
        Convert markdown content to PDF.

        Args:
            markdown_content: The markdown text to convert
            title: Optional title for the document
            metadata: Optional metadata dict (author, date, etc.)
            custom_css: Optional CSS string. When given it is used INSTEAD
                of the built-in minimal stylesheet (the two are not merged).

        Returns:
            PDF file as bytes

        Raises:
            Exception: Any error from conversion or rendering is logged
                and re-raised unchanged.

        Note:
            WeasyPrint memory usage can spike with large documents.
            Production deployments should implement:
            - Memory limits (ulimit)
            - Timeouts (30-60 seconds)
            - Worker recycling after 100 requests
        """
        try:
            html_content = self._markdown_to_html(
                markdown_content, title, metadata
            )

            # url_fetcher blocks SSRF targets reachable via body/citation URLs.
            html_doc = HTML(string=html_content, url_fetcher=_safe_url_fetcher)

            # Exactly one stylesheet is applied: the caller's or the default.
            stylesheet = (
                CSS(string=custom_css) if custom_css else self.minimal_css
            )

            # Render into memory; the context manager closes the buffer even
            # when write_pdf raises.
            with io.BytesIO() as pdf_buffer:
                html_doc.write_pdf(pdf_buffer, stylesheets=[stylesheet])
                pdf_bytes = pdf_buffer.getvalue()

            logger.info(f"Generated PDF, size: {len(pdf_bytes)} bytes")
            return pdf_bytes

        except Exception:
            logger.exception("Error generating PDF")
            raise

    def _markdown_to_html(
        self,
        markdown_content: str,
        title: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> str:
        """
        Convert markdown to a complete HTML document.

        Uses Python-Markdown with these extensions enabled:
        - tables
        - fenced_code
        - footnotes
        - toc
        - nl2br (newlines become <br>)
        - sane_lists
        - meta

        NOTE(review): the ``meta`` extension treats leading ``Key: value``
        lines as document metadata and removes them from the body — confirm
        that is intended for reports that may start that way.

        Args:
            markdown_content: The markdown text to convert
            title: Optional text for the <title> element (HTML-escaped)
            metadata: Optional mapping emitted as <meta name=... content=...>
                tags; keys and values are stringified and HTML-escaped

        Returns:
            The HTML document as a single string. The converted body is
            inserted exactly as Python-Markdown produced it, followed by a
            fixed attribution footer.
        """
        md = markdown.Markdown(
            extensions=[
                "tables",
                "fenced_code",
                "footnotes",
                "toc",
                "nl2br",  # Convert newlines to <br>
                "sane_lists",
                "meta",
            ]
        )
        html_body = md.convert(markdown_content)

        # Assemble the document head.
        html_parts = ["<!DOCTYPE html><html><head>", '<meta charset="utf-8">']

        if title:
            html_parts.append(f"<title>{escape(title)}</title>")

        if metadata:
            html_parts.extend(
                f'<meta name="{escape(str(key))}" content="{escape(str(value))}">'
                for key, value in metadata.items()
            )

        html_parts.append("</head><body>")

        # Body is the converted markdown only; title/metadata are not
        # repeated as visible content.
        html_parts.append(html_body)

        # Add footer with LDR attribution
        html_parts.append("""
            <div style="margin-top: 2em; padding-top: 1em; border-top: 1px solid #ddd; font-size: 9pt; color: #666; text-align: center;">
                Generated by <a href="https://github.com/LearningCircuit/local-deep-research" style="color: #0066cc;">LDR - Local Deep Research</a> | Open Source AI Research Assistant
            </div>
        """)

        html_parts.append("</body></html>")

        return "".join(html_parts)

290 

291 

# Module-wide PDFService instance, created on first successful request.
_pdf_service = None


def get_pdf_service() -> PDFService:
    """Return the shared PDFService, creating it on first use.

    Raises:
        MissingPDFDependencyError: If WeasyPrint system libraries are not
            available, with platform-specific installation instructions.
    """
    global _pdf_service

    # Fail before touching the singleton when WeasyPrint is unavailable.
    if not WEASYPRINT_AVAILABLE:
        raise MissingPDFDependencyError(get_weasyprint_install_instructions())

    if _pdf_service is None:
        _pdf_service = PDFService()
    return _pdf_service

308 return _pdf_service