Coverage for src / local_deep_research / web / services / pdf_service.py: 100%

46 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1""" 

2PDF generation service using WeasyPrint. 

3 

4Based on deep research findings, WeasyPrint is the optimal choice for 

5production Flask applications due to: 

6- Pure Python (no external binaries except Pango) 

7- Modern CSS3 support 

8- Active maintenance (v66.0 as of July 2025) 

9- Good paged media features 

10""" 

11 

12import io 

13from html import escape 

14from typing import Optional, Dict, Any 

15import markdown # type: ignore[import-untyped] 

16from weasyprint import HTML, CSS 

17from loguru import logger 

18 

19 

class PDFService:
    """Service for converting markdown to PDF using WeasyPrint.

    The instance builds a default stylesheet once in ``__init__`` and keeps
    it on ``self.minimal_css``; it is applied to every conversion that does
    not pass its own ``custom_css``.
    """

    def __init__(self):
        """Initialize PDF service with minimal CSS for readability."""
        # Parsed once here so the default stylesheet is not re-parsed on
        # every markdown_to_pdf() call.
        self.minimal_css = CSS(
            string="""
            @page {
                size: A4;
                margin: 1.5cm;
            }

            body {
                font-family: Arial, sans-serif;
                font-size: 10pt;
                line-height: 1.4;
            }

            table {
                border-collapse: collapse;
                width: 100%;
                margin: 0.5em 0;
            }

            th, td {
                border: 1px solid #ccc;
                padding: 6px;
                text-align: left;
            }

            th {
                background-color: #f0f0f0;
            }

            h1 { font-size: 16pt; margin: 0.5em 0; }
            h2 { font-size: 14pt; margin: 0.5em 0; }
            h3 { font-size: 12pt; margin: 0.5em 0; }
            h4 { font-size: 11pt; margin: 0.5em 0; font-weight: bold; }
            h5 { font-size: 10pt; margin: 0.5em 0; font-weight: bold; }
            h6 { font-size: 10pt; margin: 0.5em 0; }

            code {
                font-family: monospace;
                background-color: #f5f5f5;
                padding: 1px 3px;
            }

            pre {
                background-color: #f5f5f5;
                padding: 8px;
                overflow-x: auto;
            }

            a {
                color: #0066cc;
                text-decoration: none;
            }
            """
        )

    def markdown_to_pdf(
        self,
        markdown_content: str,
        title: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
        custom_css: Optional[str] = None,
    ) -> bytes:
        """
        Convert markdown content to PDF.

        Args:
            markdown_content: The markdown text to convert
            title: Optional title for the document (emitted as ``<title>``)
            metadata: Optional metadata dict (author, date, etc.); each item
                is emitted as a ``<meta name=... content=...>`` tag
            custom_css: Optional CSS string. When given (and non-empty) it
                is used *instead of* the minimal default stylesheet, not in
                addition to it.

        Returns:
            PDF file as bytes

        Raises:
            Exception: Any error raised during HTML or PDF generation is
                logged and re-raised unchanged.

        Note:
            WeasyPrint memory usage can spike with large documents.
            Production deployments should implement:
            - Memory limits (ulimit)
            - Timeouts (30-60 seconds)
            - Worker recycling after 100 requests
        """
        try:
            # Convert markdown to HTML
            html_content = self._markdown_to_html(
                markdown_content, title, metadata
            )

            # Create HTML document with WeasyPrint
            html_doc = HTML(string=html_content)

            # Apply CSS (custom or minimal default)
            css_list = []
            if custom_css:
                css_list.append(CSS(string=custom_css))
            else:
                css_list.append(self.minimal_css)

            # Render into an in-memory buffer instead of a file. The
            # try/finally guarantees the buffer is released even when
            # rendering fails part-way through.
            pdf_buffer = io.BytesIO()
            try:
                html_doc.write_pdf(pdf_buffer, stylesheets=css_list)
                pdf_bytes = pdf_buffer.getvalue()
            finally:
                pdf_buffer.close()

            logger.info(f"Generated PDF, size: {len(pdf_bytes)} bytes")
            return pdf_bytes

        except Exception:
            # Boundary handler: record the traceback, then let the caller
            # decide how to respond.
            logger.exception("Error generating PDF")
            raise

    def _markdown_to_html(
        self,
        markdown_content: str,
        title: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> str:
        """
        Convert markdown to HTML with proper structure.

        Uses Python-Markdown with extensions for:
        - Tables
        - Fenced code blocks
        - Footnotes
        - Table of contents
        - Newline-to-<br> conversion
        - Sane lists
        - Meta-data

        Args:
            markdown_content: The markdown text to render into the body
            title: Optional document title; HTML-escaped before insertion
            metadata: Optional mapping rendered as ``<meta>`` tags; keys and
                values are passed through ``str()`` and HTML-escaped

        Returns:
            A complete HTML document (doctype, head, body, attribution
            footer) as a single string
        """
        # Parse markdown with extensions
        md = markdown.Markdown(
            extensions=[
                "tables",
                "fenced_code",
                "footnotes",
                "toc",
                "nl2br",  # Convert newlines to <br>
                "sane_lists",
                "meta",
            ]
        )

        # NOTE(review): the rendered body is inserted as-is. Python-Markdown
        # appears to pass raw HTML in the input through by default, so
        # untrusted markdown could reference external resources that
        # WeasyPrint would then try to fetch - confirm whether callers
        # sanitise the content or restrict the URL fetcher.
        html_body = md.convert(markdown_content)

        # Build complete HTML document
        html_parts = ["<!DOCTYPE html><html><head>"]
        html_parts.append('<meta charset="utf-8">')

        if title:
            html_parts.append(f"<title>{escape(title)}</title>")

        if metadata:
            for key, value in metadata.items():
                # escape() also escapes quotes by default, which keeps the
                # values safe inside the double-quoted attributes.
                html_parts.append(
                    f'<meta name="{escape(str(key))}" content="{escape(str(value))}">'
                )

        html_parts.append("</head><body>")

        # Add the markdown content directly without any extra title or metadata
        html_parts.append(html_body)

        # Add footer with LDR attribution
        html_parts.append("""
        <div style="margin-top: 2em; padding-top: 1em; border-top: 1px solid #ddd; font-size: 9pt; color: #666; text-align: center;">
            Generated by <a href="https://github.com/LearningCircuit/local-deep-research" style="color: #0066cc;">LDR - Local Deep Research</a> | Open Source AI Research Assistant
        </div>
        """)

        html_parts.append("</body></html>")

        return "".join(html_parts)

196 

197 

# Module-wide PDFService instance; stays None until first requested.
_pdf_service = None


def get_pdf_service() -> PDFService:
    """Return the shared PDFService, constructing it on first use.

    The instance is stored in the module-level ``_pdf_service`` variable so
    that later calls return the same object.
    """
    global _pdf_service
    if _pdf_service is not None:
        return _pdf_service
    _pdf_service = PDFService()
    return _pdf_service