Coverage for src / local_deep_research / web / services / pdf_service.py: 100%

45 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2026-01-11 07:37 +0000

1""" 

2PDF generation service using WeasyPrint. 

3 

4Based on deep research findings, WeasyPrint is the optimal choice for 

5production Flask applications due to: 

6- Pure Python (no external binaries except Pango) 

7- Modern CSS3 support 

8- Active maintenance (v66.0 as of July 2025) 

9- Good paged media features 

10""" 

11 

import html
import io
from typing import Optional, Dict, Any

import markdown
from weasyprint import HTML, CSS
from loguru import logger

17 

18 

class PDFService:
    """Service for converting markdown to PDF using WeasyPrint."""

    def __init__(self):
        """Initialize PDF service with minimal CSS for readability."""
        # Built once here; markdown_to_pdf() reuses it whenever the caller
        # does not pass custom_css.
        self.minimal_css = CSS(
            string="""
            @page {
                size: A4;
                margin: 1.5cm;
            }

            body {
                font-family: Arial, sans-serif;
                font-size: 10pt;
                line-height: 1.4;
            }

            table {
                border-collapse: collapse;
                width: 100%;
                margin: 0.5em 0;
            }

            th, td {
                border: 1px solid #ccc;
                padding: 6px;
                text-align: left;
            }

            th {
                background-color: #f0f0f0;
            }

            h1 { font-size: 16pt; margin: 0.5em 0; }
            h2 { font-size: 14pt; margin: 0.5em 0; }
            h3 { font-size: 12pt; margin: 0.5em 0; }
            h4 { font-size: 11pt; margin: 0.5em 0; font-weight: bold; }
            h5 { font-size: 10pt; margin: 0.5em 0; font-weight: bold; }
            h6 { font-size: 10pt; margin: 0.5em 0; }

            code {
                font-family: monospace;
                background-color: #f5f5f5;
                padding: 1px 3px;
            }

            pre {
                background-color: #f5f5f5;
                padding: 8px;
                overflow-x: auto;
            }

            a {
                color: #0066cc;
                text-decoration: none;
            }
            """
        )

    def markdown_to_pdf(
        self,
        markdown_content: str,
        title: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
        custom_css: Optional[str] = None,
    ) -> bytes:
        """
        Convert markdown content to PDF.

        Args:
            markdown_content: The markdown text to convert
            title: Optional title for the document
            metadata: Optional metadata dict (author, date, etc.)
            custom_css: Optional CSS string; when given it is used instead
                of the default minimal stylesheet

        Returns:
            PDF file as bytes

        Raises:
            Exception: Any error raised during conversion is logged and
                re-raised unchanged.

        Note:
            WeasyPrint memory usage can spike with large documents.
            Production deployments should implement:
            - Memory limits (ulimit)
            - Timeouts (30-60 seconds)
            - Worker recycling after 100 requests
        """
        try:
            # Convert markdown to HTML
            html_content = self._markdown_to_html(
                markdown_content, title, metadata
            )

            # Create HTML document with WeasyPrint
            html_doc = HTML(string=html_content)

            # Apply CSS (custom or minimal default, never both)
            if custom_css:
                css_list = [CSS(string=custom_css)]
            else:
                css_list = [self.minimal_css]

            # Render into an in-memory buffer instead of a file. The context
            # manager closes the buffer even if write_pdf() raises.
            with io.BytesIO() as pdf_buffer:
                html_doc.write_pdf(pdf_buffer, stylesheets=css_list)
                pdf_bytes = pdf_buffer.getvalue()

            logger.info(f"Generated PDF, size: {len(pdf_bytes)} bytes")
            return pdf_bytes

        except Exception as e:
            logger.exception(f"Error generating PDF: {str(e)}")
            raise

    def _markdown_to_html(
        self,
        markdown_content: str,
        title: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> str:
        """
        Convert markdown to HTML with proper structure.

        Uses Python-Markdown with extensions for:
        - Tables
        - Fenced code blocks
        - Table of contents
        - Footnotes

        Args:
            markdown_content: The markdown text to convert
            title: Optional text for the <title> element
            metadata: Optional mapping emitted as <meta name=... content=...>

        Returns:
            A complete HTML document as a string
        """
        # Parse markdown with extensions
        md = markdown.Markdown(
            extensions=[
                "tables",
                "fenced_code",
                "footnotes",
                "toc",
                "nl2br",  # Convert newlines to <br>
                "sane_lists",
                "meta",
            ]
        )

        html_body = md.convert(markdown_content)

        return self._build_html_document(html_body, title, metadata)

    @staticmethod
    def _build_html_document(
        html_body: str,
        title: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> str:
        """
        Wrap an already-rendered HTML body in a complete HTML document.

        The title and the metadata keys/values are HTML-escaped before being
        placed in the <head>, so characters such as ``<``, ``&`` or ``"``
        cannot break the markup or inject elements. ``html_body`` is inserted
        as-is because it is expected to be HTML already.

        Args:
            html_body: HTML fragment placed inside <body>
            title: Optional text for the <title> element
            metadata: Optional mapping emitted as <meta name=... content=...>

        Returns:
            The full HTML document as a single string
        """
        html_parts = ["<!DOCTYPE html><html><head>"]
        html_parts.append('<meta charset="utf-8">')

        if title:
            html_parts.append(f"<title>{html.escape(str(title))}</title>")

        if metadata:
            for key, value in metadata.items():
                # quote=True also escapes quotes, which is required inside
                # double-quoted attribute values.
                safe_key = html.escape(str(key), quote=True)
                safe_value = html.escape(str(value), quote=True)
                html_parts.append(
                    f'<meta name="{safe_key}" content="{safe_value}">'
                )

        html_parts.append("</head><body>")

        # Add the markdown content directly without any extra title or metadata
        html_parts.append(html_body)

        # Add footer with LDR attribution
        html_parts.append("""
        <div style="margin-top: 2em; padding-top: 1em; border-top: 1px solid #ddd; font-size: 9pt; color: #666; text-align: center;">
            Generated by <a href="https://github.com/LearningCircuit/local-deep-research" style="color: #0066cc;">LDR - Local Deep Research</a> | Open Source AI Research Assistant
        </div>
        """)

        html_parts.append("</body></html>")

        return "".join(html_parts)

193 

194 

# Module-level cache for the shared PDFService; stays None until first use.
_pdf_service = None


def get_pdf_service() -> PDFService:
    """Return the shared PDFService, constructing it on the first call."""
    global _pdf_service
    if _pdf_service is not None:
        return _pdf_service
    # First request: create the instance and keep it for later callers.
    _pdf_service = PDFService()
    return _pdf_service

204 return _pdf_service