Coverage for src/local_deep_research/web/services/pdf_service.py

1"""

2PDF generation service using WeasyPrint.

4Based on deep research findings, WeasyPrint is the optimal choice for

5production Flask applications due to:

6- Pure Python (no external binaries except Pango)

7- Modern CSS3 support

8- Active maintenance (v66.0 as of July 2025)

9- Good paged media features

10"""

12import io

13from typing import Optional, Dict, Any

14import markdown

15from weasyprint import HTML, CSS

16from loguru import logger

class PDFService:
    """Service for converting markdown to PDF using WeasyPrint."""

    def __init__(self):
        """Initialize PDF service with minimal CSS for readability."""
        # Parsed once here and reused for every document that does not
        # supply its own stylesheet.
        self.minimal_css = CSS(
            string="""
            @page {
                size: A4;
                margin: 1.5cm;
            }

            body {
                font-family: Arial, sans-serif;
                font-size: 10pt;
                line-height: 1.4;
            }

            table {
                border-collapse: collapse;
                width: 100%;
                margin: 0.5em 0;
            }

            th, td {
                border: 1px solid #ccc;
                padding: 6px;
                text-align: left;
            }

            th {
                background-color: #f0f0f0;
            }

            h1 { font-size: 16pt; margin: 0.5em 0; }
            h2 { font-size: 14pt; margin: 0.5em 0; }
            h3 { font-size: 12pt; margin: 0.5em 0; }
            h4 { font-size: 11pt; margin: 0.5em 0; font-weight: bold; }
            h5 { font-size: 10pt; margin: 0.5em 0; font-weight: bold; }
            h6 { font-size: 10pt; margin: 0.5em 0; }

            code {
                font-family: monospace;
                background-color: #f5f5f5;
                padding: 1px 3px;
            }

            pre {
                background-color: #f5f5f5;
                padding: 8px;
                overflow-x: auto;
            }

            a {
                color: #0066cc;
                text-decoration: none;
            }
            """
        )

    def markdown_to_pdf(
        self,
        markdown_content: str,
        title: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
        custom_css: Optional[str] = None,
    ) -> bytes:
        """
        Convert markdown content to PDF.

        Args:
            markdown_content: The markdown text to convert
            title: Optional title for the document
            metadata: Optional metadata dict (author, date, etc.)
            custom_css: Optional CSS string. When non-empty it is used
                instead of (not in addition to) the minimal default CSS.

        Returns:
            PDF file as bytes

        Raises:
            Exception: Any error raised during conversion is logged and
                re-raised unchanged.

        Note:
            WeasyPrint memory usage can spike with large documents.
            Production deployments should implement:
            - Memory limits (ulimit)
            - Timeouts (30-60 seconds)
            - Worker recycling after 100 requests
        """
        try:
            # Convert markdown to HTML
            html_content = self._markdown_to_html(
                markdown_content, title, metadata
            )

            # Create HTML document with WeasyPrint
            html_doc = HTML(string=html_content)

            # Apply CSS (custom or minimal default)
            if custom_css:
                css_list = [CSS(string=custom_css)]
            else:
                css_list = [self.minimal_css]

            # Render into an in-memory buffer; the context manager closes
            # the buffer even if WeasyPrint raises while rendering.
            with io.BytesIO() as pdf_buffer:
                html_doc.write_pdf(pdf_buffer, stylesheets=css_list)
                pdf_bytes = pdf_buffer.getvalue()

            logger.info(f"Generated PDF, size: {len(pdf_bytes)} bytes")
            return pdf_bytes

        except Exception as e:
            # logger.exception records the traceback; callers still see
            # the original exception.
            logger.exception(f"Error generating PDF: {e}")
            raise

    @staticmethod
    def _render_head_tags(
        title: Optional[str],
        metadata: Optional[Dict[str, Any]],
    ) -> str:
        """
        Build the tags placed inside <head>, HTML-escaping all caller text.

        Args:
            title: Optional document title; emitted as <title> when truthy
            metadata: Optional mapping; each item becomes a
                <meta name="..." content="..."> tag

        Returns:
            The concatenated head tags, starting with the charset meta tag
        """
        # Function-scope import so this block does not depend on a new
        # module-level import.
        from html import escape

        tags = ['<meta charset="utf-8">']

        if title:
            # Element text only needs &, < and > escaped.
            tags.append(f"<title>{escape(str(title), quote=False)}</title>")

        if metadata:
            for key, value in metadata.items():
                # Attribute values also need quotes escaped, otherwise a
                # '"' in the value would terminate the attribute early.
                tags.append(
                    f'<meta name="{escape(str(key))}" '
                    f'content="{escape(str(value))}">'
                )

        return "".join(tags)

    def _markdown_to_html(
        self,
        markdown_content: str,
        title: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> str:
        """
        Convert markdown to HTML with proper structure.

        Uses Python-Markdown with extensions for:
        - Tables
        - Fenced code blocks
        - Table of contents
        - Footnotes

        Args:
            markdown_content: The markdown text to render into <body>
            title: Optional title, written (escaped) into <title>
            metadata: Optional mapping written (escaped) as <meta> tags

        Returns:
            A complete HTML document string, including an attribution
            footer appended after the rendered markdown
        """
        # Parse markdown with extensions
        # NOTE(review): Python-Markdown's "meta" extension appears to treat
        # leading "Key: value" lines as document metadata and drop them
        # from the rendered body - confirm this is intended.
        md = markdown.Markdown(
            extensions=[
                "tables",
                "fenced_code",
                "footnotes",
                "toc",
                "nl2br",  # Convert newlines to <br>
                "sane_lists",
                "meta",
            ]
        )

        html_body = md.convert(markdown_content)

        # Build complete HTML document
        html_parts = [
            "<!DOCTYPE html><html><head>",
            self._render_head_tags(title, metadata),
            "</head><body>",
            # Add the markdown content directly without any extra title
            # or metadata
            html_body,
        ]

        # Add footer with LDR attribution
        html_parts.append("""
            <div style="margin-top: 2em; padding-top: 1em; border-top: 1px solid #ddd; font-size: 9pt; color: #666; text-align: center;">
                Generated by <a href="https://github.com/LearningCircuit/local-deep-research" style="color: #0066cc;">LDR - Local Deep Research</a> | Open Source AI Research Assistant
            </div>
        """)

        html_parts.append("</body></html>")

        return "".join(html_parts)

193

194

# Module-level slot for the shared PDFService; stays None until first use.
_pdf_service = None


def get_pdf_service() -> PDFService:
    """Return the shared PDFService, creating it on the first call."""
    global _pdf_service
    # Fast path: an instance was already built by an earlier call.
    if _pdf_service is not None:
        return _pdf_service
    _pdf_service = PDFService()
    return _pdf_service

Coverage for src / local_deep_research / web / services / pdf_service.py: 100%

45 statements