Coverage for src / local_deep_research / exporters / odt_exporter.py: 91%

66 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-02-25 01:07 +0000

1"""ODT export service using pypandoc. 

2 

3This module provides the ODTExporter class for converting markdown content 

4to OpenDocument Text (ODT) format using pypandoc (Pandoc wrapper). 

5 

6Pandoc is the industry standard for document conversion and handles all 

7markdown features natively. Note: Requires Pandoc to be installed on the 

8system or use pypandoc_binary which bundles it. 

9""" 

10 

11import subprocess 

12from typing import Optional 

13 

14from loguru import logger 

15 

16# pypandoc is optional - used only to locate the bundled pandoc binary 

17try: 

18 import pypandoc 

19 

20 PYPANDOC_AVAILABLE = True 

21except ImportError: 

22 pypandoc = None 

23 PYPANDOC_AVAILABLE = False 

24 

25from .base import BaseExporter, ExportOptions, ExportResult 

26from .registry import ExporterRegistry 

27 

# Upper bound on the markdown input accepted for export (50 MB); larger
# inputs are rejected up front to guard against out-of-memory failures.
MAX_CONTENT_SIZE = 50 * 1024**2

30 

31 

@ExporterRegistry.register
class ODTExporter(BaseExporter):
    """Service for converting markdown to ODT using Pandoc.

    Markdown content is converted to OpenDocument Text, which can be
    opened in LibreOffice Writer, Microsoft Word, and other office
    applications. pypandoc is used only to locate the pandoc executable;
    the conversion itself is a direct ``subprocess`` call.

    The conversion is performed entirely in memory by piping markdown
    to pandoc's stdin and capturing ODT output from stdout.
    """

    @property
    def format_name(self) -> str:
        """Identifier of the output format produced by this exporter."""
        return "odt"

    @property
    def file_extension(self) -> str:
        """File extension (including the leading dot) for exported files."""
        return ".odt"

    @property
    def mimetype(self) -> str:
        """MIME type reported for the exported document."""
        return "application/vnd.oasis.opendocument.text"

    def export(
        self,
        markdown_content: str,
        options: Optional[ExportOptions] = None,
    ) -> ExportResult:
        """Convert markdown content to ODT using Pandoc.

        The conversion runs entirely in memory: markdown is piped to
        pandoc's stdin and ODT bytes are captured from stdout, avoiding
        any temporary files on disk.

        Args:
            markdown_content: The markdown text to convert.
            options: Optional export options; ``title`` and the
                ``author`` / ``date`` entries of ``metadata`` are used.

        Returns:
            ExportResult with ODT content as bytes, filename, and mimetype.

        Raises:
            ValueError: If the content is larger than ``MAX_CONTENT_SIZE``
                bytes when encoded as UTF-8.
            RuntimeError: If pypandoc is not installed, pandoc exits with
                a non-zero status, or pandoc produces no output.

        Any other exception is logged and re-raised unchanged.
        """
        try:
            if not PYPANDOC_AVAILABLE:
                raise RuntimeError(
                    "ODT export requires pypandoc. Install with: pip install pypandoc-binary"
                )

            # The limit is expressed in bytes, so measure the UTF-8 size
            # rather than the number of code points.
            if self._utf8_size_exceeds(markdown_content, MAX_CONTENT_SIZE):
                raise ValueError(
                    f"Content exceeds maximum size of "
                    f"{MAX_CONTENT_SIZE // (1024 * 1024)} MB"
                )

            options = options or ExportOptions()

            # _prepend_title_if_needed is not defined in this class
            # (presumably inherited from BaseExporter - confirm there).
            markdown_content = self._prepend_title_if_needed(
                markdown_content, options.title
            )

            # Add LDR attribution footer
            content_with_footer = self._add_footer(markdown_content)

            # "-o -" makes pandoc write the ODT archive to stdout.
            pandoc_path = pypandoc.get_pandoc_path()
            cmd = [pandoc_path, "-f", "markdown", "-t", "odt", "-o", "-"]
            cmd.extend(self._build_metadata_args(options))

            # List-form argv without a shell: every element reaches pandoc
            # verbatim as a single argument.
            result = subprocess.run(
                cmd,
                input=content_with_footer.encode("utf-8"),
                capture_output=True,
                check=True,
            )

            odt_bytes = result.stdout

            # A zero exit status with empty stdout is still a failure.
            if not odt_bytes:
                raise RuntimeError(
                    "Pandoc conversion failed - no output produced"
                )

            # _generate_safe_filename is likewise defined outside this class.
            filename = self._generate_safe_filename(options.title)

            logger.info(
                f"Generated ODT in memory, size: {len(odt_bytes)} bytes"
            )

            return ExportResult(
                content=odt_bytes,
                filename=filename,
                mimetype=self.mimetype,
            )

        except subprocess.CalledProcessError as e:
            # Surface pandoc's own diagnostics in the raised error.
            stderr = (
                e.stderr.decode("utf-8", errors="replace")
                if e.stderr
                else "unknown error"
            )
            logger.exception(f"Pandoc conversion failed: {stderr}")
            raise RuntimeError(f"Pandoc conversion failed: {stderr}") from e
        except Exception:
            logger.exception("Error generating ODT")
            raise

    @staticmethod
    def _utf8_size_exceeds(text: str, limit: int) -> bool:
        """Return True if *text* needs more than *limit* bytes in UTF-8.

        The text is only encoded when the code-point count alone cannot
        decide the answer.
        """
        char_count = len(text)
        if char_count > limit:
            # UTF-8 uses at least one byte per code point.
            return True
        if char_count * 4 <= limit:
            # UTF-8 uses at most four bytes per code point.
            return False
        return len(text.encode("utf-8")) > limit

    def _build_metadata_args(self, options: ExportOptions) -> list:
        """Build the pandoc ``--metadata`` arguments for *options*.

        Args:
            options: Export options whose ``title`` and ``metadata``
                (``author`` and ``date`` entries) are consulted.

        Returns:
            List of ``--metadata=KEY:VALUE`` strings in the order title,
            author, date; empty or missing values are skipped.
        """
        args = []
        if options.title:
            safe_title = self._sanitize_metadata(options.title)
            args.append(f"--metadata=title:{safe_title}")

        metadata = options.metadata
        if metadata:
            for key in ("author", "date"):
                value = metadata.get(key)
                if value:
                    safe_value = self._sanitize_metadata(value)
                    args.append(f"--metadata={key}:{safe_value}")
        return args

    def _add_footer(self, markdown_content: str) -> str:
        """Add LDR attribution footer to markdown content.

        Args:
            markdown_content: The original markdown text.

        Returns:
            Markdown with a horizontal rule and the attribution line
            appended.
        """
        footer = (
            "\n\n---\n\n"
            "*Generated by [LDR - Local Deep Research]"
            "(https://github.com/LearningCircuit/local-deep-research) | "
            "Open Source AI Research Assistant*"
        )
        return markdown_content + footer

    def _sanitize_metadata(self, value: str) -> str:
        """Sanitize a metadata value before embedding it in a pandoc argument.

        Every ``--`` sequence is removed and line breaks (``\\r\\n``,
        ``\\r``, ``\\n``) are each replaced by a single space, so the value
        stays on one line inside the ``--metadata=KEY:VALUE`` argument.

        Args:
            value: The metadata value to sanitize. Non-string values
                (for example a date object) are converted with ``str()``.

        Returns:
            Sanitized single-line metadata value.
        """
        text = str(value)
        # Remove potential argument injection patterns
        text = text.replace("--", "")
        # Handle "\r\n" first so a Windows line ending yields one space.
        for line_break in ("\r\n", "\r", "\n"):
            text = text.replace(line_break, " ")
        return text

195 safe_value = value.replace("--", "").replace("\n", " ") 

196 return safe_value