Coverage for src / local_deep_research / exporters / odt_exporter.py: 91%

63 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1"""ODT export service using pypandoc. 

2 

3This module provides the ODTExporter class for converting markdown content 

4to OpenDocument Text (ODT) format using pypandoc (Pandoc wrapper). 

5 

6Pandoc is the industry standard for document conversion and handles all 

7markdown features natively. Note: Requires Pandoc to be installed on the 

8system or use pypandoc_binary which bundles it. 

9""" 

10 

11import subprocess 

12from typing import Optional 

13 

14from loguru import logger 

15 

16# pypandoc is optional - used only to locate the bundled pandoc binary 

17try: 

18 import pypandoc 

19 

20 PYPANDOC_AVAILABLE = True 

21except ImportError: 

22 pypandoc = None 

23 PYPANDOC_AVAILABLE = False 

24 

25from .base import BaseExporter, ExportOptions, ExportResult 

26from .registry import ExporterRegistry 

27 

28 

@ExporterRegistry.register
class ODTExporter(BaseExporter):
    """Service for converting markdown to ODT using Pandoc.

    The pandoc executable is located through pypandoc and then invoked
    directly as a subprocess. The resulting OpenDocument Text file can be
    opened in LibreOffice Writer, Microsoft Word, and other office
    applications.

    Pandoc is the industry standard for document conversion and handles
    all markdown features (tables, code blocks, lists, etc.) natively.

    The conversion is performed entirely in memory by piping markdown
    to pandoc's stdin and capturing ODT output from stdout.
    """

    @property
    def format_name(self) -> str:
        """Short identifier of the format produced by this exporter."""
        return "odt"

    @property
    def file_extension(self) -> str:
        """File extension (including the leading dot) for exported files."""
        return ".odt"

    @property
    def mimetype(self) -> str:
        """MIME type reported for the exported ODT content."""
        return "application/vnd.oasis.opendocument.text"

    def export(
        self,
        markdown_content: str,
        options: Optional[ExportOptions] = None,
    ) -> ExportResult:
        """Convert markdown content to ODT using Pandoc.

        The conversion runs entirely in memory: markdown is piped to
        pandoc's stdin and ODT bytes are captured from stdout, avoiding
        any temporary files on disk.

        Args:
            markdown_content: The markdown text to convert
            options: Optional export options (title, metadata). When omitted,
                a default ``ExportOptions()`` is used.

        Returns:
            ExportResult with ODT content as bytes, filename, and mimetype

        Raises:
            ValueError: If content exceeds maximum size limit (raised by the
                size validation helper)
            RuntimeError: If pypandoc is not installed, if pandoc exits with
                a non-zero status, or if pandoc produces no output
        """
        # pypandoc is an optional dependency; without it the pandoc binary
        # cannot be located, so fail early with an actionable message.
        if not PYPANDOC_AVAILABLE:
            raise RuntimeError(
                "ODT export requires pypandoc. Install with: pip install pypandoc-binary"
            )

        try:
            # Check content size limit to prevent OOM errors
            self._validate_content_size(markdown_content)

            options = options or ExportOptions()

            # Prepend title if needed (for document formats like ODT)
            markdown_content = self._prepend_title_if_needed(
                markdown_content, options.title
            )

            # Add LDR attribution footer
            content_with_footer = self._add_footer(markdown_content)

            # Convert in memory: pipe markdown via stdin, capture ODT from
            # stdout ("-o -" tells pandoc to write the binary output there).
            pandoc_path = pypandoc.get_pandoc_path()
            cmd = [pandoc_path, "-f", "markdown", "-t", "odt", "-o", "-"]
            cmd.extend(self._build_metadata_args(options))

            # The command is an argument list (no shell), and check=True
            # turns a non-zero pandoc exit status into CalledProcessError.
            result = subprocess.run(
                cmd,
                input=content_with_footer.encode("utf-8"),
                capture_output=True,
                check=True,
            )

            odt_bytes = result.stdout

            if not odt_bytes:
                raise RuntimeError(  # noqa: TRY301 — except only adds logging before re-raise
                    "Pandoc conversion failed - no output produced"
                )

            filename = self._generate_safe_filename(options.title)

            logger.info(
                f"Generated ODT in memory, size: {len(odt_bytes)} bytes"
            )

            return ExportResult(
                content=odt_bytes,
                filename=filename,
                mimetype=self.mimetype,
            )

        except subprocess.CalledProcessError as e:
            # Surface pandoc's own diagnostics; stderr is bytes because the
            # subprocess was run without text mode.
            stderr = (
                e.stderr.decode("utf-8", errors="replace")
                if e.stderr
                else "unknown error"
            )
            logger.exception(f"Pandoc conversion failed: {stderr}")
            raise RuntimeError(f"Pandoc conversion failed: {stderr}") from e
        except Exception:
            # Log with traceback, then let the original exception propagate.
            logger.exception("Error generating ODT")
            raise

    def _build_metadata_args(self, options: ExportOptions) -> list:
        """Build sanitized pandoc ``--metadata`` flags from export options.

        Args:
            options: Export options whose ``title`` and ``metadata`` mapping
                (``author`` and ``date`` keys) are inspected.

        Returns:
            List of ``--metadata=KEY:VALUE`` arguments in the order title,
            author, date. Missing or empty values are skipped.
        """
        metadata_args = []
        if options.title:
            safe_title = self._sanitize_metadata(options.title)
            metadata_args.append(f"--metadata=title:{safe_title}")

        extra_metadata = options.metadata
        if extra_metadata:
            for key in ("author", "date"):
                raw_value = extra_metadata.get(key)
                if raw_value:
                    safe_value = self._sanitize_metadata(raw_value)
                    metadata_args.append(f"--metadata={key}:{safe_value}")
        return metadata_args

    def _add_footer(self, markdown_content: str) -> str:
        """Add LDR attribution footer to markdown content.

        Args:
            markdown_content: The original markdown text

        Returns:
            Markdown with a horizontal rule and the attribution line appended
        """
        footer = (
            "\n\n---\n\n"
            "*Generated by [LDR - Local Deep Research]"
            "(https://github.com/LearningCircuit/local-deep-research) | "
            "Open Source AI Research Assistant*"
        )
        return markdown_content + footer

    def _sanitize_metadata(self, value: str) -> str:
        """Sanitize metadata value to prevent argument injection.

        Removes every ``--`` sequence and flattens line breaks into spaces so
        that a user-supplied value stays a single-line metadata string.

        Args:
            value: The metadata value to sanitize

        Returns:
            Sanitized metadata value safe for pandoc arguments
        """
        without_dashes = value.replace("--", "")
        # Treat "\r\n" as one break first so it becomes a single space, then
        # handle bare carriage returns and newlines.
        return (
            without_dashes.replace("\r\n", " ")
            .replace("\r", " ")
            .replace("\n", " ")
        )