Coverage for src / local_deep_research / exporters / odt_exporter.py: 91%
66 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-25 01:07 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-25 01:07 +0000
1"""ODT export service using pypandoc.
3This module provides the ODTExporter class for converting markdown content
4to OpenDocument Text (ODT) format using pypandoc (Pandoc wrapper).
6Pandoc is the industry standard for document conversion and handles all
7markdown features natively. Note: Requires Pandoc to be installed on the
8system or use pypandoc_binary which bundles it.
9"""
import subprocess
from typing import Optional

from loguru import logger

# pypandoc is optional - it is used only to locate the pandoc binary
# (pypandoc.get_pandoc_path()); the conversion itself runs pandoc through
# subprocess. When the import fails the module still loads, and
# PYPANDOC_AVAILABLE records that ODT export cannot run.
try:
    import pypandoc

    PYPANDOC_AVAILABLE = True
except ImportError:
    # Bind the name anyway so the module-level attribute always exists.
    pypandoc = None
    PYPANDOC_AVAILABLE = False

from .base import BaseExporter, ExportOptions, ExportResult
from .registry import ExporterRegistry

# Maximum markdown content size accepted by ODTExporter.export
# (50 * 1024 * 1024, i.e. 50 MB); larger inputs are rejected with a
# ValueError to prevent OOM errors.
MAX_CONTENT_SIZE = 50 * 1024 * 1024
@ExporterRegistry.register
class ODTExporter(BaseExporter):
    """Exporter that converts markdown to OpenDocument Text (ODT) via Pandoc.

    pypandoc is used only to locate the pandoc executable
    (``pypandoc.get_pandoc_path()``); the conversion itself is a direct
    ``subprocess.run`` call on that binary. The resulting ODT file can be
    opened in LibreOffice Writer, Microsoft Word, and other office
    applications.

    Pandoc is the industry standard for document conversion and handles
    all markdown features (tables, code blocks, lists, etc.) natively.

    The conversion is performed entirely in memory by piping markdown
    to pandoc's stdin and capturing ODT output from stdout.
    """

    @property
    def format_name(self) -> str:
        """Short identifier of the format produced by this exporter."""
        return "odt"

    @property
    def file_extension(self) -> str:
        """File extension (including the leading dot) of exported files."""
        return ".odt"

    @property
    def mimetype(self) -> str:
        """MIME type of OpenDocument Text files."""
        return "application/vnd.oasis.opendocument.text"

    def export(
        self,
        markdown_content: str,
        options: Optional[ExportOptions] = None,
    ) -> ExportResult:
        """Convert markdown content to ODT using Pandoc.

        The conversion runs entirely in memory: markdown is piped to
        pandoc's stdin and ODT bytes are captured from stdout, avoiding
        any temporary files on disk.

        Args:
            markdown_content: The markdown text to convert
            options: Optional export options (title, metadata). The
                "author" and "date" keys of ``options.metadata`` are
                forwarded to pandoc as document metadata.

        Returns:
            ExportResult with ODT content as bytes, filename, and mimetype

        Raises:
            ValueError: If the UTF-8 encoded content exceeds
                MAX_CONTENT_SIZE bytes
            RuntimeError: If pypandoc is not installed, or if Pandoc exits
                with an error or produces no output

        Any other exception is logged and re-raised unchanged.
        """
        try:
            # Check if pypandoc is available
            if not PYPANDOC_AVAILABLE:
                raise RuntimeError(
                    "ODT export requires pypandoc. Install with: pip install pypandoc-binary"
                )

            # The limit is expressed in bytes ("MB"), so measure the UTF-8
            # encoding rather than the number of code points: multi-byte
            # text can be up to four times larger than len() suggests.
            # The character count is a lower bound on the encoded size,
            # so it is tested first to avoid encoding oversized input.
            if (
                len(markdown_content) > MAX_CONTENT_SIZE
                or len(markdown_content.encode("utf-8")) > MAX_CONTENT_SIZE
            ):
                raise ValueError(
                    f"Content exceeds maximum size of "
                    f"{MAX_CONTENT_SIZE // (1024 * 1024)} MB"
                )

            options = options or ExportOptions()

            # Prepend title if needed (for document formats like ODT)
            markdown_content = self._prepend_title_if_needed(
                markdown_content, options.title
            )

            # Add LDR attribution footer
            content_with_footer = self._add_footer(markdown_content)

            # Convert in memory: pipe markdown via stdin, capture ODT from
            # stdout ("-o -" makes pandoc write the document to stdout).
            pandoc_path = pypandoc.get_pandoc_path()
            cmd = [pandoc_path, "-f", "markdown", "-t", "odt", "-o", "-"]
            cmd.extend(self._build_metadata_args(options))

            # The command is an argv list (no shell); check=True turns a
            # non-zero pandoc exit status into CalledProcessError.
            result = subprocess.run(
                cmd,
                input=content_with_footer.encode("utf-8"),
                capture_output=True,
                check=True,
            )

            odt_bytes = result.stdout

            if not odt_bytes:
                raise RuntimeError(
                    "Pandoc conversion failed - no output produced"
                )

            filename = self._generate_safe_filename(options.title)

            logger.info(
                f"Generated ODT in memory, size: {len(odt_bytes)} bytes"
            )

            return ExportResult(
                content=odt_bytes,
                filename=filename,
                mimetype=self.mimetype,
            )

        except subprocess.CalledProcessError as e:
            # Surface pandoc's own diagnostics in the raised error.
            stderr = (
                e.stderr.decode("utf-8", errors="replace")
                if e.stderr
                else "unknown error"
            )
            logger.exception(f"Pandoc conversion failed: {stderr}")
            raise RuntimeError(f"Pandoc conversion failed: {stderr}") from e
        except Exception:
            logger.exception("Error generating ODT")
            raise

    def _build_metadata_args(self, options: ExportOptions) -> list:
        """Build the ``--metadata`` arguments for the pandoc command line.

        Args:
            options: Export options providing ``title`` and ``metadata``

        Returns:
            One ``--metadata=<key>:<value>`` string for each of title,
            author and date (in that order) that has a truthy value,
            with every value passed through ``_sanitize_metadata``
        """
        candidates = [("title", options.title)]
        if options.metadata:
            candidates.extend(
                (key, options.metadata.get(key)) for key in ("author", "date")
            )
        return [
            f"--metadata={key}:{self._sanitize_metadata(value)}"
            for key, value in candidates
            if value
        ]

    def _add_footer(self, markdown_content: str) -> str:
        """Add LDR attribution footer to markdown content.

        Args:
            markdown_content: The original markdown text

        Returns:
            Markdown with a horizontal rule and the attribution line
            appended
        """
        footer = (
            "\n\n---\n\n"
            "*Generated by [LDR - Local Deep Research]"
            "(https://github.com/LearningCircuit/local-deep-research) | "
            "Open Source AI Research Assistant*"
        )
        return markdown_content + footer

    def _sanitize_metadata(self, value: str) -> str:
        """Sanitize metadata value to prevent argument injection.

        Removes every "--" sequence and replaces newlines with spaces in
        user-supplied metadata values.

        Args:
            value: The metadata value to sanitize

        Returns:
            Sanitized metadata value safe for pandoc arguments
        """
        # Remove potential argument injection patterns
        safe_value = value.replace("--", "").replace("\n", " ")
        return safe_value