Coverage for src / local_deep_research / exporters / odt_exporter.py: 91%
63 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
1"""ODT export service using pypandoc.
3This module provides the ODTExporter class for converting markdown content
4to OpenDocument Text (ODT) format using pypandoc (Pandoc wrapper).
6Pandoc is the industry standard for document conversion and handles all
7markdown features natively. Note: Requires Pandoc to be installed on the
8system or use pypandoc_binary which bundles it.
9"""
11import subprocess
12from typing import Optional
14from loguru import logger
16# pypandoc is optional - used only to locate the bundled pandoc binary
17try:
18 import pypandoc
20 PYPANDOC_AVAILABLE = True
21except ImportError:
22 pypandoc = None
23 PYPANDOC_AVAILABLE = False
25from .base import BaseExporter, ExportOptions, ExportResult
26from .registry import ExporterRegistry
@ExporterRegistry.register
class ODTExporter(BaseExporter):
    """Exporter that converts markdown to OpenDocument Text (ODT) via Pandoc.

    pypandoc is used only to locate the pandoc executable
    (``pypandoc.get_pandoc_path()``); the conversion itself is done by
    running that executable with ``subprocess.run``.

    The conversion is performed entirely in memory: markdown is piped to
    pandoc's stdin and the ODT bytes are captured from its stdout, so no
    temporary files are written by this class.
    """

    @property
    def format_name(self) -> str:
        """Return the format identifier, ``"odt"``."""
        return "odt"

    @property
    def file_extension(self) -> str:
        """Return the file extension, including the leading dot."""
        return ".odt"

    @property
    def mimetype(self) -> str:
        """Return the MIME type used for ODT documents."""
        return "application/vnd.oasis.opendocument.text"

    def export(
        self,
        markdown_content: str,
        options: Optional[ExportOptions] = None,
    ) -> ExportResult:
        """Convert markdown content to ODT using Pandoc.

        The markdown gets an optional title prepended and an attribution
        footer appended, then is piped to pandoc; the produced ODT bytes
        are returned without touching the disk.

        Args:
            markdown_content: The markdown text to convert.
            options: Optional export options (title, metadata). The
                ``author`` and ``date`` entries of ``options.metadata``
                are forwarded to pandoc as document metadata.

        Returns:
            ExportResult with the ODT content as bytes, a filename, and
            this exporter's mimetype.

        Raises:
            ValueError: If content exceeds the maximum size limit (raised
                by ``_validate_content_size``, defined outside this class).
            RuntimeError: If pypandoc is not installed, if pandoc exits
                with a non-zero status, or if pandoc produces no output.
        """
        # pypandoc is needed to find the pandoc binary; fail early without it.
        if not PYPANDOC_AVAILABLE:
            raise RuntimeError(
                "ODT export requires pypandoc. Install with: pip install pypandoc-binary"
            )

        try:
            # Check content size limit to prevent OOM errors
            self._validate_content_size(markdown_content)

            options = options or ExportOptions()

            # Prepend title if needed (for document formats like ODT)
            markdown_content = self._prepend_title_if_needed(
                markdown_content, options.title
            )

            # Add LDR attribution footer
            content_with_footer = self._add_footer(markdown_content)

            # Convert in memory: pipe markdown via stdin, capture ODT from stdout.
            # "-o -" tells pandoc to write the document to stdout.
            pandoc_path = pypandoc.get_pandoc_path()
            cmd = [pandoc_path, "-f", "markdown", "-t", "odt", "-o", "-"]
            cmd.extend(self._build_metadata_args(options))

            # The command is a list (no shell), so metadata values cannot
            # be split into extra arguments by whitespace or quoting.
            result = subprocess.run(
                cmd,
                input=content_with_footer.encode("utf-8"),
                capture_output=True,
                check=True,
            )

            odt_bytes = result.stdout

            if not odt_bytes:
                raise RuntimeError(  # noqa: TRY301 — except only adds logging before re-raise
                    "Pandoc conversion failed - no output produced"
                )

            filename = self._generate_safe_filename(options.title)

            logger.info(
                f"Generated ODT in memory, size: {len(odt_bytes)} bytes"
            )

            return ExportResult(
                content=odt_bytes,
                filename=filename,
                mimetype=self.mimetype,
            )

        except subprocess.CalledProcessError as e:
            # Surface pandoc's own diagnostics; stderr is bytes because the
            # process was run without text mode.
            stderr = (
                e.stderr.decode("utf-8", errors="replace")
                if e.stderr
                else "unknown error"
            )
            logger.exception(f"Pandoc conversion failed: {stderr}")
            raise RuntimeError(f"Pandoc conversion failed: {stderr}") from e
        except Exception:
            # Log with traceback, then let the original exception propagate.
            logger.exception("Error generating ODT")
            raise

    def _build_metadata_args(self, options: ExportOptions) -> list:
        """Build the ``--metadata=...`` pandoc arguments for *options*.

        Args:
            options: Export options whose ``title`` and ``metadata``
                (``author``, ``date``) are turned into pandoc arguments.

        Returns:
            List of sanitized ``--metadata=key:value`` strings in the order
            title, author, date; empty or missing values are skipped.
        """
        args = []
        if options.title:
            args.append(
                f"--metadata=title:{self._sanitize_metadata(options.title)}"
            )

        metadata = options.metadata or {}
        for key in ("author", "date"):
            raw_value = metadata.get(key)
            if raw_value:
                args.append(
                    f"--metadata={key}:{self._sanitize_metadata(raw_value)}"
                )
        return args

    def _add_footer(self, markdown_content: str) -> str:
        """Add LDR attribution footer to markdown content.

        Args:
            markdown_content: The original markdown text

        Returns:
            Markdown with a horizontal rule and the attribution line appended
        """
        footer = (
            "\n\n---\n\n"
            "*Generated by [LDR - Local Deep Research]"
            "(https://github.com/LearningCircuit/local-deep-research) | "
            "Open Source AI Research Assistant*"
        )
        return markdown_content + footer

    def _sanitize_metadata(self, value: str) -> str:
        """Sanitize a metadata value before placing it in a pandoc argument.

        The value is coerced to ``str`` (metadata such as a date may not be
        a string), NUL characters are removed (``subprocess`` refuses
        arguments containing them), every ``--`` sequence is removed, and
        line breaks (``\\r\\n``, ``\\r``, ``\\n``) are replaced by a space so
        the value stays on a single line.

        Args:
            value: The metadata value to sanitize

        Returns:
            Sanitized single-line metadata value safe for pandoc arguments
        """
        text = "" if value is None else str(value)
        # Drop NULs first so "-\x00-" cannot turn into "--" afterwards.
        text = text.replace("\x00", "")
        # Remove potential argument injection patterns
        text = text.replace("--", "")
        return text.replace("\r\n", " ").replace("\r", " ").replace("\n", " ")