Coverage for src / local_deep_research / exporters / base.py: 93%
44 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
"""Base classes for document exporters.

This module provides the abstract base class and data structures that all
exporters must implement to participate in the export system.
"""
7import re
8from abc import ABC, abstractmethod
9from dataclasses import dataclass, field
10from typing import Any, Dict, Optional
@dataclass
class ExportResult:
    """Result of an export operation.

    Attributes:
        content: The exported document as bytes
        filename: Filename for the exported file
        mimetype: MIME type for the exported file
    """

    content: bytes
    filename: str
    mimetype: str
@dataclass
class ExportOptions:
    """Common options for all exporters.

    Attributes:
        title: Optional document title
        metadata: Optional metadata dict (author, date, etc.)
        custom_options: Format-specific options (e.g., custom_css for PDF)
    """

    title: Optional[str] = None
    metadata: Optional[Dict[str, Any]] = None
    # default_factory builds a fresh dict per instance, so options set on one
    # ExportOptions object never leak into another.
    custom_options: Optional[Dict[str, Any]] = field(default_factory=dict)
class BaseExporter(ABC):
    """Abstract base class for document exporters.

    All exporters must inherit from this class and implement the required
    abstract methods to participate in the export registry.

    Example:
        class MyExporter(BaseExporter):
            @property
            def format_name(self) -> str:
                return "myformat"

            @property
            def file_extension(self) -> str:
                return ".myf"

            @property
            def mimetype(self) -> str:
                return "application/x-myformat"

            def export(self, markdown_content, options=None) -> ExportResult:
                # Implementation here
                ...
    """

    # Maximum content size (50 MB) to prevent OOM errors
    MAX_CONTENT_SIZE = 50 * 1024 * 1024

    @property
    @abstractmethod
    def format_name(self) -> str:
        """Return the format identifier (e.g., 'pdf', 'odt', 'latex').

        This is used to look up the exporter in the registry.
        """

    @property
    @abstractmethod
    def file_extension(self) -> str:
        """Return the file extension including the dot (e.g., '.pdf', '.odt')."""

    @property
    @abstractmethod
    def mimetype(self) -> str:
        """Return the MIME type for the exported file."""

    def _validate_content_size(self, content: str) -> None:
        """Validate that content does not exceed the maximum size limit.

        The comparison uses ``len(content)``, i.e. the number of characters
        in the string, not its encoded byte length.

        Args:
            content: The content string to validate

        Raises:
            ValueError: If content exceeds MAX_CONTENT_SIZE
        """
        if len(content) > self.MAX_CONTENT_SIZE:
            raise ValueError(
                f"Content exceeds maximum size of "
                f"{self.MAX_CONTENT_SIZE // (1024 * 1024)} MB"
            )

    @abstractmethod
    def export(
        self,
        markdown_content: str,
        options: "Optional[ExportOptions]" = None,
    ) -> "ExportResult":
        """Export markdown content to the target format.

        Args:
            markdown_content: The markdown text to convert
            options: Optional export options

        Returns:
            ExportResult with content bytes, filename, and mimetype
        """

    def _generate_safe_filename(self, title: Optional[str]) -> str:
        """Generate a safe filename from the title.

        Characters other than word characters, whitespace and hyphens are
        removed, surrounding whitespace is stripped, every remaining
        whitespace character becomes an underscore, and the result is
        truncated to 50 characters. If no title is given, or nothing is left
        after sanitizing, "research_report" is used instead.

        Args:
            title: Optional title to use in the filename

        Returns:
            A sanitized filename with the appropriate extension
        """
        safe_title = ""
        if title:
            cleaned = re.sub(r"[^\w\s-]", "", title).strip()
            # Replace every whitespace character (tabs and newlines too, not
            # only spaces) so none of them can end up in the filename.
            safe_title = re.sub(r"\s", "_", cleaned)[:50]
        if not safe_title:
            # Covers a missing title as well as one made up entirely of
            # characters that were stripped; avoids a bare ".ext" filename.
            safe_title = "research_report"
        return f"{safe_title}{self.file_extension}"

    def _prepend_title_if_needed(
        self, content: str, title: Optional[str]
    ) -> str:
        """Prepend title as H1 heading if content doesn't already have one.

        This method is used by exporters that render markdown documents
        (like PDF and ODT) to ensure the title appears in the output.
        Exporters that don't render documents (like RIS) should not use this.

        Args:
            content: The markdown content
            title: Optional title to prepend

        Returns:
            Content with title prepended if needed, otherwise unchanged
        """
        # Content whose first non-whitespace character is "#" is treated as
        # already starting with a heading (this includes "# <title>" itself),
        # so it is returned untouched.
        if not title or content.lstrip().startswith("#"):
            return content
        return f"# {title}\n\n{content}"