Coverage for src / local_deep_research / exporters / base.py: 93%

44 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1"""Base classes for document exporters. 

2 

3This module provides the abstract base class and data structures that all 

4exporters must implement to participate in the export system. 

5""" 

6 

7import re 

8from abc import ABC, abstractmethod 

9from dataclasses import dataclass, field 

10from typing import Any, Dict, Optional 

11 

12 

@dataclass
class ExportResult:
    """Outcome of a single export operation.

    Attributes:
        content: The exported document as raw bytes.
        filename: The filename to associate with the exported document.
        mimetype: The MIME type of the exported document.
    """

    content: bytes
    filename: str
    mimetype: str

20 

21 

@dataclass
class ExportOptions:
    """Options shared by every exporter.

    All fields are optional, so ``ExportOptions()`` is a valid "no options"
    value.

    Attributes:
        title: Optional document title.
        metadata: Optional metadata dict (author, date, etc.).
        custom_options: Format-specific options (e.g., custom_css for PDF).
            Each instance gets its own fresh, empty dict by default.
    """

    title: Optional[str] = None
    metadata: Optional[Dict[str, Any]] = None
    # default_factory gives every instance its own dict rather than a shared one.
    custom_options: Optional[Dict[str, Any]] = field(default_factory=dict)

35 

36 

class BaseExporter(ABC):
    """Abstract base class for document exporters.

    All exporters must inherit from this class and implement the required
    abstract members (``format_name``, ``file_extension``, ``mimetype`` and
    ``export``) to participate in the export registry.

    Example:
        class MyExporter(BaseExporter):
            @property
            def format_name(self) -> str:
                return "myformat"

            @property
            def file_extension(self) -> str:
                return ".myf"

            @property
            def mimetype(self) -> str:
                return "application/x-myformat"

            def export(self, markdown_content, options=None) -> ExportResult:
                # Implementation here
                ...
    """

    # Maximum content size (50 MB) to prevent OOM errors
    MAX_CONTENT_SIZE = 50 * 1024 * 1024

    @property
    @abstractmethod
    def format_name(self) -> str:
        """Return the format identifier (e.g., 'pdf', 'odt', 'latex').

        This is used to look up the exporter in the registry.
        """

    @property
    @abstractmethod
    def file_extension(self) -> str:
        """Return the file extension including the dot (e.g., '.pdf', '.odt')."""

    @property
    @abstractmethod
    def mimetype(self) -> str:
        """Return the MIME type for the exported file."""

    def _validate_content_size(self, content: str) -> None:
        """Validate that content does not exceed the maximum size limit.

        The limit is compared against ``len(content)``, i.e. the number of
        characters in the string, not its encoded byte length.

        Args:
            content: The content string to validate

        Raises:
            ValueError: If content exceeds MAX_CONTENT_SIZE
        """
        if len(content) > self.MAX_CONTENT_SIZE:
            raise ValueError(
                f"Content exceeds maximum size of "
                f"{self.MAX_CONTENT_SIZE // (1024 * 1024)} MB"
            )

    @abstractmethod
    def export(
        self,
        markdown_content: str,
        options: "Optional[ExportOptions]" = None,
    ) -> "ExportResult":
        """Export markdown content to the target format.

        Args:
            markdown_content: The markdown text to convert
            options: Optional export options

        Returns:
            ExportResult with content bytes, filename, and mimetype
        """

    def _generate_safe_filename(self, title: Optional[str]) -> str:
        """Generate a safe filename from the title.

        Characters other than word characters, whitespace and hyphens are
        dropped, surrounding whitespace is trimmed, every remaining
        whitespace character (spaces, tabs, newlines) becomes an underscore,
        and the result is capped at 50 characters. If no title is given, or
        nothing usable is left after sanitizing, ``research_report`` is used.

        Args:
            title: Optional title to use in the filename

        Returns:
            A sanitized filename with the appropriate extension
        """
        fallback = "research_report"
        if not title:
            return f"{fallback}{self.file_extension}"

        stripped = re.sub(r"[^\w\s-]", "", title).strip()
        # Replace *every* whitespace character, not just " ", so tabs and
        # newlines from the title can never end up inside the filename.
        safe_title = re.sub(r"\s", "_", stripped)[:50]
        # A title made only of removed characters (e.g. "???") would otherwise
        # yield a bare extension such as ".pdf".
        if not safe_title:
            safe_title = fallback
        return f"{safe_title}{self.file_extension}"

    def _prepend_title_if_needed(
        self, content: str, title: Optional[str]
    ) -> str:
        """Prepend title as H1 heading if content doesn't already have one.

        This method is used by exporters that render markdown documents
        (like PDF and ODT) to ensure the title appears in the output.
        Exporters that don't render documents (like RIS) should not use this.

        Args:
            content: The markdown content
            title: Optional title to prepend

        Returns:
            Content with title prepended if needed, otherwise unchanged
        """
        if not title:
            return content
        # Content whose first non-whitespace character is "#" is treated as
        # already starting with a heading. This also covers content that
        # already begins with "# <title>", so no separate check is needed.
        if content.lstrip().startswith("#"):
            return content
        return f"# {title}\n\n{content}"