Coverage for src / local_deep_research / exporters / base.py: 92%

40 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-02-25 01:07 +0000

1"""Base classes for document exporters. 

2 

3This module provides the abstract base class and data structures that all 

4exporters must implement to participate in the export system. 

5""" 

6 

7import re 

8from abc import ABC, abstractmethod 

9from dataclasses import dataclass, field 

10from typing import Any, Dict, Optional 

11 

12 

@dataclass
class ExportResult:
    """Outcome of a single export operation.

    Bundles the exported document with the filename and MIME type that
    belong to it.
    """

    # Raw bytes of the exported document.
    content: bytes
    # Filename for the exported document.
    filename: str
    # MIME type string of the exported document.
    mimetype: str

20 

21 

@dataclass
class ExportOptions:
    """Options shared by every exporter.

    Attributes:
        title: Document title, or None when no title is supplied.
        metadata: Metadata mapping (author, date, etc.), or None.
        custom_options: Format-specific settings (e.g., custom_css for PDF).
            Each instance gets its own empty dict by default.
    """

    title: Optional[str] = None
    metadata: Optional[Dict[str, Any]] = None
    # default_factory gives every instance a fresh dict rather than a shared one.
    custom_options: Optional[Dict[str, Any]] = field(default_factory=dict)

35 

36 

class BaseExporter(ABC):
    """Abstract base class for document exporters.

    All exporters must inherit from this class and implement the required
    abstract methods to participate in the export registry.

    Example:
        class MyExporter(BaseExporter):
            @property
            def format_name(self) -> str:
                return "myformat"

            @property
            def file_extension(self) -> str:
                return ".myf"

            @property
            def mimetype(self) -> str:
                return "application/x-myformat"

            def export(self, markdown_content, options=None) -> ExportResult:
                # Implementation here
                ...
    """

    @property
    @abstractmethod
    def format_name(self) -> str:
        """Return the format identifier (e.g., 'pdf', 'odt', 'latex').

        This is used to look up the exporter in the registry.
        """

    @property
    @abstractmethod
    def file_extension(self) -> str:
        """Return the file extension including the dot (e.g., '.pdf', '.odt')."""

    @property
    @abstractmethod
    def mimetype(self) -> str:
        """Return the MIME type for the exported file."""

    @abstractmethod
    def export(
        self,
        markdown_content: str,
        options: "Optional[ExportOptions]" = None,
    ) -> "ExportResult":
        """Export markdown content to the target format.

        Args:
            markdown_content: The markdown text to convert
            options: Optional export options

        Returns:
            ExportResult with content bytes, filename, and mimetype
        """

    def _generate_safe_filename(self, title: Optional[str]) -> str:
        """Generate a safe filename from the title.

        Characters other than word characters, whitespace and hyphens are
        removed, surrounding whitespace is trimmed, every remaining
        whitespace character becomes an underscore, and the result is cut
        to 50 characters. If nothing usable remains (no title, or a title
        made up only of removed characters), "research_report" is used.

        Args:
            title: Optional title to use in the filename

        Returns:
            A sanitized filename with the appropriate extension
        """
        fallback_stem = "research_report"
        if not title:
            return f"{fallback_stem}{self.file_extension}"

        cleaned = re.sub(r"[^\w\s-]", "", title).strip()
        # Replace each whitespace character, not only spaces: tabs and
        # newlines pass the filter above and must not reach the filename.
        # One-for-one replacement keeps "a  b" -> "a__b" as before.
        stem = re.sub(r"\s", "_", cleaned)[:50]
        if not stem:
            # A title such as "!!!" sanitizes to nothing; avoid returning
            # a bare extension like ".pdf".
            stem = fallback_stem
        return f"{stem}{self.file_extension}"

    def _prepend_title_if_needed(
        self, content: str, title: Optional[str]
    ) -> str:
        """Prepend title as H1 heading if content doesn't already have one.

        This method is used by exporters that render markdown documents
        (like PDF and ODT) to ensure the title appears in the output.
        Exporters that don't render documents (like RIS) should not use this.

        Content whose first non-whitespace character is "#" is treated as
        already starting with a heading and is returned unchanged.

        Args:
            content: The markdown content
            title: Optional title to prepend

        Returns:
            Content with title prepended if needed, otherwise unchanged
        """
        if not title:
            return content
        # Don't prepend if content already starts with this title
        if content.startswith(f"# {title}"):
            return content
        # Leave content alone when it already opens with any heading marker
        if content.lstrip().startswith("#"):
            return content
        return f"# {title}\n\n{content}"