Coverage for src/local_deep_research/document_loaders/yaml_loader.py: 100%

44 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-02-25 01:07 +0000

1""" 

2Custom YAML document loader. 

3 

4LangChain doesn't have a dedicated YAML loader, so we provide one here. 

5This loader reads YAML files and converts them to readable text documents. 

6""" 

7 

8from pathlib import Path 

9from typing import Iterator 

10 

11import yaml 

12from langchain_core.document_loaders import BaseLoader 

13from langchain_core.documents import Document 

14from loguru import logger 

15 

16 

class YAMLLoader(BaseLoader):
    """
    Load a YAML file and convert it to a LangChain Document.

    The file is parsed with ``yaml.safe_load`` and re-serialised as
    block-style YAML text, so the resulting document is readable and suitable
    for semantic search. Nested structures keep their YAML form.
    """

    def __init__(self, file_path: str | Path, encoding: str = "utf-8"):
        """
        Initialize the YAML loader.

        Args:
            file_path: Path to the YAML file
            encoding: File encoding (default: utf-8)
        """
        self.file_path = Path(file_path)
        self.encoding = encoding

    def lazy_load(self) -> Iterator[Document]:
        """
        Lazily load the YAML file as a document.

        Yields:
            At most one Document:
            - the normalised YAML text with ``file_type: "yaml"`` metadata
              when the file parses;
            - the raw file text with ``parse_error: True`` metadata when the
              content cannot be parsed;
            - nothing when the file parses to ``None`` (empty document).

        Raises:
            Exception: Any error other than a parse failure (for example a
                missing file or a decoding error) is logged and re-raised.
        """
        try:
            with open(self.file_path, encoding=self.encoding) as f:
                content = f.read()

            # Parse YAML to validate and normalize
            try:
                data = yaml.safe_load(content)
            except (yaml.YAMLError, ValueError):
                # yaml.safe_load raises plain ValueError (not YAMLError) while
                # constructing some scalars, e.g. a date-shaped value such as
                # 2001-02-30 or an integer literal longer than Python's
                # int-conversion digit limit. Those are parse failures too, so
                # they take the same raw-content fallback.
                logger.exception(f"Invalid YAML in {self.file_path}")
                # Still yield the raw content if parsing fails
                yield Document(
                    page_content=content,
                    metadata={
                        "source": str(self.file_path),
                        "parse_error": True,
                    },
                )
                return

            # Skip empty YAML documents
            if data is None:
                logger.debug(f"Skipping empty YAML document: {self.file_path}")
                return

            # Convert back to formatted YAML for readability.
            # default_flow_style=False gives block-style output;
            # sort_keys=False keeps the key order produced by the parser.
            text = yaml.dump(
                data,
                default_flow_style=False,
                allow_unicode=True,
                sort_keys=False,
            )

            yield Document(
                page_content=text,
                metadata={
                    "source": str(self.file_path),
                    "file_type": "yaml",
                },
            )

        except Exception:
            # Record which file failed before letting the error propagate.
            logger.exception(f"Error loading YAML file: {self.file_path}")
            raise

    def load(self) -> list[Document]:
        """Load all documents from the YAML file."""
        return list(self.lazy_load())

91 

92 

def extract_text_from_yaml(content: bytes, encoding: str = "utf-8") -> str:
    """
    Extract text from YAML bytes content.

    This is a convenience function for use with bytes (e.g., from file uploads).

    Args:
        content: YAML content as bytes
        encoding: Text encoding (default: utf-8)

    Returns:
        Block-style YAML text when the content parses, an empty string when
        it parses to ``None`` (empty document), or the decoded raw text when
        it cannot be parsed.
    """
    try:
        text = content.decode(encoding)
    except UnicodeDecodeError:
        # Best effort: drop the undecodable bytes instead of failing.
        text = content.decode(encoding, errors="ignore")
        logger.warning(
            "YAML content had encoding errors, some characters may be lost"
        )

    try:
        data = yaml.safe_load(text)
        # Skip empty YAML documents
        if data is None:
            return ""
        return yaml.dump(
            data,
            default_flow_style=False,
            allow_unicode=True,
            sort_keys=False,
        )
    except (yaml.YAMLError, ValueError):
        # yaml.safe_load raises plain ValueError (not YAMLError) while
        # constructing some scalars, e.g. a date-shaped value such as
        # 2001-02-30 or an integer literal longer than Python's
        # int-conversion digit limit. Treat those as parse failures so the
        # caller still gets the raw text back.
        logger.exception("Invalid YAML content")
        # Return raw text if parsing fails
        return text