Coverage for src / local_deep_research / document_loaders / yaml_loader.py: 100%
44 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-25 01:07 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-25 01:07 +0000
1"""
2Custom YAML document loader.
4LangChain doesn't have a dedicated YAML loader, so we provide one here.
5This loader reads YAML files and converts them to readable text documents.
6"""
8from pathlib import Path
9from typing import Iterator
11import yaml
12from langchain_core.document_loaders import BaseLoader
13from langchain_core.documents import Document
14from loguru import logger
class YAMLLoader(BaseLoader):
    """
    Load YAML files and convert to LangChain Documents.

    The YAML content is rendered as formatted YAML text for human readability
    and semantic search. Complex structures are preserved in their YAML format.

    A file that holds several YAML documents separated by ``---`` is parsed
    in full and rendered into a single Document, with ``---`` kept between
    the individual YAML documents.
    """

    def __init__(self, file_path: str | Path, encoding: str = "utf-8"):
        """
        Initialize the YAML loader.

        Args:
            file_path: Path to the YAML file
            encoding: File encoding (default: utf-8)
        """
        self.file_path = Path(file_path)
        self.encoding = encoding

    def lazy_load(self) -> Iterator[Document]:
        """
        Lazily load documents from the YAML file.

        At most one Document is produced per file:

        * valid YAML -> the normalized block-style YAML text, with
          ``file_type: "yaml"`` in the metadata;
        * unparsable YAML -> the raw file content, with ``parse_error: True``
          in the metadata;
        * a file with no non-empty YAML document -> nothing is yielded.

        Yields:
            Document objects containing the YAML content as text

        Raises:
            Exception: Any error other than a YAML parse error (for example
                a failure to open or decode the file) is logged and re-raised.
        """
        try:
            with open(self.file_path, encoding=self.encoding) as f:
                content = f.read()

            # Parse YAML to validate and normalize.
            # safe_load_all is used instead of safe_load because safe_load
            # raises a YAMLError for any stream holding more than one
            # document, which would misreport valid multi-document files
            # as invalid. The generator is drained inside the try so that
            # parse errors in later documents are caught here as well.
            try:
                parsed = list(yaml.safe_load_all(content))
            except yaml.YAMLError:
                logger.exception(f"Invalid YAML in {self.file_path}")
                # Still yield the raw content if parsing fails
                yield Document(
                    page_content=content,
                    metadata={
                        "source": str(self.file_path),
                        "parse_error": True,
                    },
                )
                return

            # Empty documents parse to None; they carry no text to index.
            documents = [doc for doc in parsed if doc is not None]

            # Skip empty YAML documents
            if not documents:
                logger.debug(f"Skipping empty YAML document: {self.file_path}")
                return

            # Convert back to formatted YAML for readability.
            # Use default_flow_style=False for block-style output.
            # dump_all with a single document produces exactly what
            # yaml.dump would, so single-document output is unchanged.
            text = yaml.dump_all(
                documents,
                default_flow_style=False,
                allow_unicode=True,
                sort_keys=False,
            )

            yield Document(
                page_content=text,
                metadata={
                    "source": str(self.file_path),
                    "file_type": "yaml",
                },
            )

        except Exception:
            logger.exception(f"Error loading YAML file: {self.file_path}")
            raise

    def load(self) -> list[Document]:
        """Load all documents from the YAML file."""
        return list(self.lazy_load())
def extract_text_from_yaml(content: bytes, encoding: str = "utf-8") -> str:
    """
    Extract text from YAML bytes content.

    This is a convenience function for use with bytes (e.g., from file uploads).
    Content holding several YAML documents separated by ``---`` is parsed in
    full and rendered as one string with ``---`` between the documents.

    Args:
        content: YAML content as bytes
        encoding: Text encoding (default: utf-8)

    Returns:
        Formatted YAML text, an empty string when the content holds no
        non-empty YAML document, or the decoded raw text when the content
        is not valid YAML
    """
    try:
        text = content.decode(encoding)
    except UnicodeDecodeError:
        # Best effort: drop the undecodable bytes rather than fail the upload.
        text = content.decode(encoding, errors="ignore")
        logger.warning(
            "YAML content had encoding errors, some characters may be lost"
        )

    try:
        # safe_load_all is used instead of safe_load because safe_load
        # raises a YAMLError for any stream holding more than one document,
        # which would misreport valid multi-document content as invalid.
        # Empty documents parse to None and are dropped.
        documents = [
            doc for doc in yaml.safe_load_all(text) if doc is not None
        ]
        # Skip empty YAML documents
        if not documents:
            return ""
        # dump_all with a single document produces exactly what yaml.dump
        # would, so single-document output is unchanged.
        return yaml.dump_all(
            documents,
            default_flow_style=False,
            allow_unicode=True,
            sort_keys=False,
        )
    except yaml.YAMLError:
        logger.exception("Invalid YAML content")
        # Return raw text if parsing fails
        return text