Coverage for src / local_deep_research / embeddings / splitters / text_splitter_registry.py: 13%
38 statements
« prev ^ index » next coverage.py v7.12.0, created at 2026-01-11 00:51 +0000
1"""
2Central registry for text splitters.
4This module provides a factory function to create different types of text splitters
5based on configuration, similar to how embeddings_config.py works for embeddings.
6"""
8from typing import Optional, List, Any
9from langchain_text_splitters import (
10 RecursiveCharacterTextSplitter,
11 TokenTextSplitter,
12 SentenceTransformersTokenTextSplitter,
13)
14from langchain_core.embeddings import Embeddings
15from loguru import logger
# Valid splitter type options: the accepted values for the
# ``splitter_type`` argument of get_text_splitter() (input is
# lowercased/stripped before being checked against this list).
VALID_SPLITTER_TYPES = [
    "recursive",
    "token",
    "sentence",
    "semantic",
]
def get_text_splitter(
    splitter_type: str = "recursive",
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    text_separators: Optional[List[str]] = None,
    embeddings: Optional[Embeddings] = None,
    **kwargs,
) -> Any:
    """
    Build and return a text splitter for the requested strategy.

    Args:
        splitter_type: One of 'recursive', 'token', 'sentence', 'semantic'
            (normalized to lowercase before matching).
        chunk_size: Maximum chunk size; for 'sentence' this is interpreted
            as tokens per chunk.
        chunk_overlap: Overlap between consecutive chunks.
        text_separators: Custom separators, honoured only by 'recursive'.
        embeddings: Embeddings instance, required by 'semantic'.
        **kwargs: Extra splitter-specific options (e.g. semantic
            breakpoint thresholds).

    Returns:
        A configured text splitter instance.

    Raises:
        ValueError: If splitter_type is unknown, or 'semantic' is requested
            without an embeddings instance.
        ImportError: If the 'semantic' dependency (langchain-experimental)
            is not installed.
    """
    # Normalize and validate the requested type up front.
    kind = splitter_type.strip().lower()
    if kind not in VALID_SPLITTER_TYPES:
        logger.error(f"Invalid splitter type: {kind}")
        raise ValueError(
            f"Invalid splitter type: {kind}. "
            f"Must be one of: {VALID_SPLITTER_TYPES}"
        )

    logger.info(
        f"Creating text splitter: type={kind}, "
        f"chunk_size={chunk_size}, chunk_overlap={chunk_overlap}"
    )

    if kind == "token":
        return TokenTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )

    if kind == "sentence":
        # For sentence-transformer tokenization, chunk_size maps to
        # tokens_per_chunk.
        return SentenceTransformersTokenTextSplitter(
            chunk_overlap=chunk_overlap,
            tokens_per_chunk=chunk_size,
        )

    if kind == "semantic":
        # Semantic chunking cannot work without an embeddings backend.
        if embeddings is None:
            raise ValueError(
                "Semantic splitter requires 'embeddings' parameter. "
                "Please provide an embeddings instance."
            )

        try:
            # SemanticChunker lives in the optional langchain-experimental
            # package; import lazily so the rest of the module works
            # without it.
            from langchain_experimental.text_splitter import SemanticChunker

            # Threshold settings come from kwargs, with a sensible default
            # for the type and no default for the amount.
            threshold_type = kwargs.get(
                "breakpoint_threshold_type", "percentile"
            )
            threshold_amount = kwargs.get("breakpoint_threshold_amount")

            # Only forward threshold options that are actually set.
            chunker_args = {"embeddings": embeddings}
            if threshold_type:
                chunker_args["breakpoint_threshold_type"] = threshold_type
            if threshold_amount is not None:
                chunker_args["breakpoint_threshold_amount"] = threshold_amount

            logger.info(
                f"Creating SemanticChunker with threshold_type={threshold_type}, "
                f"threshold_amount={threshold_amount}"
            )
            return SemanticChunker(**chunker_args)
        except ImportError as e:
            logger.exception("Failed to import SemanticChunker")
            raise ImportError(
                "Semantic chunking requires langchain-experimental. "
                "Install it with: pip install langchain-experimental"
            ) from e

    # Fallthrough: kind == "recursive".
    # An explicit None means "use the default separators"; an empty list
    # supplied by the caller is respected as-is.
    separators = (
        ["\n\n", "\n", ". ", " ", ""]
        if text_separators is None
        else text_separators
    )
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=separators,
    )
def is_semantic_chunker_available() -> bool:
    """Return True if the optional langchain-experimental package is installed."""
    from importlib.util import find_spec

    return find_spec("langchain_experimental") is not None