Coverage for src/local_deep_research/embeddings/splitters/text_splitter_registry.py: 13%

38 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

1""" 

2Central registry for text splitters. 

3 

4This module provides a factory function to create different types of text splitters 

5based on configuration, similar to how embeddings_config.py works for embeddings. 

6""" 

7 

8from typing import Optional, List, Any 

9from langchain_text_splitters import ( 

10 RecursiveCharacterTextSplitter, 

11 TokenTextSplitter, 

12 SentenceTransformersTokenTextSplitter, 

13) 

14from langchain_core.embeddings import Embeddings 

15from loguru import logger 

16 

# Splitter type identifiers accepted by get_text_splitter(); anything else
# raises ValueError during validation.
VALID_SPLITTER_TYPES = [
    "recursive",
    "token",
    "sentence",
    "semantic",
]

24 

25 

def get_text_splitter(
    splitter_type: str = "recursive",
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    text_separators: Optional[List[str]] = None,
    embeddings: Optional[Embeddings] = None,
    **kwargs,
) -> Any:
    """
    Factory returning a text splitter instance for the requested type.

    Args:
        splitter_type: One of 'recursive', 'token', 'sentence', 'semantic'
            (case-insensitive; surrounding whitespace is ignored)
        chunk_size: Maximum size of chunks
        chunk_overlap: Overlap between chunks
        text_separators: Custom separators (only used for 'recursive' type)
        embeddings: Embeddings instance (required for 'semantic' type)
        **kwargs: Additional splitter-specific parameters (the 'semantic'
            type honours ``breakpoint_threshold_type`` and
            ``breakpoint_threshold_amount``)

    Returns:
        A text splitter instance

    Raises:
        ValueError: If splitter_type is invalid or required parameters are missing
        ImportError: If required dependencies are not installed
    """
    # Canonicalize the requested type before validating it.
    normalized = splitter_type.strip().lower()

    if normalized not in VALID_SPLITTER_TYPES:
        logger.error(f"Invalid splitter type: {normalized}")
        raise ValueError(
            f"Invalid splitter type: {normalized}. "
            f"Must be one of: {VALID_SPLITTER_TYPES}"
        )

    logger.info(
        f"Creating text splitter: type={normalized}, "
        f"chunk_size={chunk_size}, chunk_overlap={chunk_overlap}"
    )

    # Guard-clause dispatch: each branch returns immediately.
    if normalized == "token":
        return TokenTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )

    if normalized == "sentence":
        # For the sentence-transformers splitter the chunk size is
        # expressed in tokens per chunk.
        return SentenceTransformersTokenTextSplitter(
            chunk_overlap=chunk_overlap,
            tokens_per_chunk=chunk_size,
        )

    if normalized == "semantic":
        # Semantic chunking needs an embeddings model to locate breakpoints.
        if embeddings is None:
            raise ValueError(
                "Semantic splitter requires 'embeddings' parameter. "
                "Please provide an embeddings instance."
            )

        try:
            # SemanticChunker lives in the optional langchain-experimental
            # package, so import it lazily.
            from langchain_experimental.text_splitter import SemanticChunker

            # Threshold configuration comes from kwargs, with sensible defaults.
            threshold_type = kwargs.get(
                "breakpoint_threshold_type", "percentile"
            )
            threshold_amount = kwargs.get(
                "breakpoint_threshold_amount", None
            )

            # Only pass threshold options through when they are actually set,
            # letting SemanticChunker apply its own defaults otherwise.
            chunker_kwargs = {"embeddings": embeddings}
            if threshold_type:
                chunker_kwargs["breakpoint_threshold_type"] = threshold_type
            if threshold_amount is not None:
                chunker_kwargs["breakpoint_threshold_amount"] = threshold_amount

            logger.info(
                f"Creating SemanticChunker with threshold_type={threshold_type}, "
                f"threshold_amount={threshold_amount}"
            )

            return SemanticChunker(**chunker_kwargs)

        except ImportError as e:
            logger.exception("Failed to import SemanticChunker")
            raise ImportError(
                "Semantic chunking requires langchain-experimental. "
                "Install it with: pip install langchain-experimental"
            ) from e

    # Only "recursive" remains after validation; fall back to the default
    # separator cascade when the caller supplied none.
    separators = (
        text_separators
        if text_separators is not None
        else ["\n\n", "\n", ". ", " ", ""]
    )
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=separators,
    )

139 

140 

def is_semantic_chunker_available() -> bool:
    """Return True when the optional langchain-experimental package is installed."""
    from importlib.util import find_spec

    # find_spec probes the import machinery without importing the package.
    return find_spec("langchain_experimental") is not None