Coverage for src/local_deep_research/embeddings/splitters/text_splitter_registry.py: 13%

38 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

1""" 

2Central registry for text splitters. 

3 

4This module provides a factory function to create different types of text splitters 

5based on configuration, similar to how embeddings_config.py works for embeddings. 

6""" 

7 

8from typing import Optional, List, Any 

9from langchain_text_splitters import ( 

10 RecursiveCharacterTextSplitter, 

11 TokenTextSplitter, 

12 SentenceTransformersTokenTextSplitter, 

13) 

14from langchain_core.embeddings import Embeddings 

15from loguru import logger 

16 

# Splitter type identifiers accepted by get_text_splitter(); anything else
# raises ValueError during validation.
VALID_SPLITTER_TYPES = [
    "recursive",
    "token",
    "sentence",
    "semantic",
]

24 

25 

def get_text_splitter(
    splitter_type: str = "recursive",
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    text_separators: Optional[List[str]] = None,
    embeddings: Optional[Embeddings] = None,
    **kwargs,
) -> Any:
    """
    Factory returning a text splitter instance for the requested type.

    Args:
        splitter_type: One of 'recursive', 'token', 'sentence', 'semantic'
            (case-insensitive; surrounding whitespace is ignored)
        chunk_size: Maximum size of chunks
        chunk_overlap: Overlap between chunks
        text_separators: Custom separators (only used for 'recursive' type)
        embeddings: Embeddings instance (required for 'semantic' type)
        **kwargs: Additional splitter-specific parameters (the 'semantic'
            type honours ``breakpoint_threshold_type`` and
            ``breakpoint_threshold_amount``)

    Returns:
        A text splitter instance

    Raises:
        ValueError: If splitter_type is invalid or required parameters are missing
        ImportError: If required dependencies are not installed
    """
    # Canonicalize the requested type before validating it.
    normalized = splitter_type.strip().lower()

    if normalized not in VALID_SPLITTER_TYPES:
        logger.error(f"Invalid splitter type: {normalized}")
        raise ValueError(
            f"Invalid splitter type: {normalized}. "
            f"Must be one of: {VALID_SPLITTER_TYPES}"
        )

    logger.info(
        f"Creating text splitter: type={normalized}, "
        f"chunk_size={chunk_size}, chunk_overlap={chunk_overlap}"
    )

    # Guard-clause dispatch: each branch returns immediately.
    if normalized == "token":
        return TokenTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )

    if normalized == "sentence":
        # For the sentence-transformers splitter the chunk size is
        # expressed in tokens per chunk.
        return SentenceTransformersTokenTextSplitter(
            chunk_overlap=chunk_overlap,
            tokens_per_chunk=chunk_size,
        )

    if normalized == "semantic":
        # Semantic chunking needs an embeddings model to locate breakpoints.
        if embeddings is None:
            raise ValueError(
                "Semantic splitter requires 'embeddings' parameter. "
                "Please provide an embeddings instance."
            )

        try:
            # SemanticChunker lives in the optional langchain-experimental
            # package, so import it lazily.
            from langchain_experimental.text_splitter import SemanticChunker

            # Threshold configuration comes from kwargs, with sensible defaults.
            threshold_type = kwargs.get(
                "breakpoint_threshold_type", "percentile"
            )
            threshold_amount = kwargs.get(
                "breakpoint_threshold_amount", None
            )

            # Only pass threshold options through when they are actually set,
            # letting SemanticChunker apply its own defaults otherwise.
            chunker_kwargs = {"embeddings": embeddings}
            if threshold_type:
                chunker_kwargs["breakpoint_threshold_type"] = threshold_type
            if threshold_amount is not None:
                chunker_kwargs["breakpoint_threshold_amount"] = threshold_amount

            logger.info(
                f"Creating SemanticChunker with threshold_type={threshold_type}, "
                f"threshold_amount={threshold_amount}"
            )

            return SemanticChunker(**chunker_kwargs)

        except ImportError as e:
            logger.exception("Failed to import SemanticChunker")
            raise ImportError(
                "Semantic chunking requires langchain-experimental. "
                "Install it with: pip install langchain-experimental"
            ) from e

    # Only "recursive" remains after validation; fall back to the default
    # separator cascade when the caller supplied none.
    separators = (
        text_separators
        if text_separators is not None
        else ["\n\n", "\n", ". ", " ", ""]
    )
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=separators,
    )

139 

140 

def is_semantic_chunker_available() -> bool:
    """Return True when the optional langchain-experimental package is installed."""
    from importlib.util import find_spec

    # find_spec probes the import machinery without importing the package.
    return find_spec("langchain_experimental") is not None