Coverage for src / local_deep_research / research_library / services / rag_service_factory.py: 100%

43 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1""" 

2RAG Service Factory 

3 

4Provides get_rag_service() for creating LibraryRAGService instances 

5with appropriate settings. Extracted from rag_routes.py to avoid 

6circular imports (service → routes). 

7""" 

8 

9import json 

10from typing import Optional 

11 

12from loguru import logger 

13 

14from ...database.models.library import Collection 

15from ...database.session_context import get_user_db_session 

16from ...utilities.db_utils import get_settings_manager 

17from ...utilities.type_utils import to_bool 

18from ..services.library_rag_service import LibraryRAGService 

19 

20 

21def get_rag_service( 

22 username: str, 

23 collection_id: Optional[str] = None, 

24 use_defaults: bool = False, 

25 db_password: Optional[str] = None, 

26) -> LibraryRAGService: 

27 """ 

28 Get RAG service instance with appropriate settings. 

29 

30 Args: 

31 username: Username for database access and settings lookup 

32 collection_id: Optional collection UUID to load stored settings from 

33 use_defaults: When True, ignore stored collection settings and use 

34 current defaults. Pass True on force-reindex so that the new 

35 default embedding model is picked up. 

36 db_password: Optional database password for encrypted databases 

37 

38 If collection_id is provided: 

39 - Uses collection's stored settings if they exist (unless use_defaults=True) 

40 - Uses current defaults for new collections (and stores them) 

41 

42 If no collection_id: 

43 - Uses current default settings 

44 """ 

45 settings = get_settings_manager(username=username) 

46 

47 # Get current default settings. 

48 # The local_search_* keys are written by the embedding-settings page and 

49 # have no JSON defaults file yet, so explicit fallbacks are required to 

50 # avoid TypeError / None propagation on fresh installs. 

51 default_embedding_model = ( 

52 settings.get_setting("local_search_embedding_model") 

53 or "all-MiniLM-L6-v2" 

54 ) 

55 default_embedding_provider = ( 

56 settings.get_setting("local_search_embedding_provider") 

57 or "sentence_transformers" 

58 ) 

59 default_chunk_size = int( 

60 settings.get_setting("local_search_chunk_size") or 1000 

61 ) 

62 default_chunk_overlap = int( 

63 settings.get_setting("local_search_chunk_overlap") or 200 

64 ) 

65 default_splitter_type = ( 

66 settings.get_setting("local_search_splitter_type") or "recursive" 

67 ) 

68 default_text_separators = settings.get_setting( 

69 "local_search_text_separators" 

70 ) 

71 # Parse JSON string to list 

72 if isinstance(default_text_separators, str): 

73 try: 

74 default_text_separators = json.loads(default_text_separators) 

75 except json.JSONDecodeError: 

76 logger.warning( 

77 "Invalid JSON for local_search_text_separators: {!r} — using default separators", 

78 default_text_separators, 

79 ) 

80 default_text_separators = ["\n\n", "\n", ". ", " ", ""] 

81 elif default_text_separators is None: 

82 default_text_separators = ["\n\n", "\n", ". ", " ", ""] 

83 default_distance_metric = ( 

84 settings.get_setting("local_search_distance_metric") or "cosine" 

85 ) 

86 default_normalize_vectors = settings.get_bool_setting( 

87 "local_search_normalize_vectors" 

88 ) 

89 default_index_type = ( 

90 settings.get_setting("local_search_index_type") or "Flat" 

91 ) 

92 

93 # If collection_id provided, check for stored settings 

94 if collection_id: 

95 with get_user_db_session(username, db_password) as db_session: 

96 collection = ( 

97 db_session.query(Collection).filter_by(id=collection_id).first() 

98 ) 

99 

100 if collection and collection.embedding_model and not use_defaults: 

101 # Use collection's stored settings 

102 logger.info( 

103 f"Using stored settings for collection {collection_id}: " 

104 f"{collection.embedding_model_type.value if collection.embedding_model_type else 'unknown'}/{collection.embedding_model}" 

105 ) 

106 # Handle normalize_vectors - may be stored as string in some cases 

107 coll_normalize = collection.normalize_vectors 

108 if coll_normalize is not None: 

109 coll_normalize = to_bool(coll_normalize) 

110 else: 

111 coll_normalize = default_normalize_vectors 

112 

113 def _col(stored, default): 

114 """Use stored collection value if not None, else default.""" 

115 return stored if stored is not None else default 

116 

117 return LibraryRAGService( 

118 username=username, 

119 embedding_model=collection.embedding_model, 

120 embedding_provider=collection.embedding_model_type.value 

121 if collection.embedding_model_type 

122 else default_embedding_provider, 

123 chunk_size=_col(collection.chunk_size, default_chunk_size), 

124 chunk_overlap=_col( 

125 collection.chunk_overlap, default_chunk_overlap 

126 ), 

127 splitter_type=_col( 

128 collection.splitter_type, default_splitter_type 

129 ), 

130 text_separators=_col( 

131 collection.text_separators, default_text_separators 

132 ), 

133 distance_metric=_col( 

134 collection.distance_metric, default_distance_metric 

135 ), 

136 normalize_vectors=coll_normalize, 

137 index_type=_col(collection.index_type, default_index_type), 

138 db_password=db_password, 

139 ) 

140 if collection: 

141 # New collection - use defaults and store them 

142 logger.info( 

143 f"New collection {collection_id}, using and storing default settings" 

144 ) 

145 

146 # Create service with defaults 

147 return LibraryRAGService( 

148 username=username, 

149 embedding_model=default_embedding_model, 

150 embedding_provider=default_embedding_provider, 

151 chunk_size=default_chunk_size, 

152 chunk_overlap=default_chunk_overlap, 

153 splitter_type=default_splitter_type, 

154 text_separators=default_text_separators, 

155 distance_metric=default_distance_metric, 

156 normalize_vectors=default_normalize_vectors, 

157 index_type=default_index_type, 

158 db_password=db_password, 

159 ) 

160 

161 # Store settings on collection (will be done during indexing) 

162 # Note: We don't store here because we don't have embedding_dimension yet 

163 # It will be stored in index_collection when first document is indexed 

164 

165 # No collection or fallback - use current defaults 

166 return LibraryRAGService( 

167 username=username, 

168 embedding_model=default_embedding_model, 

169 embedding_provider=default_embedding_provider, 

170 chunk_size=default_chunk_size, 

171 chunk_overlap=default_chunk_overlap, 

172 splitter_type=default_splitter_type, 

173 text_separators=default_text_separators, 

174 distance_metric=default_distance_metric, 

175 normalize_vectors=default_normalize_vectors, 

176 index_type=default_index_type, 

177 db_password=db_password, 

178 )