Coverage for src/local_deep_research/research_library/services/rag_service_factory.py: 100%

47 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-03 23:15 +0000

1""" 

2RAG Service Factory 

3 

4Provides get_rag_service() for creating LibraryRAGService instances 

5with appropriate settings. Extracted from rag_routes.py to avoid 

6circular imports (service → routes). 

7""" 

8 

9import json 

10from typing import Optional 

11 

12from loguru import logger 

13 

14from ...database.models.library import Collection 

15from ...database.session_context import get_user_db_session 

16from ...utilities.db_utils import get_settings_manager 

17from ...utilities.type_utils import to_bool 

18from ..services.library_rag_service import LibraryRAGService 

19 

20 

# Separators applied when the stored setting is missing or unusable. A tuple
# keeps the shared constant immutable; callers always receive a fresh list.
_FALLBACK_TEXT_SEPARATORS = ("\n\n", "\n", ". ", " ", "")

# Collection columns that override the matching default when they are set.
_COLLECTION_OVERRIDE_FIELDS = (
    "chunk_size",
    "chunk_overlap",
    "splitter_type",
    "text_separators",
    "distance_metric",
    "index_type",
)


def _resolve_text_separators(raw_value):
    """Turn the raw ``local_search_text_separators`` setting into a list.

    ``None`` and undecodable / non-list JSON fall back to the built-in
    separators; values that are not strings are passed through untouched.
    """
    if raw_value is None:
        return list(_FALLBACK_TEXT_SEPARATORS)
    if not isinstance(raw_value, str):
        # Already decoded by the settings layer - nothing to parse.
        return raw_value

    try:
        parsed = json.loads(raw_value)
    except json.JSONDecodeError:
        logger.warning(
            "Invalid JSON for local_search_text_separators: {!r} — using default separators",
            raw_value,
        )
        return list(_FALLBACK_TEXT_SEPARATORS)

    if not isinstance(parsed, list):
        # Valid JSON but a scalar/object/null: it cannot serve as a list of
        # separators, so treat it like any other unusable value.
        logger.warning(
            "local_search_text_separators must be a JSON list, got {!r} — using default separators",
            raw_value,
        )
        return list(_FALLBACK_TEXT_SEPARATORS)
    return parsed


def get_rag_service(
    username: str,
    collection_id: Optional[str] = None,
    use_defaults: bool = False,
    db_password: Optional[str] = None,
) -> LibraryRAGService:
    """
    Get RAG service instance with appropriate settings.

    Args:
        username: Username for database access and settings lookup
        collection_id: Optional collection UUID to load stored settings from
        use_defaults: When True, ignore stored collection settings and use
            current defaults. Pass True on force-reindex so that the new
            default embedding model is picked up.
        db_password: Optional database password for encrypted databases

    Returns:
        A LibraryRAGService configured either from the collection's stored
        settings or from the user's current defaults.

    If collection_id is provided:
    - Uses collection's stored settings if they exist (unless use_defaults=True)
    - Uses current defaults for collections without stored settings

    If no collection_id (or the collection is not found):
    - Uses current default settings
    """
    # Use get_user_db_session so that settings are readable from background
    # threads (no Flask app context). Without an explicit db_session,
    # get_settings_manager falls back to JSON defaults only, and the
    # local_search_* keys have no JSON defaults — causing user-configured
    # embedding settings to be silently ignored. See #3453.
    with get_user_db_session(username, db_password) as db_session:
        settings = get_settings_manager(
            db_session=db_session, username=username
        )

        # The local_search_* keys are written by the embedding-settings page
        # and have no JSON defaults file yet, so explicit fallbacks are
        # required to avoid TypeError / None propagation on fresh installs.
        raw_embedding_model = settings.get_setting(
            "local_search_embedding_model"
        )
        raw_embedding_provider = settings.get_setting(
            "local_search_embedding_provider"
        )
        # Warn on silent fallback so a regression of #3453 is visible in logs
        # instead of being masked by `or`-chained defaults. On fresh installs
        # this fires legitimately until the user saves settings; in a
        # regression it would fire on every indexing call.
        if not raw_embedding_model and not raw_embedding_provider:
            logger.warning(
                "local_search embedding settings are empty; falling back to "
                "hardcoded defaults (sentence_transformers/all-MiniLM-L6-v2). "
                "Expected on fresh installs before settings are saved; "
                "otherwise check that db_session is being passed to "
                "SettingsManager (see #3453)."
            )

        # Current defaults, keyed by LibraryRAGService keyword argument.
        # Settings are read in a fixed order, one lookup per key.
        service_kwargs = {
            "embedding_model": raw_embedding_model or "all-MiniLM-L6-v2",
            "embedding_provider": (
                raw_embedding_provider or "sentence_transformers"
            ),
            "chunk_size": int(
                settings.get_setting("local_search_chunk_size") or 1000
            ),
            "chunk_overlap": int(
                settings.get_setting("local_search_chunk_overlap") or 200
            ),
            "splitter_type": (
                settings.get_setting("local_search_splitter_type")
                or "recursive"
            ),
            "text_separators": _resolve_text_separators(
                settings.get_setting("local_search_text_separators")
            ),
            "distance_metric": (
                settings.get_setting("local_search_distance_metric")
                or "cosine"
            ),
            "normalize_vectors": settings.get_bool_setting(
                "local_search_normalize_vectors"
            ),
            "index_type": (
                settings.get_setting("local_search_index_type") or "flat"
            ),
        }

        # Only hit the collection table when a collection was requested.
        collection = None
        if collection_id:
            collection = (
                db_session.query(Collection).filter_by(id=collection_id).first()
            )

        if collection and collection.embedding_model and not use_defaults:
            # The collection was indexed before: its stored settings win so
            # that new queries stay compatible with the existing index.
            stored_provider = collection.embedding_model_type
            logger.info(
                f"Using stored settings for collection {collection_id}: "
                f"{stored_provider.value if stored_provider else 'unknown'}/{collection.embedding_model}"
            )

            service_kwargs["embedding_model"] = collection.embedding_model
            if stored_provider:
                service_kwargs["embedding_provider"] = stored_provider.value

            # A stored value of None means "not recorded" - keep the default.
            for field_name in _COLLECTION_OVERRIDE_FIELDS:
                stored_value = getattr(collection, field_name)
                if stored_value is not None:
                    service_kwargs[field_name] = stored_value

            # normalize_vectors may be stored as a string in some cases.
            stored_normalize = collection.normalize_vectors
            if stored_normalize is not None:
                service_kwargs["normalize_vectors"] = to_bool(stored_normalize)
        elif collection:
            # Settings are not written to the collection here because the
            # embedding dimension is not known yet; per the original note
            # they are stored by index_collection when the first document is
            # indexed.
            if use_defaults and collection.embedding_model:
                logger.info(
                    f"Ignoring stored settings for collection {collection_id} "
                    "(use_defaults=True), using current default settings"
                )
            else:
                logger.info(
                    f"New collection {collection_id}, using and storing default settings"
                )

        # Single construction point for every path: stored settings, a
        # collection falling back to defaults, or no collection at all.
        return LibraryRAGService(
            username=username,
            db_password=db_password,
            **service_kwargs,
        )