Coverage for src/local_deep_research/research_library/services/rag_service_factory.py: 100%
47 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
1"""
2RAG Service Factory
4Provides get_rag_service() for creating LibraryRAGService instances
5with appropriate settings. Extracted from rag_routes.py to avoid
6circular imports (service → routes).
7"""
9import json
10from typing import Optional
12from loguru import logger
14from ...database.models.library import Collection
15from ...database.session_context import get_user_db_session
16from ...utilities.db_utils import get_settings_manager
17from ...utilities.type_utils import to_bool
18from ..services.library_rag_service import LibraryRAGService
# Separators used when the setting is missing or unusable. Stored as a tuple
# so every caller receives a fresh list and cannot mutate a shared default.
_FALLBACK_TEXT_SEPARATORS = ("\n\n", "\n", ". ", " ", "")


def _parse_text_separators(raw):
    """Turn the raw ``local_search_text_separators`` setting into separators.

    ``None`` yields the built-in fallback list. A string is decoded as JSON;
    if decoding fails, or the decoded value is not a list, a warning is
    logged and the fallback list is used. Any other value (e.g. a list that
    the settings layer already decoded) is returned unchanged.
    """
    if raw is None:
        return list(_FALLBACK_TEXT_SEPARATORS)
    if not isinstance(raw, str):
        return raw

    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        logger.warning(
            "Invalid JSON for local_search_text_separators: {!r} — using default separators",
            raw,
        )
        return list(_FALLBACK_TEXT_SEPARATORS)

    # Valid JSON is not necessarily a list ('"abc"', '5', '{}' all decode
    # fine); refuse anything else rather than hand it to the splitter.
    if not isinstance(parsed, list):
        logger.warning(
            "local_search_text_separators must be a JSON list, got {!r} — using default separators",
            raw,
        )
        return list(_FALLBACK_TEXT_SEPARATORS)
    return parsed


def get_rag_service(
    username: str,
    collection_id: Optional[str] = None,
    use_defaults: bool = False,
    db_password: Optional[str] = None,
) -> LibraryRAGService:
    """
    Get RAG service instance with appropriate settings.

    Args:
        username: Username for database access and settings lookup
        collection_id: Optional collection UUID to load stored settings from
        use_defaults: When True, ignore stored collection settings and use
            current defaults. Pass True on force-reindex so that the new
            default embedding model is picked up.
        db_password: Optional database password for encrypted databases

    Returns:
        A LibraryRAGService configured either from the collection's stored
        settings or from the user's current defaults.

    If collection_id is provided:
        - Uses collection's stored settings if they exist (unless
          use_defaults=True); any stored field that is None falls back to
          the current default for that field
        - Uses current defaults for collections without a stored embedding
          model. Nothing is written to the collection here.

    If no collection_id (or the collection is not found):
        - Uses current default settings
    """
    # Use get_user_db_session so that settings are readable from background
    # threads (no Flask app context). Without an explicit db_session,
    # get_settings_manager falls back to JSON defaults only, and the
    # local_search_* keys have no JSON defaults — causing user-configured
    # embedding settings to be silently ignored. See #3453.
    with get_user_db_session(username, db_password) as db_session:
        settings = get_settings_manager(
            db_session=db_session, username=username
        )

        # The local_search_* keys are written by the embedding-settings page
        # and have no JSON defaults file yet, so explicit fallbacks are
        # required to avoid TypeError / None propagation on fresh installs.
        raw_embedding_model = settings.get_setting(
            "local_search_embedding_model"
        )
        raw_embedding_provider = settings.get_setting(
            "local_search_embedding_provider"
        )
        # Warn on silent fallback so a regression of #3453 is visible in logs
        # instead of being masked by `or`-chained defaults. On fresh installs
        # this fires legitimately until the user saves settings; in a
        # regression it would fire on every indexing call.
        if not raw_embedding_model and not raw_embedding_provider:
            logger.warning(
                "local_search embedding settings are empty; falling back to "
                "hardcoded defaults (sentence_transformers/all-MiniLM-L6-v2). "
                "Expected on fresh installs before settings are saved; "
                "otherwise check that db_session is being passed to "
                "SettingsManager (see #3453)."
            )

        # Current defaults, keyed by LibraryRAGService keyword name. The dict
        # literal evaluates top to bottom, so settings are read in a fixed
        # order.
        defaults = {
            "embedding_model": raw_embedding_model or "all-MiniLM-L6-v2",
            "embedding_provider": (
                raw_embedding_provider or "sentence_transformers"
            ),
            "chunk_size": int(
                settings.get_setting("local_search_chunk_size") or 1000
            ),
            "chunk_overlap": int(
                settings.get_setting("local_search_chunk_overlap") or 200
            ),
            "splitter_type": (
                settings.get_setting("local_search_splitter_type")
                or "recursive"
            ),
            "text_separators": _parse_text_separators(
                settings.get_setting("local_search_text_separators")
            ),
            "distance_metric": (
                settings.get_setting("local_search_distance_metric")
                or "cosine"
            ),
            "normalize_vectors": settings.get_bool_setting(
                "local_search_normalize_vectors"
            ),
            "index_type": (
                settings.get_setting("local_search_index_type") or "flat"
            ),
        }

        collection = None
        if collection_id:
            collection = (
                db_session.query(Collection).filter_by(id=collection_id).first()
            )

        if collection and collection.embedding_model and not use_defaults:
            # Use collection's stored settings
            model_type = collection.embedding_model_type
            logger.info(
                f"Using stored settings for collection {collection_id}: "
                f"{model_type.value if model_type else 'unknown'}/{collection.embedding_model}"
            )

            stored = {
                "chunk_size": collection.chunk_size,
                "chunk_overlap": collection.chunk_overlap,
                "splitter_type": collection.splitter_type,
                "text_separators": collection.text_separators,
                "distance_metric": collection.distance_metric,
                "index_type": collection.index_type,
            }
            # Start from the defaults and overlay every stored value that is
            # actually set; None means "not stored", so the default stays.
            service_kwargs = dict(defaults)
            service_kwargs.update(
                {key: value for key, value in stored.items() if value is not None}
            )
            service_kwargs["embedding_model"] = collection.embedding_model
            if model_type:
                service_kwargs["embedding_provider"] = model_type.value
            # normalize_vectors may be stored as a string in some cases, so
            # coerce it instead of passing it through verbatim.
            if collection.normalize_vectors is not None:
                service_kwargs["normalize_vectors"] = to_bool(
                    collection.normalize_vectors
                )

            return LibraryRAGService(
                username=username,
                db_password=db_password,
                **service_kwargs,
            )

        if collection:
            if collection.embedding_model:
                # Only reachable with use_defaults=True: the collection is
                # already configured, so calling it "new" would be wrong.
                logger.info(
                    f"Ignoring stored settings for collection {collection_id} "
                    "(use_defaults=True), using current default settings"
                )
            else:
                # Settings are not stored here because embedding_dimension is
                # not known yet; per the original note this happens in
                # index_collection when the first document is indexed.
                logger.info(
                    f"New collection {collection_id}, using and storing default settings"
                )

        # No collection, unconfigured collection, or forced defaults.
        return LibraryRAGService(
            username=username,
            db_password=db_password,
            **defaults,
        )