Coverage for src / local_deep_research / research_library / services / rag_service_factory.py: 100%
43 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
1"""
RAG Service Factory

Provides get_rag_service() for creating LibraryRAGService instances
with appropriate settings. Extracted from rag_routes.py to avoid
circular imports (service → routes).
7"""
9import json
10from typing import Optional
12from loguru import logger
14from ...database.models.library import Collection
15from ...database.session_context import get_user_db_session
16from ...utilities.db_utils import get_settings_manager
17from ...utilities.type_utils import to_bool
18from ..services.library_rag_service import LibraryRAGService
def get_rag_service(
    username: str,
    collection_id: Optional[str] = None,
    use_defaults: bool = False,
    db_password: Optional[str] = None,
) -> LibraryRAGService:
    """
    Get RAG service instance with appropriate settings.

    Args:
        username: Username for database access and settings lookup
        collection_id: Optional collection UUID to load stored settings from
        use_defaults: When True, ignore stored collection settings and use
            current defaults. Pass True on force-reindex so that the new
            default embedding model is picked up.
        db_password: Optional database password for encrypted databases

    Returns:
        A LibraryRAGService configured as follows.

        If collection_id is provided and the collection exists:
        - Uses the collection's stored settings when it has an embedding
          model recorded and use_defaults is False. Individual stored
          values that are None fall back to the current defaults.
        - Otherwise uses the current defaults. Nothing is written to the
          collection here.

        If no collection_id is given, or the collection is not found:
        - Uses current default settings
    """
    settings = get_settings_manager(username=username)
    fallback_separators = ["\n\n", "\n", ". ", " ", ""]

    # Get current default settings.
    # The local_search_* keys are written by the embedding-settings page and
    # have no JSON defaults file yet, so explicit fallbacks are required to
    # avoid TypeError / None propagation on fresh installs.
    default_embedding_model = (
        settings.get_setting("local_search_embedding_model")
        or "all-MiniLM-L6-v2"
    )
    default_embedding_provider = (
        settings.get_setting("local_search_embedding_provider")
        or "sentence_transformers"
    )
    default_chunk_size = int(
        settings.get_setting("local_search_chunk_size") or 1000
    )
    # Only a missing/blank value falls back to 200: an explicit overlap of 0
    # is a usable configuration and must not be swallowed by a truthiness
    # test (the stored-collection path below already honours 0).
    raw_chunk_overlap = settings.get_setting("local_search_chunk_overlap")
    default_chunk_overlap = (
        200 if raw_chunk_overlap in (None, "") else int(raw_chunk_overlap)
    )
    default_splitter_type = (
        settings.get_setting("local_search_splitter_type") or "recursive"
    )

    default_text_separators = settings.get_setting(
        "local_search_text_separators"
    )
    if default_text_separators is None:
        default_text_separators = fallback_separators
    elif isinstance(default_text_separators, str):
        # Parse JSON string to list
        raw_separators = default_text_separators
        try:
            default_text_separators = json.loads(raw_separators)
        except json.JSONDecodeError:
            logger.warning(
                "Invalid JSON for local_search_text_separators: {!r} — using default separators",
                raw_separators,
            )
            default_text_separators = fallback_separators
        else:
            # Valid JSON that is not a list (null, a bare string, a number)
            # would otherwise reach the service as None or a scalar.
            if not isinstance(default_text_separators, list):
                logger.warning(
                    "local_search_text_separators is not a JSON list: {!r} — using default separators",
                    raw_separators,
                )
                default_text_separators = fallback_separators

    default_distance_metric = (
        settings.get_setting("local_search_distance_metric") or "cosine"
    )
    default_normalize_vectors = settings.get_bool_setting(
        "local_search_normalize_vectors"
    )
    default_index_type = (
        settings.get_setting("local_search_index_type") or "Flat"
    )

    # Single source of truth for the "current defaults" configuration, used
    # by every branch that does not read stored collection settings.
    default_kwargs = {
        "username": username,
        "embedding_model": default_embedding_model,
        "embedding_provider": default_embedding_provider,
        "chunk_size": default_chunk_size,
        "chunk_overlap": default_chunk_overlap,
        "splitter_type": default_splitter_type,
        "text_separators": default_text_separators,
        "distance_metric": default_distance_metric,
        "normalize_vectors": default_normalize_vectors,
        "index_type": default_index_type,
        "db_password": db_password,
    }

    def _col(stored, default):
        """Use stored collection value if not None, else default."""
        return stored if stored is not None else default

    # If collection_id provided, check for stored settings
    if collection_id:
        with get_user_db_session(username, db_password) as db_session:
            collection = (
                db_session.query(Collection).filter_by(id=collection_id).first()
            )

            if collection and collection.embedding_model and not use_defaults:
                # Use collection's stored settings
                logger.info(
                    f"Using stored settings for collection {collection_id}: "
                    f"{collection.embedding_model_type.value if collection.embedding_model_type else 'unknown'}/{collection.embedding_model}"
                )

                # Handle normalize_vectors - may be stored as string in some cases
                coll_normalize = collection.normalize_vectors
                if coll_normalize is not None:
                    coll_normalize = to_bool(coll_normalize)
                else:
                    coll_normalize = default_normalize_vectors

                return LibraryRAGService(
                    username=username,
                    embedding_model=collection.embedding_model,
                    embedding_provider=collection.embedding_model_type.value
                    if collection.embedding_model_type
                    else default_embedding_provider,
                    chunk_size=_col(collection.chunk_size, default_chunk_size),
                    chunk_overlap=_col(
                        collection.chunk_overlap, default_chunk_overlap
                    ),
                    splitter_type=_col(
                        collection.splitter_type, default_splitter_type
                    ),
                    text_separators=_col(
                        collection.text_separators, default_text_separators
                    ),
                    distance_metric=_col(
                        collection.distance_metric, default_distance_metric
                    ),
                    normalize_vectors=coll_normalize,
                    index_type=_col(collection.index_type, default_index_type),
                    db_password=db_password,
                )

            if collection:
                # New collection (or use_defaults=True) - use defaults.
                logger.info(
                    f"New collection {collection_id}, using and storing default settings"
                )
                # Note: settings are not stored on the collection here because
                # embedding_dimension is not known yet; storing is expected to
                # happen in index_collection when the first document is indexed.
                return LibraryRAGService(**default_kwargs)

    # No collection or fallback - use current defaults
    return LibraryRAGService(**default_kwargs)