Coverage for src/local_deep_research/embeddings/providers/implementations/openai.py: 98%

81 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-03 23:15 +0000

1"""OpenAI embedding provider.""" 

2 

3from typing import Any, Dict, List, Optional 

4from urllib.parse import urlparse 

5 

6from langchain_core.embeddings import Embeddings 

7from loguru import logger 

8 

9from ....config.thread_settings import get_setting_from_snapshot 

10from ....utilities.url_utils import normalize_url 

11from ..base import BaseEmbeddingProvider 

12 

13 

class OpenAIEmbeddingsProvider(BaseEmbeddingProvider):
    """
    OpenAI embedding provider.

    Targets the OpenAI cloud API by default, and any OpenAI-compatible
    endpoint (LM Studio, vLLM, llama.cpp server, etc.) when
    ``embeddings.openai.base_url`` is configured. An API key is required
    for the cloud, but optional for keyless local servers — the
    ``base_url``-set, ``api_key``-empty configuration falls back to a
    placeholder key so the OpenAI client request still goes out.

    Whitespace-only ``api_key`` / ``base_url`` values are treated as
    "not configured" by every method of this class.
    """

    provider_name = "OpenAI"
    provider_key = "OPENAI"
    # Not strictly required: the OpenAI cloud needs a key, but
    # OpenAI-compatible local servers (LM Studio, vLLM, llama.cpp)
    # don't. ``is_available`` and ``create_embeddings`` enforce the
    # cloud-needs-key rule at runtime when no base_url is set.
    # Inherits ``requires_api_key = False`` from BaseEmbeddingProvider.
    supports_local = False
    default_model = "text-embedding-3-small"  # type: ignore[assignment]
    # Placeholder key used when targeting an OpenAI-compatible local
    # server (api_key empty, base_url set). Mirrors the LLM-side
    # LMStudio provider's keyless-fallback pattern.
    _PLACEHOLDER_API_KEY = "lm-studio"

    @staticmethod
    def _blank_to_none(value: Any) -> Any:
        """Collapse unset / blank configuration values to ``None``.

        Args:
            value: Raw value taken from kwargs or the settings snapshot.

        Returns:
            ``None`` when ``value`` is ``None``, falsy, or a string made
            only of whitespace; otherwise ``value`` itself, with
            surrounding whitespace removed when it is a string.
        """
        if isinstance(value, str):
            value = value.strip()
        return value or None

    @classmethod
    def create_embeddings(
        cls,
        model: Optional[str] = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        **kwargs,
    ) -> Embeddings:
        """
        Create OpenAI embeddings instance.

        Explicit ``kwargs`` win over the settings snapshot; a setting is
        only consulted when the corresponding kwarg is absent (``None``).

        Args:
            model: Model name (defaults to the ``embeddings.openai.model``
                setting, then to ``default_model``)
            settings_snapshot: Optional settings snapshot
            **kwargs: Additional parameters. Recognised keys are
                ``api_key``, ``base_url`` and ``dimensions``.

        Returns:
            OpenAIEmbeddings instance

        Raises:
            ValueError: If neither an API key nor a base URL is configured
        """
        from langchain_openai import OpenAIEmbeddings

        # Get API key + base_url. Read base_url first so we can decide
        # whether a missing api_key is fatal (cloud) or just a keyless
        # local-server signal (OpenAI-compatible endpoint).
        base_url = kwargs.get("base_url")
        if base_url is None:
            base_url = get_setting_from_snapshot(
                "embeddings.openai.base_url",
                default=None,
                settings_snapshot=settings_snapshot,
            )
        # Blank-normalise *after* the lookup so an explicit empty kwarg
        # still suppresses the settings fallback, as before.
        base_url = cls._blank_to_none(base_url)

        api_key = kwargs.get("api_key")
        if api_key is None:
            api_key = get_setting_from_snapshot(
                "embeddings.openai.api_key",
                default=None,
                settings_snapshot=settings_snapshot,
            )
        # Same blank rule as ``is_available``: a whitespace-only key is
        # no key at all and must not be sent to the API.
        api_key = cls._blank_to_none(api_key)

        if not api_key:
            if base_url:
                # OpenAI-compatible local server (LM Studio, vLLM,
                # llama.cpp). The server ignores the key but the
                # OpenAI client requires the field to be non-empty.
                logger.info(
                    "OpenAI embeddings: no API key set but base_url={} "
                    "is configured — using placeholder key for the "
                    "OpenAI-compatible endpoint.",
                    base_url,
                )
                api_key = cls._PLACEHOLDER_API_KEY
            else:
                logger.error("OpenAI API key not found in settings")
                raise ValueError(
                    "OpenAI API key not configured. "
                    "Please set embeddings.openai.api_key in settings, "
                    "or set embeddings.openai.base_url to point at an "
                    "OpenAI-compatible local server."
                )

        # Get model from settings if not specified
        if model is None:
            model = get_setting_from_snapshot(
                "embeddings.openai.model",
                default=cls.default_model,
                settings_snapshot=settings_snapshot,
            )

        dimensions = kwargs.get("dimensions")
        if dimensions is None:
            dimensions = get_setting_from_snapshot(
                "embeddings.openai.dimensions",
                default=None,
                settings_snapshot=settings_snapshot,
            )

        logger.info("Creating OpenAIEmbeddings with model={}", model)

        # Build parameters. Annotated as Dict[str, Any] so the
        # heterogeneous values (str for model/key/base_url, int for
        # dimensions) and the **params unpack into OpenAIEmbeddings
        # type-check under mypy.
        params: Dict[str, Any] = {
            "model": model,
            "openai_api_key": api_key,
        }

        if base_url:
            # Normalize first so a scheme-less entry like "api.openai.com"
            # parses to a hostname (urlparse otherwise returns hostname=None
            # for bare hosts, which would silently drop the ctx-length guard
            # for the real OpenAI endpoint). Mirrors the LLM-side OpenAI
            # provider, which already normalizes via the same helper.
            base_url = normalize_url(base_url)
            params["openai_api_base"] = base_url
            # Disable client-side context length checks only for non-OpenAI
            # hosts (LM Studio, vLLM, llama.cpp, etc.) which may lack tiktoken
            # model entries or reject tokenized inputs. Keep the LangChain
            # default for api.openai.com so the guard stays in place for users
            # who set base_url explicitly to the real OpenAI endpoint.
            if urlparse(base_url).hostname != "api.openai.com":
                params["check_embedding_ctx_length"] = False

        # For text-embedding-3 models, dimensions can be customized
        if dimensions and model.startswith("text-embedding-3"):
            params["dimensions"] = int(dimensions)

        return OpenAIEmbeddings(**params)

    @classmethod
    def is_available(
        cls, settings_snapshot: Optional[Dict[str, Any]] = None
    ) -> bool:
        """Check if OpenAI embeddings are available.

        Available when either an API key (cloud) or a custom base URL
        (OpenAI-compatible local server) is configured. A blank
        installation still reports unavailable so the UI doesn't list
        the provider on first launch.

        Args:
            settings_snapshot: Optional settings snapshot

        Returns:
            True when a non-blank API key or base URL is configured;
            False otherwise, including when reading the settings fails.
        """
        try:
            api_key = get_setting_from_snapshot(
                "embeddings.openai.api_key",
                default=None,
                settings_snapshot=settings_snapshot,
            )
            if cls._blank_to_none(api_key):
                return True
            # Only consult base_url when there is no usable key.
            base_url = get_setting_from_snapshot(
                "embeddings.openai.base_url",
                default=None,
                settings_snapshot=settings_snapshot,
            )
            return bool(cls._blank_to_none(base_url))
        except Exception:
            # Best-effort probe: never propagate, just report unavailable.
            # loguru has no ``exc_info`` keyword; ``opt(exception=True)``
            # is what attaches the active traceback to the record.
            logger.opt(exception=True).debug(
                "Error checking OpenAI embedding availability"
            )
            return False

    @classmethod
    def get_available_models(
        cls, settings_snapshot: Optional[Dict[str, Any]] = None
    ) -> List[Dict[str, Any]]:
        """Get every model the configured endpoint reports.

        No filtering: ``/v1/models`` doesn't expose a reliable "is this
        an embedding model?" signal — neither cloud OpenAI nor
        OpenAI-compatible local servers (LM Studio, vLLM, llama.cpp).
        Earlier versions guessed from the model name and ended up
        hiding real embedding models whose names didn't match the
        heuristic (e.g. ``nomic-embed-text-v1.5`` was dropped because
        it lacks the trailing ``-ing``). The dropdown now shows every
        model the endpoint returns so the user can pick the one they
        actually loaded.

        Args:
            settings_snapshot: Optional settings snapshot

        Returns:
            A list of ``{"value": model_id, "label": model_id}`` dicts,
            or an empty list when nothing is configured or the request
            fails.
        """
        try:
            from openai import OpenAI

            api_key = cls._blank_to_none(
                get_setting_from_snapshot(
                    "embeddings.openai.api_key",
                    default=None,
                    settings_snapshot=settings_snapshot,
                )
            )
            base_url = cls._blank_to_none(
                get_setting_from_snapshot(
                    "embeddings.openai.base_url",
                    default=None,
                    settings_snapshot=settings_snapshot,
                )
            )

            if not api_key:
                if base_url:
                    # Keyless OpenAI-compatible local server — use a
                    # placeholder so the client request can proceed.
                    api_key = cls._PLACEHOLDER_API_KEY
                else:
                    logger.warning("OpenAI API key not configured")
                    return []

            client_kwargs: Dict[str, Any] = {"api_key": api_key}
            if base_url:
                client_kwargs["base_url"] = normalize_url(base_url)
            client = OpenAI(**client_kwargs)
            models_response = client.models.list()

            # No name-based filtering — see method docstring (#4195).
            # Skip only blank ids (malformed entry); never skip based
            # on what the name looks like.
            models: List[Dict[str, Any]] = [
                {"value": entry.id, "label": entry.id}
                for entry in models_response.data
                if entry.id
            ]

            logger.info(
                "Fetched {} models from OpenAI endpoint{}",
                len(models),
                f" at {base_url}" if base_url else "",
            )
            return models

        except Exception:
            # Listing is advisory (it feeds a UI dropdown): log with
            # traceback and degrade to an empty list.
            logger.exception("Error fetching OpenAI embedding models")
            return []