39 changes: 24 additions & 15 deletions docs/features/semantic-caching.mdx
Expand Up @@ -118,7 +118,6 @@ import (
cacheConfig := &semanticcache.Config{
// Embedding model configuration (Required)
Provider: schemas.OpenAI,
Keys: []schemas.Key{{Value: "sk-..."}},
EmbeddingModel: "text-embedding-3-small",
Dimension: 1536,

Expand Down Expand Up @@ -155,22 +154,32 @@ bifrostConfig := schemas.BifrostConfig{

![Semantic Cache Plugin Configuration](../media/ui-semantic-cache-config.png)

**Note**: Make sure you have a vector store setup (using `config.json`) before configuring the semantic cache plugin.
**Prerequisites**: Before the toggle becomes available, a vector store must be configured and enabled in `config.json`, and at least one provider must be configured.

1. **Navigate to Settings**
- Open Bifrost UI at `http://localhost:8080`
- Go to Settings.
1. **Navigate to the Config page** in the Bifrost UI and find the **Plugins** section.

2. **Configure Semantic Cache Plugin**
2. **Turn on** the **Enable Semantic Caching** switch. The configuration form expands below.

- Toggle the plugin switch to enable it, and fill in the required fields.
3. **Fill in the fields** across the four sections:

**Required Fields:**
- **Provider**: The provider to use for caching.
- **Embedding Model**: The embedding model to use for caching.
- **Dimension**: The embedding dimension for the configured embedding model.
**Provider and Model Settings** (required for semantic mode):
- **Configured Providers**: Dropdown of providers already set up in Bifrost. The selected provider's API keys are inherited automatically.
- **Embedding Model**: The embedding model to use (e.g. `text-embedding-3-small`).

**Note**: Changes will need a restart of the Bifrost server to take effect, because the plugin is loaded on startup only.
**Cache Settings**:
- **TTL (seconds)**: How long cached responses are kept (default: 300s).
- **Similarity Threshold**: Cosine similarity cutoff for a cache hit (0–1, default: 0.8).
- **Dimension**: Vector dimension matching your embedding model (e.g. 1536 for `text-embedding-3-small`).

**Conversation Settings**:
- **Conversation History Threshold**: Skip caching when the conversation has more than this many messages (default: 3).
- **Exclude System Prompt** (toggle): Exclude system messages from cache-key generation.

**Cache Behavior**:
- **Cache by Model** (toggle): Include the model name in the cache key (default: on).
- **Cache by Provider** (toggle): Include the provider name in the cache key (default: on).

4. Click **Save**. Changes are persisted and applied immediately for enabled plugins via the API reload path; other plugin changes (e.g. via `config.json`) may still require a restart.
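The two **Cache Behavior** toggles above control what feeds into the cache key. A minimal sketch of the idea, with a hypothetical `buildCacheKey` helper (not the plugin's actual implementation):

```go
package main

import (
	"crypto/sha256"
	"encoding/hex"
	"fmt"
	"strings"
)

// buildCacheKey illustrates what "Cache by Model" and "Cache by Provider"
// control: whether the provider and model names are mixed into the lookup
// key. Hypothetical helper for illustration only.
func buildCacheKey(provider, model, requestHash string, byProvider, byModel bool) string {
	parts := []string{requestHash}
	if byProvider {
		parts = append(parts, provider)
	}
	if byModel {
		parts = append(parts, model)
	}
	sum := sha256.Sum256([]byte(strings.Join(parts, "|")))
	return hex.EncodeToString(sum[:])
}

func main() {
	h := "req-hash"
	// With both toggles on, the same prompt on different models misses the cache.
	fmt.Println(buildCacheKey("openai", "gpt-4o", h, true, true) !=
		buildCacheKey("openai", "gpt-4o-mini", h, true, true)) // true
	// With "Cache by Model" off, the same prompt hits across models.
	fmt.Println(buildCacheKey("openai", "gpt-4o", h, true, false) ==
		buildCacheKey("openai", "gpt-4o-mini", h, true, false)) // true
}
```

Turning a toggle off therefore widens cache sharing: responses cached under one model or provider can be served for another.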

</Tab>

Expand Down Expand Up @@ -202,7 +211,7 @@ bifrostConfig := schemas.BifrostConfig{
}
```

> **Note**: In `config.json` setups, provider keys are taken from the provider config on initialization, so you do not need to duplicate `keys` inside the plugin config. Any updates to the provider keys will not be reflected until next restart.
> **Note**: Provider API keys are inherited automatically from the global provider configuration. You do not need to (and cannot) specify keys inside the plugin config.

**TTL Format Options:**
- Duration strings: `"30s"`, `"5m"`, `"1h"`, `"24h"`
Expand All @@ -228,7 +237,7 @@ Exact-match direct entries are stored and retrieved using a deterministic cache

### Setup

To enable direct-only mode globally, set `dimension: 1` and omit the `provider` and `keys` fields from the plugin config. The plugin will automatically fall back to direct search only.
To enable direct-only mode globally, set `dimension: 1` and omit the `provider` and `embedding_model` fields from the plugin config. The plugin will automatically fall back to direct search only.

> **Important**: If you specify `dimension: 1` and also provide a `provider`, Bifrost treats the config as provider-backed semantic mode, not direct-only mode. To use direct-only mode, omit the `provider` field entirely.

Expand All @@ -246,7 +255,7 @@ import (
)

cacheConfig := &semanticcache.Config{
// No Provider, Keys, or EmbeddingModel -- direct hash mode only
// No Provider or EmbeddingModel -- direct hash mode only
Dimension: 1, // Placeholder; entries are stored as metadata-only (no embedding vectors). Change dimension before switching to dual-layer mode to avoid mixed-dimension issues.

TTL: 5 * time.Minute,
Expand Down
149 changes: 48 additions & 101 deletions plugins/semanticcache/main.go
Expand Up @@ -15,7 +15,6 @@ import (

bifrost "github.com/maximhq/bifrost/core"
"github.com/maximhq/bifrost/core/schemas"
"github.com/maximhq/bifrost/framework"
"github.com/maximhq/bifrost/framework/vectorstore"
)

Expand All @@ -25,7 +24,6 @@ import (
type Config struct {
// Embedding Model settings - REQUIRED for semantic caching
Provider schemas.ModelProvider `json:"provider"`
Keys []schemas.Key `json:"keys"`
EmbeddingModel string `json:"embedding_model,omitempty"` // Model to use for generating embeddings (optional)

// Plugin behavior settings
Expand All @@ -48,19 +46,18 @@ type Config struct {
func (c *Config) UnmarshalJSON(data []byte) error {
// Define a temporary struct to avoid infinite recursion
type TempConfig struct {
Provider string `json:"provider"`
Keys []schemas.Key `json:"keys"`
EmbeddingModel string `json:"embedding_model,omitempty"`
CleanUpOnShutdown bool `json:"cleanup_on_shutdown,omitempty"`
Dimension int `json:"dimension"`
TTL interface{} `json:"ttl,omitempty"`
Threshold float64 `json:"threshold,omitempty"`
VectorStoreNamespace string `json:"vector_store_namespace,omitempty"`
DefaultCacheKey string `json:"default_cache_key,omitempty"`
ConversationHistoryThreshold int `json:"conversation_history_threshold,omitempty"`
CacheByModel *bool `json:"cache_by_model,omitempty"`
CacheByProvider *bool `json:"cache_by_provider,omitempty"`
ExcludeSystemPrompt *bool `json:"exclude_system_prompt,omitempty"`
Provider string `json:"provider"`
EmbeddingModel string `json:"embedding_model,omitempty"`
CleanUpOnShutdown bool `json:"cleanup_on_shutdown,omitempty"`
Dimension int `json:"dimension"`
TTL interface{} `json:"ttl,omitempty"`
Threshold float64 `json:"threshold,omitempty"`
VectorStoreNamespace string `json:"vector_store_namespace,omitempty"`
DefaultCacheKey string `json:"default_cache_key,omitempty"`
ConversationHistoryThreshold int `json:"conversation_history_threshold,omitempty"`
CacheByModel *bool `json:"cache_by_model,omitempty"`
CacheByProvider *bool `json:"cache_by_provider,omitempty"`
ExcludeSystemPrompt *bool `json:"exclude_system_prompt,omitempty"`
}

var temp TempConfig
Expand All @@ -70,7 +67,6 @@ func (c *Config) UnmarshalJSON(data []byte) error {

// Set simple fields
c.Provider = schemas.ModelProvider(temp.Provider)
c.Keys = temp.Keys
c.EmbeddingModel = temp.EmbeddingModel
c.CleanUpOnShutdown = temp.CleanUpOnShutdown
c.Dimension = temp.Dimension
Expand Down Expand Up @@ -129,6 +125,10 @@ type StreamAccumulator struct {
mu sync.Mutex // Protects chunk operations
}

// EmbeddingRequestExecutor is a function that executes a request and returns a response and an error.
// It maps to .EmbeddingRequest() of the bifrost client.
type EmbeddingRequestExecutor func(ctx *schemas.BifrostContext, req *schemas.BifrostEmbeddingRequest) (*schemas.BifrostEmbeddingResponse, *schemas.BifrostError)

// Plugin implements the schemas.LLMPlugin interface for semantic caching.
// It caches responses using a two-tier approach: direct hash matching for exact requests
// and semantic similarity search for related content. The plugin supports configurable caching behavior
Expand All @@ -139,12 +139,12 @@ type StreamAccumulator struct {
// - config: Plugin configuration including semantic cache and caching settings
// - logger: Logger instance for plugin operations
type Plugin struct {
store vectorstore.VectorStore
config *Config
logger schemas.Logger
client *bifrost.Bifrost
streamAccumulators sync.Map // Track stream accumulators by request ID
waitGroup sync.WaitGroup
store vectorstore.VectorStore
config *Config
logger schemas.Logger
embeddingRequestExecutor EmbeddingRequestExecutor
streamAccumulators sync.Map // Track stream accumulators by request ID
waitGroup sync.WaitGroup
}

// Plugin constants
Expand Down Expand Up @@ -201,45 +201,6 @@ var VectorStoreProperties = map[string]vectorstore.VectorStoreProperties{
},
}

type PluginAccount struct {
provider schemas.ModelProvider
keys []schemas.Key
}

func (pa *PluginAccount) GetConfiguredProviders() ([]schemas.ModelProvider, error) {
return []schemas.ModelProvider{pa.provider}, nil
}

func (pa *PluginAccount) GetKeysForProvider(ctx context.Context, providerKey schemas.ModelProvider) ([]schemas.Key, error) {
return pa.keys, nil
}

func (pa *PluginAccount) GetConfigForProvider(providerKey schemas.ModelProvider) (*schemas.ProviderConfig, error) {
return &schemas.ProviderConfig{
NetworkConfig: schemas.DefaultNetworkConfig,
ConcurrencyAndBufferSize: schemas.DefaultConcurrencyAndBufferSize,
}, nil
}

// Dependencies is a list of dependencies that the plugin requires.
var Dependencies []framework.FrameworkDependency = []framework.FrameworkDependency{framework.FrameworkDependencyVectorStore}

// ProvidersWithEmbeddingSupport lists all providers that support embedding operations.
// Providers not in this list will return UnsupportedOperationError for embedding requests.
var ProvidersWithEmbeddingSupport = map[schemas.ModelProvider]bool{
schemas.OpenAI: true,
schemas.Azure: true,
schemas.Bedrock: true,
schemas.Cohere: true,
schemas.Gemini: true,
schemas.Vertex: true,
schemas.Mistral: true,
schemas.Ollama: true,
schemas.Nebius: true,
schemas.HuggingFace: true,
schemas.SGL: true,
}

const (
CacheKey schemas.BifrostContextKey = "semantic_cache_key" // To set the cache key for a request - REQUIRED for all requests
CacheTTLKey schemas.BifrostContextKey = "semantic_cache_ttl" // To explicitly set the TTL for a request
Expand Down Expand Up @@ -323,26 +284,8 @@ func Init(ctx context.Context, config *Config, logger schemas.Logger, store vect

if config.Provider == "" && config.Dimension == 1 {
logger.Info(PluginLoggerPrefix + " Starting in direct-only mode (dimension=1, no embedding provider)")
} else if config.Provider == "" || len(config.Keys) == 0 {
logger.Warn(PluginLoggerPrefix + " Incomplete semantic mode config: missing provider or keys, falling back to direct search only")
} else {
// Validate that the provider supports embeddings
if bifrost.IsStandardProvider(config.Provider) && !ProvidersWithEmbeddingSupport[config.Provider] {
return nil, fmt.Errorf("provider '%s' does not support embedding operations required for semantic cache. Supported providers: openai, azure, bedrock, cohere, gemini, vertex, mistral, ollama, nebius, huggingface, sgl. Note: custom providers based on embedding-capable providers are also supported", config.Provider)
}

bifrost, err := bifrost.Init(ctx, schemas.BifrostConfig{
Logger: logger,
Account: &PluginAccount{
provider: config.Provider,
keys: config.Keys,
},
})
if err != nil {
return nil, fmt.Errorf("failed to initialize bifrost for semantic cache: %w", err)
}

plugin.client = bifrost
} else if config.Provider == "" {
logger.Warn(PluginLoggerPrefix + " Incomplete semantic mode config: missing provider, falling back to direct search only")
}

createCtx, cancel := context.WithTimeout(ctx, CreateNamespaceTimeout)
Expand Down Expand Up @@ -378,19 +321,6 @@ func (plugin *Plugin) HTTPTransportStreamChunkHook(ctx *schemas.BifrostContext,
return chunk, nil
}

func (plugin *Plugin) clearRequestScopedContext(ctx *schemas.BifrostContext) {
ctx.ClearValue(requestIDKey)
ctx.ClearValue(requestStorageIDKey)
ctx.ClearValue(requestHashKey)
ctx.ClearValue(requestParamsHashKey)
ctx.ClearValue(requestModelKey)
ctx.ClearValue(requestProviderKey)
ctx.ClearValue(requestEmbeddingKey)
ctx.ClearValue(requestEmbeddingTokensKey)
ctx.ClearValue(isCacheHitKey)
ctx.ClearValue(cacheHitTypeKey)
}

// PreLLMHook is called before a request is processed by Bifrost.
// It performs a two-stage cache lookup: first direct hash matching, then semantic similarity search.
// Uses UUID-based keys for entries stored in the VectorStore.
Expand Down Expand Up @@ -465,7 +395,7 @@ func (plugin *Plugin) PreLLMHook(ctx *schemas.BifrostContext, req *schemas.Bifro
}
}

if performSemanticSearch && plugin.client != nil {
if performSemanticSearch && plugin.embeddingRequestExecutor != nil {
if req.EmbeddingRequest != nil || req.TranscriptionRequest != nil {
plugin.logger.Debug(PluginLoggerPrefix + " Skipping semantic search for embedding/transcription input")
// For vector stores that require vectors, set a zero vector placeholder
Expand All @@ -488,7 +418,7 @@ func (plugin *Plugin) PreLLMHook(ctx *schemas.BifrostContext, req *schemas.Bifro
if shortCircuit != nil {
return req, shortCircuit, nil
}
} else if !performSemanticSearch && plugin.store.RequiresVectors() && plugin.client != nil {
} else if !performSemanticSearch && plugin.store.RequiresVectors() && plugin.embeddingRequestExecutor != nil {
⚠️ Potential issue | 🟠 Major

Direct-only vector placeholder should not depend on executor presence.

The branch that sets the zero-vector placeholder doesn’t call the executor, but it is currently gated by plugin.embeddingRequestExecutor != nil. That can suppress vector-required storage prep in direct-only paths when executor wiring is absent.

🔧 Proposed fix
-	} else if !performSemanticSearch && plugin.store.RequiresVectors() && plugin.embeddingRequestExecutor != nil {
+	} else if !performSemanticSearch && plugin.store.RequiresVectors() {

Also applies to: 428-441

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@plugins/semanticcache/main.go` at line 421, The branch that sets the
zero-vector placeholder is incorrectly gated on plugin.embeddingRequestExecutor
!= nil; remove the executor nil check so the zero-vector placeholder is
initialized whenever !performSemanticSearch && plugin.store.RequiresVectors() is
true (leave the executor check only where actual embedding calls occur). Update
the conditional(s) around the zero-vector placeholder setup (the branch using
performSemanticSearch, plugin.store.RequiresVectors(), and
plugin.embeddingRequestExecutor) so the placeholder logic runs regardless of
embeddingRequestExecutor, while preserving executor-dependent code paths for
real embedding requests elsewhere in the same function.

// Vector store requires vectors but we're in direct-only mode
// Generate embeddings for storage purposes (not for searching)
if req.EmbeddingRequest != nil || req.TranscriptionRequest != nil {
Expand Down Expand Up @@ -759,11 +689,6 @@ func (plugin *Plugin) Cleanup() error {
// Clean up old stream accumulators first
plugin.cleanupOldStreamAccumulators()

// Shutdown the internal Bifrost client used for embeddings
if plugin.client != nil {
plugin.client.Shutdown()
}

// Only clean up cache entries if configured to do so
if !plugin.config.CleanUpOnShutdown {
plugin.logger.Debug(PluginLoggerPrefix + " Cleanup on shutdown is disabled, skipping cache cleanup")
Expand Down Expand Up @@ -804,6 +729,15 @@ func (plugin *Plugin) Cleanup() error {
return nil
}

// SetEmbeddingRequestExecutor sets the embedding request executor for the plugin.
// Needs to be set before the plugin is used.
//
// Parameters:
// - executor: The embedding request executor to set
func (plugin *Plugin) SetEmbeddingRequestExecutor(executor EmbeddingRequestExecutor) {
plugin.embeddingRequestExecutor = executor
}
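The wiring pattern this setter enables can be shown with stand-in types (in the real plugin these are `schemas.BifrostContext`, `schemas.BifrostEmbeddingRequest`, and so on; the stand-ins below exist only to keep the sketch self-contained):

```go
package main

import (
	"errors"
	"fmt"
)

// Stand-in request/response types for illustration only.
type embeddingRequest struct{ Input string }
type embeddingResponse struct{ Vector []float64 }

type embeddingExecutor func(req *embeddingRequest) (*embeddingResponse, error)

type cachePlugin struct {
	exec embeddingExecutor
}

// SetExecutor mirrors SetEmbeddingRequestExecutor: the host wires in its own
// embedding call instead of the plugin constructing a private Bifrost client
// with duplicated keys.
func (p *cachePlugin) SetExecutor(e embeddingExecutor) { p.exec = e }

func (p *cachePlugin) embed(input string) (*embeddingResponse, error) {
	if p.exec == nil {
		// Without an executor, the plugin can only do direct (hash-only) search.
		return nil, errors.New("no executor: direct-only mode")
	}
	return p.exec(&embeddingRequest{Input: input})
}

func main() {
	p := &cachePlugin{}
	if _, err := p.embed("hello"); err != nil {
		fmt.Println(err) // no executor: direct-only mode
	}
	p.SetExecutor(func(req *embeddingRequest) (*embeddingResponse, error) {
		return &embeddingResponse{Vector: []float64{0.1, 0.2}}, nil
	})
	resp, _ := p.embed("hello")
	fmt.Println(len(resp.Vector)) // 2
}
```

In the real integration, the injected function would presumably wrap the host's Bifrost client (for example, its embedding-request method), which is what lets provider keys be inherited from the global configuration.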

// Public Methods for External Use

// ClearCacheForKey deletes cache entries for a specific cache key.
Expand Down Expand Up @@ -869,3 +803,16 @@ func (plugin *Plugin) ClearCacheForRequestID(requestID string) error {

return nil
}

func (plugin *Plugin) clearRequestScopedContext(ctx *schemas.BifrostContext) {
ctx.ClearValue(requestIDKey)
ctx.ClearValue(requestStorageIDKey)
ctx.ClearValue(requestHashKey)
ctx.ClearValue(requestParamsHashKey)
ctx.ClearValue(requestModelKey)
ctx.ClearValue(requestProviderKey)
ctx.ClearValue(requestEmbeddingKey)
ctx.ClearValue(requestEmbeddingTokensKey)
ctx.ClearValue(isCacheHitKey)
ctx.ClearValue(cacheHitTypeKey)
}