From d6d6c95b66fc366663911b42bf09620b996d3a2f Mon Sep 17 00:00:00 2001 From: hzt <3061613175@qq.com> Date: Fri, 20 Feb 2026 13:07:20 +0800 Subject: [PATCH] feat(semantic-cache): support configurable vector dimensions for Qdrant Add vector_size parameter to QdrantSemanticCache and expose it through the Cache facade as qdrant_semantic_cache_vector_size. This allows users to use embedding models with dimensions other than the default 1536, enabling cheaper/stronger models like Stella (1024d), bge-en-icl (4096d), voyage, cohere, etc. The parameter defaults to QDRANT_VECTOR_SIZE (env var or 1536) for backward compatibility. When creating new collections, the configured vector_size is used instead of the hardcoded constant. Existing collections are not altered, so vector_size only takes effect when the collection is first created. Closes #9377 --- docs/my-website/docs/caching/all_caches.md | 2 + docs/my-website/docs/proxy/caching.md | 1 + docs/my-website/docs/proxy/config_settings.md | 1 + litellm/caching/caching.py | 2 + litellm/caching/qdrant_semantic_cache.py | 4 +- .../caching/test_qdrant_semantic_cache.py | 129 +++++++++++++++++- 6 files changed, 137 insertions(+), 2 deletions(-) diff --git a/docs/my-website/docs/caching/all_caches.md b/docs/my-website/docs/caching/all_caches.md index 37fb8bc360a2..6f81da9105aa 100644 --- a/docs/my-website/docs/caching/all_caches.md +++ b/docs/my-website/docs/caching/all_caches.md @@ -297,6 +297,7 @@ litellm.cache = Cache( similarity_threshold=0.7, # similarity threshold for cache hits, 0 == no similarity, 1 = exact matches, 0.5 == 50% similarity qdrant_quantization_config ="binary", # can be one of 'binary', 'product' or 'scalar' quantizations that is supported by qdrant qdrant_semantic_cache_embedding_model="text-embedding-ada-002", # this model is passed to litellm.embedding(), any litellm.embedding() model is supported here + qdrant_semantic_cache_vector_size=1536, # vector size for the embedding model, must match the dimensionality of the embedding model used ) response1 = completion( @@ -635,6 +636,7 @@ def __init__( 
qdrant_quantization_config: Optional[str] = None, qdrant_semantic_cache_embedding_model="text-embedding-ada-002", + qdrant_semantic_cache_vector_size: Optional[int] = None, **kwargs ): ``` diff --git a/docs/my-website/docs/proxy/caching.md b/docs/my-website/docs/proxy/caching.md index 3cb9e9f3fe43..3357dcb28b27 100644 --- a/docs/my-website/docs/proxy/caching.md +++ b/docs/my-website/docs/proxy/caching.md @@ -340,6 +340,7 @@ litellm_settings: qdrant_semantic_cache_embedding_model: openai-embedding # the model should be defined on the model_list qdrant_collection_name: test_collection qdrant_quantization_config: binary + qdrant_semantic_cache_vector_size: 1536 # vector size must match embedding model dimensionality similarity_threshold: 0.8 # similarity threshold for semantic cache ``` diff --git a/docs/my-website/docs/proxy/config_settings.md b/docs/my-website/docs/proxy/config_settings.md index 9e3b5e909781..121d16241396 100644 --- a/docs/my-website/docs/proxy/config_settings.md +++ b/docs/my-website/docs/proxy/config_settings.md @@ -73,6 +73,7 @@ litellm_settings: qdrant_semantic_cache_embedding_model: openai-embedding # the model should be defined on the model_list qdrant_collection_name: test_collection qdrant_quantization_config: binary + qdrant_semantic_cache_vector_size: 1536 # vector size must match embedding model dimensionality similarity_threshold: 0.8 # similarity threshold for semantic cache # Optional - S3 Cache Settings diff --git a/litellm/caching/caching.py b/litellm/caching/caching.py index a03bff606866..ad02d2ea891b 100644 --- a/litellm/caching/caching.py +++ b/litellm/caching/caching.py @@ -108,6 +108,7 @@ def __init__( qdrant_collection_name: Optional[str] = None, qdrant_quantization_config: Optional[str] = None, qdrant_semantic_cache_embedding_model: str = "text-embedding-ada-002", + qdrant_semantic_cache_vector_size: Optional[int] = None, # GCP IAM authentication parameters gcp_service_account: Optional[str] = None, gcp_ssl_ca_certs: 
Optional[str] = None, @@ -207,6 +208,7 @@ def __init__( similarity_threshold=similarity_threshold, quantization_config=qdrant_quantization_config, embedding_model=qdrant_semantic_cache_embedding_model, + vector_size=qdrant_semantic_cache_vector_size, ) elif type == LiteLLMCacheType.LOCAL: self.cache = InMemoryCache() diff --git a/litellm/caching/qdrant_semantic_cache.py b/litellm/caching/qdrant_semantic_cache.py index 0e77b5a6c211..181effa01d4b 100644 --- a/litellm/caching/qdrant_semantic_cache.py +++ b/litellm/caching/qdrant_semantic_cache.py @@ -31,6 +31,7 @@ def __init__( # noqa: PLR0915 quantization_config=None, embedding_model="text-embedding-ada-002", host_type=None, + vector_size=None, ): import os @@ -53,6 +54,7 @@ def __init__( # noqa: PLR0915 raise Exception("similarity_threshold must be provided, passed None") self.similarity_threshold = similarity_threshold self.embedding_model = embedding_model + self.vector_size = vector_size if vector_size is not None else QDRANT_VECTOR_SIZE headers = {} # check if defined as os.environ/ variable @@ -138,7 +140,7 @@ def __init__( # noqa: PLR0915 new_collection_status = self.sync_client.put( url=f"{self.qdrant_api_base}/collections/{self.collection_name}", json={ - "vectors": {"size": QDRANT_VECTOR_SIZE, "distance": "Cosine"}, + "vectors": {"size": self.vector_size, "distance": "Cosine"}, "quantization_config": quantization_params, }, headers=self.headers, diff --git a/tests/test_litellm/caching/test_qdrant_semantic_cache.py b/tests/test_litellm/caching/test_qdrant_semantic_cache.py index e7d934bf0e6f..fe6830693d66 100644 --- a/tests/test_litellm/caching/test_qdrant_semantic_cache.py +++ b/tests/test_litellm/caching/test_qdrant_semantic_cache.py @@ -408,4 +408,131 @@ async def test_qdrant_semantic_cache_async_set_cache(): ) # Verify async upsert was called - qdrant_cache.async_client.put.assert_called() \ No newline at end of file + qdrant_cache.async_client.put.assert_called() + +def 
test_qdrant_semantic_cache_custom_vector_size(): + """ + Test that QdrantSemanticCache uses a custom vector_size when creating a new collection. + Verifies that the vector size passed to the constructor is used in the Qdrant collection + creation payload instead of the default 1536. + """ + with patch("litellm.llms.custom_httpx.http_handler._get_httpx_client") as mock_sync_client, \ + patch("litellm.llms.custom_httpx.http_handler.get_async_httpx_client") as mock_async_client: + + # Mock the collection does NOT exist (so it will be created) + mock_exists_response = MagicMock() + mock_exists_response.status_code = 200 + mock_exists_response.json.return_value = {"result": {"exists": False}} + + # Mock the collection creation response + mock_create_response = MagicMock() + mock_create_response.status_code = 200 + mock_create_response.json.return_value = {"result": True} + + # Mock the collection details response after creation + mock_details_response = MagicMock() + mock_details_response.status_code = 200 + mock_details_response.json.return_value = {"result": {"status": "ok"}} + + mock_sync_client_instance = MagicMock() + mock_sync_client_instance.get.side_effect = [mock_exists_response, mock_details_response] + mock_sync_client_instance.put.return_value = mock_create_response + mock_sync_client.return_value = mock_sync_client_instance + + from litellm.caching.qdrant_semantic_cache import QdrantSemanticCache + + # Initialize with custom vector_size of 768 + qdrant_cache = QdrantSemanticCache( + collection_name="test_collection_768", + qdrant_api_base="http://test.qdrant.local", + qdrant_api_key="test_key", + similarity_threshold=0.8, + vector_size=768, + ) + + # Verify the vector_size attribute is set correctly + assert qdrant_cache.vector_size == 768 + + # Verify the PUT call to create the collection used vector_size=768 + put_call = mock_sync_client_instance.put.call_args + assert put_call is not None + create_payload = put_call.kwargs.get("json") or 
put_call[1].get("json") + assert create_payload["vectors"]["size"] == 768 + assert create_payload["vectors"]["distance"] == "Cosine" + + +def test_qdrant_semantic_cache_default_vector_size(): + """ + Test that QdrantSemanticCache defaults to QDRANT_VECTOR_SIZE (1536) when vector_size + is not provided, and stores it as self.vector_size. + """ + with patch("litellm.llms.custom_httpx.http_handler._get_httpx_client") as mock_sync_client, \ + patch("litellm.llms.custom_httpx.http_handler.get_async_httpx_client") as mock_async_client: + + # Mock the collection exists check + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = {"result": {"exists": True}} + + mock_sync_client_instance = MagicMock() + mock_sync_client_instance.get.return_value = mock_response + mock_sync_client.return_value = mock_sync_client_instance + + from litellm.caching.qdrant_semantic_cache import QdrantSemanticCache + from litellm.constants import QDRANT_VECTOR_SIZE + + # Initialize without vector_size + qdrant_cache = QdrantSemanticCache( + collection_name="test_collection", + qdrant_api_base="http://test.qdrant.local", + qdrant_api_key="test_key", + similarity_threshold=0.8, + ) + + # Verify it falls back to the default QDRANT_VECTOR_SIZE constant + assert qdrant_cache.vector_size == QDRANT_VECTOR_SIZE + + +def test_qdrant_semantic_cache_large_vector_size(): + """ + Test that QdrantSemanticCache supports large embedding dimensions (e.g. 4096, 8192) + for models like Stella, bge-en-icl, etc. 
+ """ + with patch("litellm.llms.custom_httpx.http_handler._get_httpx_client") as mock_sync_client, \ + patch("litellm.llms.custom_httpx.http_handler.get_async_httpx_client") as mock_async_client: + + # Mock the collection does NOT exist (so it will be created) + mock_exists_response = MagicMock() + mock_exists_response.status_code = 200 + mock_exists_response.json.return_value = {"result": {"exists": False}} + + mock_create_response = MagicMock() + mock_create_response.status_code = 200 + mock_create_response.json.return_value = {"result": True} + + mock_details_response = MagicMock() + mock_details_response.status_code = 200 + mock_details_response.json.return_value = {"result": {"status": "ok"}} + + mock_sync_client_instance = MagicMock() + mock_sync_client_instance.get.side_effect = [mock_exists_response, mock_details_response] + mock_sync_client_instance.put.return_value = mock_create_response + mock_sync_client.return_value = mock_sync_client_instance + + from litellm.caching.qdrant_semantic_cache import QdrantSemanticCache + + # Initialize with a large vector_size of 4096 + qdrant_cache = QdrantSemanticCache( + collection_name="test_collection_4096", + qdrant_api_base="http://test.qdrant.local", + qdrant_api_key="test_key", + similarity_threshold=0.8, + vector_size=4096, + ) + + assert qdrant_cache.vector_size == 4096 + + # Verify the collection was created with 4096 + put_call = mock_sync_client_instance.put.call_args + create_payload = put_call.kwargs.get("json") or put_call[1].get("json") + assert create_payload["vectors"]["size"] == 4096