diff --git a/docs/my-website/docs/routing.md b/docs/my-website/docs/routing.md index 47967775e1e..b5ece7237aa 100644 --- a/docs/my-website/docs/routing.md +++ b/docs/my-website/docs/routing.md @@ -828,7 +828,12 @@ asyncio.run(router_acompletion()) ``` - + +## Traffic Mirroring / Silent Experiments + +Traffic mirroring allows you to "mimic" production traffic to a secondary (silent) model for evaluation purposes. The silent model's response is gathered in the background and does not affect the latency or result of the primary request. + +[**See detailed guide on A/B Testing - Traffic Mirroring here**](./traffic_mirroring.md) ## Basic Reliability diff --git a/docs/my-website/docs/traffic_mirroring.md b/docs/my-website/docs/traffic_mirroring.md new file mode 100644 index 00000000000..3bdcb0f1614 --- /dev/null +++ b/docs/my-website/docs/traffic_mirroring.md @@ -0,0 +1,83 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# A/B Testing - Traffic Mirroring + +Traffic mirroring allows you to "mimic" production traffic to a secondary (silent) model for evaluation purposes. The silent model's response is gathered in the background and does not affect the latency or result of the primary request. + +This is useful for: +- Testing a new model's performance on production prompts before switching. +- Comparing costs and latency between different providers. +- Debugging issues by mirroring traffic to a more verbose model. + +## Quick Start + +To enable traffic mirroring, add `silent_model` to the `litellm_params` of a deployment. + + + + +```python +from litellm import Router + +model_list = [ + { + "model_name": "gpt-3.5-turbo", + "litellm_params": { + "model": "azure/chatgpt-v-2", + "api_key": "...", + "silent_model": "gpt-4" # 👈 Mirror traffic to gpt-4 + }, + }, + { + "model_name": "gpt-4", + "litellm_params": { + "model": "openai/gpt-4", + "api_key": "..." 
+ }, + } +] + +router = Router(model_list=model_list) + +# The request to "gpt-3.5-turbo" will trigger a background call to "gpt-4" +response = await router.acompletion( + model="gpt-3.5-turbo", + messages=[{"role": "user", "content": "How does traffic mirroring work?"}] +) +``` + + + + +Add `silent_model` to your `config.yaml`: + +```yaml +model_list: + - model_name: primary-model + litellm_params: + model: azure/gpt-35-turbo + api_key: os.environ/AZURE_API_KEY + silent_model: evaluation-model # 👈 Mirror traffic here + - model_name: evaluation-model + litellm_params: + model: openai/gpt-4o + api_key: os.environ/OPENAI_API_KEY +``` + + + + +## How it works +1. **Request Received**: A request is made to a model group (e.g. `primary-model`). +2. **Deployment Picked**: LiteLLM picks a deployment from the group. +3. **Primary Call**: LiteLLM makes the call to the primary deployment. +4. **Mirroring**: If `silent_model` is present, LiteLLM triggers a background call to that model. + - For **Sync** calls: Uses a shared thread pool. + - For **Async** calls: Uses `asyncio.create_task`. +5. **Isolation**: The background call uses a `deepcopy` of the original request parameters and sets `metadata["is_silent_experiment"] = True`. It also strips out logging IDs to prevent collisions in usage tracking. + +## Key Features +- **Latency Isolation**: The primary request returns as soon as it's ready. The background (silent) call does not block. +- **Unified Logging**: Background calls are processed via the Router, meaning they are automatically logged to your configured observability tools (Langfuse, S3, etc.). +- **Evaluation**: Use the `is_silent_experiment: True` flag in your logs to filter and compare results between the primary and mirrored calls. 
diff --git a/docs/my-website/sidebars.js b/docs/my-website/sidebars.js index 38a26f6b183..9cbf2f81dcf 100644 --- a/docs/my-website/sidebars.js +++ b/docs/my-website/sidebars.js @@ -351,6 +351,7 @@ const sidebars = { label: "Load Balancing, Routing, Fallbacks", href: "https://docs.litellm.ai/docs/routing-load-balancing", }, + "traffic_mirroring", { type: "category", label: "Logging, Alerting, Metrics", diff --git a/litellm/router.py b/litellm/router.py index 0b07d5ed8c1..38b486dcd8f 100644 --- a/litellm/router.py +++ b/litellm/router.py @@ -58,6 +58,7 @@ _get_parent_otel_span_from_kwargs, get_metadata_variable_name_from_kwargs, ) +from litellm.litellm_core_utils.thread_pool_executor import executor from litellm.litellm_core_utils.coroutine_checker import coroutine_checker from litellm.litellm_core_utils.credential_accessor import CredentialAccessor from litellm.litellm_core_utils.dd_tracing import tracer @@ -1238,10 +1239,25 @@ def _completion( specific_deployment=kwargs.pop("specific_deployment", None), request_kwargs=kwargs, ) - self._update_kwargs_with_deployment(deployment=deployment, kwargs=kwargs) + # Check for silent model experiment + # Make a local copy of litellm_params to avoid mutating the Router's state + litellm_params = deployment["litellm_params"].copy() + silent_model = litellm_params.pop("silent_model", None) + + if silent_model is not None: + # Mirroring traffic to a secondary model + # Use shared thread pool for background calls + executor.submit( + self._silent_experiment_completion, + silent_model, + messages, + **kwargs, + ) + self._update_kwargs_with_deployment(deployment=deployment, kwargs=kwargs) + kwargs.pop("silent_model", None) # Ensure it's not in kwargs either # No copy needed - data is only read and spread into new dict below - data = deployment["litellm_params"] + data = litellm_params.copy() # Use the local copy of litellm_params model_name = data["model"] potential_model_client = self._get_client( deployment=deployment, 
kwargs=kwargs @@ -1262,15 +1278,14 @@ def _completion( if not self.has_model_id(model): self.routing_strategy_pre_call_checks(deployment=deployment) - response = litellm.completion( - **{ - **data, - "messages": messages, - "caching": self.cache_responses, - "client": model_client, - **kwargs, - } - ) + input_kwargs = { + **data, + "messages": messages, + "caching": self.cache_responses, + "client": model_client, + **kwargs, + } + response = litellm.completion(**input_kwargs) verbose_router_logger.info( f"litellm.completion(model={model_name})\033[32m 200 OK\033[0m" ) @@ -1297,6 +1312,56 @@ def _completion( self._set_deployment_num_retries_on_exception(e, deployment) raise e + def _get_silent_experiment_kwargs(self, **kwargs) -> dict: + """ + Prepare kwargs for a silent experiment by ensuring isolation from the primary call. + """ + # Copy kwargs to ensure isolation + silent_kwargs = copy.deepcopy(kwargs) + if "metadata" not in silent_kwargs: + silent_kwargs["metadata"] = {} + + silent_kwargs["metadata"]["is_silent_experiment"] = True + + # Pop logging objects and call IDs to ensure a fresh logging context + # This prevents collisions in the Proxy's database (spend_logs) + silent_kwargs.pop("litellm_call_id", None) + silent_kwargs.pop("litellm_logging_obj", None) + silent_kwargs.pop("standard_logging_object", None) + silent_kwargs.pop("proxy_server_request", None) + + return silent_kwargs + + def _silent_experiment_completion( + self, silent_model: str, messages: List[Any], **kwargs + ): + """ + Run a silent experiment in the background (thread). 
+ """ + try: + # Prevent infinite recursion if silent model also has a silent model + if kwargs.get("metadata", {}).get("is_silent_experiment", False): + return + + messages = copy.deepcopy(messages) + + verbose_router_logger.info( + f"Starting silent experiment for model {silent_model}" + ) + + silent_kwargs = self._get_silent_experiment_kwargs(**kwargs) + + # Trigger the silent request + self.completion( + model=silent_model, + messages=cast(List[Dict[str, str]], messages), + **silent_kwargs, + ) + except Exception as e: + verbose_router_logger.error( + f"Silent experiment failed for model {silent_model}: {str(e)}" + ) + # fmt: off @overload @@ -1505,6 +1570,36 @@ async def stream_with_fallbacks(): return FallbackStreamWrapper(stream_with_fallbacks()) + async def _silent_experiment_acompletion( + self, silent_model: str, messages: List[Any], **kwargs + ): + """ + Run a silent experiment in the background. + """ + try: + # Prevent infinite recursion if silent model also has a silent model + if kwargs.get("metadata", {}).get("is_silent_experiment", False): + return + + messages = copy.deepcopy(messages) + + verbose_router_logger.info( + f"Starting silent experiment for model {silent_model}" + ) + + silent_kwargs = self._get_silent_experiment_kwargs(**kwargs) + + # Trigger the silent request + await self.acompletion( + model=silent_model, + messages=cast(List[AllMessageValues], messages), + **silent_kwargs, + ) + except Exception as e: + verbose_router_logger.error( + f"Silent experiment failed for model {silent_model}: {str(e)}" + ) + async def _acompletion( # noqa: PLR0915 self, model: str, messages: List[Dict[str, str]], **kwargs ) -> Union[ModelResponse, CustomStreamWrapper,]: @@ -1551,9 +1646,27 @@ async def _acompletion( # noqa: PLR0915 self._track_deployment_metrics( deployment=deployment, parent_otel_span=parent_otel_span ) + + # Check for silent model experiment + # Make a local copy of litellm_params to avoid mutating the Router's state + litellm_params = 
import asyncio
import time
from unittest.mock import MagicMock, patch

import pytest

import litellm
from litellm.router import Router


def _build_router() -> Router:
    """Return a Router whose "primary-model" deployment mirrors traffic to "silent-model"."""
    return Router(
        model_list=[
            {
                "model_name": "primary-model",
                "litellm_params": {
                    "model": "openai/gpt-3.5-turbo",
                    "api_key": "fake-key",
                    "silent_model": "silent-model",
                },
            },
            {
                "model_name": "silent-model",
                "litellm_params": {
                    "model": "openai/gpt-4",
                    "api_key": "fake-key",
                },
            },
        ]
    )


def _mock_response():
    """Return the canned ModelResponse that every mocked call resolves to."""
    return litellm.ModelResponse(choices=[{"message": {"content": "hello"}}])


def _silent_flag(call):
    """Return the `is_silent_experiment` metadata value of a recorded mock call (None if unset)."""
    return call.kwargs.get("metadata", {}).get("is_silent_experiment")


def _assert_mirrored_calls(mocked_call: MagicMock) -> None:
    """
    Assert that `mocked_call` saw exactly one primary and one silent call,
    and that `silent_model` was stripped from both.
    """
    # Should have 2 calls: one for primary, one for silent
    assert mocked_call.call_count == 2

    recorded = mocked_call.call_args_list

    # `silent_model` is a router-level setting and must never reach litellm
    for call in recorded:
        assert "silent_model" not in call.kwargs

    # The silent call is the one flagged in metadata; it targets the mirror deployment
    silent_call = next((c for c in recorded if _silent_flag(c) is True), None)
    assert silent_call is not None
    assert silent_call.kwargs["model"] == "openai/gpt-4"

    # The primary call carries no flag and targets the primary deployment
    primary_call = next((c for c in recorded if not _silent_flag(c)), None)
    assert primary_call is not None
    assert primary_call.kwargs["model"] == "openai/gpt-3.5-turbo"


@pytest.mark.asyncio
async def test_router_silent_experiment_acompletion():
    """
    Test that silent_model triggers a background acompletion call
    and that the silent_model parameter is stripped from both calls.
    """
    router = _build_router()

    # litellm.acompletion is replaced by a plain MagicMock that returns an
    # already-resolved future; the same future is handed out for every call.
    mock_acompletion = MagicMock()
    resolved = asyncio.Future()
    resolved.set_result(_mock_response())
    mock_acompletion.return_value = resolved

    with patch("litellm.acompletion", mock_acompletion):
        response = await router.acompletion(
            model="primary-model",
            messages=[{"role": "user", "content": "hi"}],
        )

        assert response.choices[0].message.content == "hello"

        # Give the background task a moment to trigger (it's an asyncio task).
        # This must happen while the patch is still active so the mirrored
        # call reaches the mock.
        await asyncio.sleep(0.1)

        _assert_mirrored_calls(mock_acompletion)


def test_router_silent_experiment_completion():
    """
    Test that silent_model triggers a background completion call (sync)
    and that the silent_model parameter is stripped.
    """
    router = _build_router()

    mock_completion = MagicMock()
    mock_completion.return_value = _mock_response()

    with patch("litellm.completion", mock_completion):
        response = router.completion(
            model="primary-model",
            messages=[{"role": "user", "content": "hi"}],
        )

        assert response.choices[0].message.content == "hello"

        # The sync background call runs on a thread pool; wait for it while the
        # patch is still active so the mirrored call reaches the mock.
        time.sleep(0.5)

        _assert_mirrored_calls(mock_completion)