Merged
7 changes: 6 additions & 1 deletion docs/my-website/docs/routing.md
Expand Up @@ -828,7 +828,12 @@ asyncio.run(router_acompletion())
```

</TabItem>
</Tabs>

## Traffic Mirroring / Silent Experiments

Traffic mirroring lets you duplicate ("mirror") production traffic to a secondary (silent) model for evaluation purposes. The silent model's response is gathered in the background and does not affect the latency or the result of the primary request.

[**See detailed guide on A/B Testing - Traffic Mirroring here**](./traffic_mirroring.md)

## Basic Reliability

Expand Down
83 changes: 83 additions & 0 deletions docs/my-website/docs/traffic_mirroring.md
@@ -0,0 +1,83 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';

# A/B Testing - Traffic Mirroring

Traffic mirroring lets you duplicate ("mirror") production traffic to a secondary (silent) model for evaluation purposes. The silent model's response is gathered in the background and does not affect the latency or the result of the primary request.

This is useful for:
- Testing a new model's performance on production prompts before switching.
- Comparing costs and latency between different providers.
- Debugging issues by mirroring traffic to a more verbose model.

## Quick Start

To enable traffic mirroring, add `silent_model` to the `litellm_params` of a deployment.

<Tabs>
<TabItem value="sdk" label="SDK">

```python
from litellm import Router

model_list = [
{
"model_name": "gpt-3.5-turbo",
"litellm_params": {
"model": "azure/chatgpt-v-2",
"api_key": "...",
"silent_model": "gpt-4" # 👈 Mirror traffic to gpt-4
> **Reviewer (Member):** this interface feels incorrect. In `litellm_params` I shouldn't be referencing router `model_name`s (different hierarchies). Wouldn't it make more sense for this to be a router setting, similar to `model_group_alias`?

},
},
{
"model_name": "gpt-4",
"litellm_params": {
"model": "openai/gpt-4",
"api_key": "..."
},
}
]

router = Router(model_list=model_list)

# The request to "gpt-3.5-turbo" will trigger a background call to "gpt-4"
import asyncio

async def main():
    return await router.acompletion(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "How does traffic mirroring work?"}],
    )

response = asyncio.run(main())
```

</TabItem>
<TabItem value="proxy" label="Proxy">

Add `silent_model` to your `config.yaml`:

```yaml
model_list:
- model_name: primary-model
litellm_params:
model: azure/gpt-35-turbo
api_key: os.environ/AZURE_API_KEY
silent_model: evaluation-model # 👈 Mirror traffic here
- model_name: evaluation-model
litellm_params:
model: openai/gpt-4o
api_key: os.environ/OPENAI_API_KEY
```

</TabItem>
</Tabs>

## How it Works
1. **Request Received**: A request is made to a model group (e.g. `primary-model`).
2. **Deployment Picked**: LiteLLM picks a deployment from the group.
3. **Primary Call**: LiteLLM makes the call to the primary deployment.
4. **Mirroring**: If `silent_model` is present, LiteLLM triggers a background call to that model.
- For **Sync** calls: Uses a shared thread pool.
- For **Async** calls: Uses `asyncio.create_task`.
5. **Isolation**: The background call uses a `deepcopy` of the original request parameters and sets `metadata["is_silent_experiment"] = True`. It also strips out logging IDs to prevent collisions in usage tracking.
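The isolation step can be sketched as a standalone helper. This is a minimal sketch of the behavior described above, not the PR's actual code; `prepare_silent_kwargs` is a hypothetical name:

```python
import copy

def prepare_silent_kwargs(**kwargs):
    # Deep-copy so mutations in the background call never touch the primary request
    silent_kwargs = copy.deepcopy(kwargs)
    # Tag the mirrored call so it can be filtered in logs
    silent_kwargs.setdefault("metadata", {})["is_silent_experiment"] = True
    # Strip logging identifiers so the mirrored call gets a fresh logging context
    for key in ("litellm_call_id", "litellm_logging_obj",
                "standard_logging_object", "proxy_server_request"):
        silent_kwargs.pop(key, None)
    return silent_kwargs

primary = {"metadata": {"user": "abc"}, "litellm_call_id": "123", "temperature": 0.2}
mirrored = prepare_silent_kwargs(**primary)
print(mirrored["metadata"]["is_silent_experiment"])  # True
print("litellm_call_id" in mirrored)                 # False
print(primary["metadata"])                           # unchanged: {'user': 'abc'}
```

Because of the deep copy, the mirrored call can freely mutate its kwargs without affecting the in-flight primary request.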

## Key Features
- **Latency Isolation**: The primary request returns as soon as it's ready. The background (silent) call does not block.
- **Unified Logging**: Background calls are processed via the Router, meaning they are automatically logged to your configured observability tools (Langfuse, S3, etc.).
- **Evaluation**: Use the `is_silent_experiment: True` flag in your logs to filter and compare results between the primary and mirrored calls.
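For example, a downstream analysis script might separate mirrored calls from primary ones using that flag. The log-record shape below is illustrative; field names other than `metadata["is_silent_experiment"]` will depend on your logging integration:

```python
# Illustrative log records; only metadata["is_silent_experiment"] is set by this feature.
logs = [
    {"model": "azure/gpt-35-turbo", "latency_s": 0.8, "metadata": {}},
    {"model": "openai/gpt-4o", "latency_s": 1.4,
     "metadata": {"is_silent_experiment": True}},
]

mirrored = [r for r in logs if r["metadata"].get("is_silent_experiment")]
primary = [r for r in logs if not r["metadata"].get("is_silent_experiment")]

def avg(rows):
    # Average latency per group, for a side-by-side comparison
    return sum(r["latency_s"] for r in rows) / len(rows)

print(avg(primary), avg(mirrored))  # 0.8 1.4
```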
1 change: 1 addition & 0 deletions docs/my-website/sidebars.js
Expand Up @@ -351,6 +351,7 @@ const sidebars = {
label: "Load Balancing, Routing, Fallbacks",
href: "https://docs.litellm.ai/docs/routing-load-balancing",
},
"traffic_mirroring",
{
type: "category",
label: "Logging, Alerting, Metrics",
Expand Down
138 changes: 126 additions & 12 deletions litellm/router.py
Expand Up @@ -58,6 +58,7 @@
_get_parent_otel_span_from_kwargs,
get_metadata_variable_name_from_kwargs,
)
from litellm.litellm_core_utils.thread_pool_executor import executor
from litellm.litellm_core_utils.coroutine_checker import coroutine_checker
from litellm.litellm_core_utils.credential_accessor import CredentialAccessor
from litellm.litellm_core_utils.dd_tracing import tracer
Expand Down Expand Up @@ -1238,10 +1239,25 @@ def _completion(
specific_deployment=kwargs.pop("specific_deployment", None),
request_kwargs=kwargs,
)
self._update_kwargs_with_deployment(deployment=deployment, kwargs=kwargs)
# Check for silent model experiment
# Make a local copy of litellm_params to avoid mutating the Router's state
litellm_params = deployment["litellm_params"].copy()
silent_model = litellm_params.pop("silent_model", None)

if silent_model is not None:
# Mirroring traffic to a secondary model
# Use shared thread pool for background calls
executor.submit(
self._silent_experiment_completion,
silent_model,
messages,
**kwargs,
)

self._update_kwargs_with_deployment(deployment=deployment, kwargs=kwargs)
kwargs.pop("silent_model", None) # Ensure it's not in kwargs either
# No copy needed - data is only read and spread into new dict below
data = deployment["litellm_params"]
data = litellm_params.copy() # Use the local copy of litellm_params
model_name = data["model"]
potential_model_client = self._get_client(
deployment=deployment, kwargs=kwargs
Expand All @@ -1262,15 +1278,14 @@ def _completion(
if not self.has_model_id(model):
self.routing_strategy_pre_call_checks(deployment=deployment)

response = litellm.completion(
**{
**data,
"messages": messages,
"caching": self.cache_responses,
"client": model_client,
**kwargs,
}
)
input_kwargs = {
**data,
"messages": messages,
"caching": self.cache_responses,
"client": model_client,
**kwargs,
}
response = litellm.completion(**input_kwargs)
verbose_router_logger.info(
f"litellm.completion(model={model_name})\033[32m 200 OK\033[0m"
)
Expand All @@ -1297,6 +1312,56 @@ def _completion(
self._set_deployment_num_retries_on_exception(e, deployment)
raise e

def _get_silent_experiment_kwargs(self, **kwargs) -> dict:
"""
Prepare kwargs for a silent experiment by ensuring isolation from the primary call.
"""
# Copy kwargs to ensure isolation
silent_kwargs = copy.deepcopy(kwargs)
if "metadata" not in silent_kwargs:
silent_kwargs["metadata"] = {}

silent_kwargs["metadata"]["is_silent_experiment"] = True

# Pop logging objects and call IDs to ensure a fresh logging context
# This prevents collisions in the Proxy's database (spend_logs)
silent_kwargs.pop("litellm_call_id", None)
silent_kwargs.pop("litellm_logging_obj", None)
silent_kwargs.pop("standard_logging_object", None)
silent_kwargs.pop("proxy_server_request", None)

return silent_kwargs

def _silent_experiment_completion(
self, silent_model: str, messages: List[Any], **kwargs
):
"""
Run a silent experiment in the background (thread).
"""
try:
# Prevent infinite recursion if silent model also has a silent model
if kwargs.get("metadata", {}).get("is_silent_experiment", False):
return

messages = copy.deepcopy(messages)

verbose_router_logger.info(
f"Starting silent experiment for model {silent_model}"
)

silent_kwargs = self._get_silent_experiment_kwargs(**kwargs)

# Trigger the silent request
self.completion(
model=silent_model,
messages=cast(List[Dict[str, str]], messages),
**silent_kwargs,
)
except Exception as e:
verbose_router_logger.error(
f"Silent experiment failed for model {silent_model}: {str(e)}"
)

# fmt: off

@overload
Expand Down Expand Up @@ -1505,6 +1570,36 @@ async def stream_with_fallbacks():

return FallbackStreamWrapper(stream_with_fallbacks())

async def _silent_experiment_acompletion(
self, silent_model: str, messages: List[Any], **kwargs
):
"""
Run a silent experiment in the background.
"""
try:
# Prevent infinite recursion if silent model also has a silent model
if kwargs.get("metadata", {}).get("is_silent_experiment", False):
return

messages = copy.deepcopy(messages)

verbose_router_logger.info(
f"Starting silent experiment for model {silent_model}"
)

silent_kwargs = self._get_silent_experiment_kwargs(**kwargs)

# Trigger the silent request
await self.acompletion(
model=silent_model,
messages=cast(List[AllMessageValues], messages),
**silent_kwargs,
)
except Exception as e:
verbose_router_logger.error(
f"Silent experiment failed for model {silent_model}: {str(e)}"
)

async def _acompletion( # noqa: PLR0915
self, model: str, messages: List[Dict[str, str]], **kwargs
) -> Union[ModelResponse, CustomStreamWrapper,]:
Expand Down Expand Up @@ -1551,9 +1646,27 @@ async def _acompletion( # noqa: PLR0915
self._track_deployment_metrics(
deployment=deployment, parent_otel_span=parent_otel_span
)

# Check for silent model experiment
# Make a local copy of litellm_params to avoid mutating the Router's state
litellm_params = deployment["litellm_params"].copy()
silent_model = litellm_params.pop("silent_model", None)

if silent_model is not None:
# Mirroring traffic to a secondary model
# This is a silent experiment, so we don't want to block the primary request
asyncio.create_task(
self._silent_experiment_acompletion(
silent_model=silent_model,
messages=messages, # Use messages instead of *args
**kwargs,
)
)

self._update_kwargs_with_deployment(deployment=deployment, kwargs=kwargs)
kwargs.pop("silent_model", None) # Ensure it's not in kwargs either
# No copy needed - data is only read and spread into new dict below
data = deployment["litellm_params"]
data = litellm_params.copy() # Use the local copy of litellm_params

model_name = data["model"]

Expand All @@ -1570,6 +1683,7 @@ async def _acompletion( # noqa: PLR0915
"client": model_client,
**kwargs,
}
input_kwargs.pop("silent_model", None)

_response = litellm.acompletion(**input_kwargs)

Expand Down