From 2b3f9086d36a6bd0bc1a5037739c55ce62b1123c Mon Sep 17 00:00:00 2001
From: Prakhar Naval <prakhar@rockfish.ai>
Date: Sat, 21 Feb 2026 22:37:47 -0500
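Subject: [PATCH 1/4] fix(proxy): preserve actual model name in response, not
 alias

Stop overwriting the response `model` field with the client-requested
alias, which hid the actual model name from callers (see #21665).
Non-streaming responses and streaming chunks now keep the downstream
model name; only a leading provider prefix listed in LlmProvidersSet
(e.g. `hosted_vllm/`) is stripped. The `_litellm_client_requested_model`
request-data plumbing is removed because nothing reads it any more.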
---
 litellm/proxy/common_request_processing.py    | 114 +++++-------------
 litellm/proxy/proxy_server.py                 |  68 ++++-------
 .../proxy/test_response_model_sanitization.py |  49 +++++++-
 3 files changed, 98 insertions(+), 133 deletions(-)

diff --git a/litellm/proxy/common_request_processing.py b/litellm/proxy/common_request_processing.py
index 40fae4e4a56..8bbe9772353 100644
--- a/litellm/proxy/common_request_processing.py
+++ b/litellm/proxy/common_request_processing.py
@@ -44,7 +44,7 @@
 from litellm.proxy.route_llm_request import route_request
 from litellm.proxy.utils import ProxyLogging
 from litellm.router import Router
-from litellm.types.utils import ServerToolUse
+from litellm.types.utils import ServerToolUse, LlmProvidersSet
 
 # Type alias for streaming chunk serializer (chunk after hooks + cost injection -> wire format)
 StreamChunkSerializer = Callable[[Any], str]
@@ -248,85 +248,41 @@ async def combined_generator() -> AsyncGenerator[str, None]:
 def _override_openai_response_model(
     *,
     response_obj: Any,
-    requested_model: str,
     log_context: str,
 ) -> None:
     """
-    Force the OpenAI-compatible `model` field in the response to match what the client requested.
+    Strip known LiteLLM provider prefixes (e.g. hosted_vllm/) from the response model field.
 
-    LiteLLM internally prefixes some provider/deployment model identifiers (e.g. `hosted_vllm/...`).
-    That internal identifier should not be returned to clients in the OpenAI `model` field.
-
-    Note: This is intentionally verbose. A model mismatch is a useful signal that an internal
-    model identifier is being stamped/preserved somewhere in the request/response pipeline.
-    We log mismatches as warnings (and then restamp to the client-requested value) so these
-    paths stay observable for maintainers/operators without breaking client compatibility.
-
-    Errors are reserved for cases where the proxy cannot read/override the response model field.
-
-    Exception: If a fallback occurred (indicated by x-litellm-attempted-fallbacks header),
-    we should preserve the actual model that was used (the fallback model) rather than
-    overriding it with the originally requested model.
+    Previously this replaced response.model with the client-requested alias, but that
+    hid the actual model name from callers (see #21665). Now we only strip internal
+    provider routing prefixes, preserving the real model name.
     """
-    if not requested_model:
+    if isinstance(response_obj, dict):
+        downstream_model = response_obj.get("model")
+    elif hasattr(response_obj, "model"):
+        downstream_model = getattr(response_obj, "model", None)
+    else:
         return
 
-    # Check if a fallback occurred - if so, preserve the actual model used
-    hidden_params = getattr(response_obj, "_hidden_params", {}) or {}
-    if isinstance(hidden_params, dict):
-        fallback_headers = hidden_params.get("additional_headers", {}) or {}
-        attempted_fallbacks = fallback_headers.get(
-            "x-litellm-attempted-fallbacks", None
-        )
-        if attempted_fallbacks is not None and attempted_fallbacks > 0:
-            # A fallback occurred - preserve the actual model that was used
-            verbose_proxy_logger.debug(
-                "%s: fallback detected (attempted_fallbacks=%d), preserving actual model used instead of overriding to requested model.",
-                log_context,
-                attempted_fallbacks,
-            )
-            return
+    if not downstream_model or not isinstance(downstream_model, str):
+        return
 
-    if isinstance(response_obj, dict):
-        downstream_model = response_obj.get("model")
-        if downstream_model != requested_model:
-            verbose_proxy_logger.debug(
-                "%s: response model mismatch - requested=%r downstream=%r. Overriding response['model'] to requested model.",
-                log_context,
-                requested_model,
-                downstream_model,
-            )
-        response_obj["model"] = requested_model
+    if "/" not in downstream_model:
         return
 
-    if not hasattr(response_obj, "model"):
-        verbose_proxy_logger.error(
-            "%s: cannot override response model; missing `model` attribute. response_type=%s",
-            log_context,
-            type(response_obj),
-        )
+    prefix = downstream_model.split("/", 1)[0]
+    if prefix not in LlmProvidersSet:
         return
 
-    downstream_model = getattr(response_obj, "model", None)
-    if downstream_model != requested_model:
-        verbose_proxy_logger.debug(
-            "%s: response model mismatch - requested=%r downstream=%r. Overriding response.model to requested model.",
-            log_context,
-            requested_model,
-            downstream_model,
-        )
+    stripped = downstream_model.split("/", 1)[1]
 
-    try:
-        setattr(response_obj, "model", requested_model)
-    except Exception as e:
-        verbose_proxy_logger.error(
-            "%s: failed to override response.model=%r on response_type=%s. error=%s",
-            log_context,
-            requested_model,
-            type(response_obj),
-            str(e),
-            exc_info=True,
-        )
+    if isinstance(response_obj, dict):
+        response_obj["model"] = stripped
+    else:
+        try:
+            response_obj.model = stripped
+        except Exception:
+            pass
 
 
 def _get_cost_breakdown_from_logging_obj(
@@ -809,9 +765,6 @@ async def base_process_llm_request(
         """
         Common request processing logic for both chat completions and responses API endpoints
         """
-        requested_model_from_client: Optional[str] = (
-            self.data.get("model") if isinstance(self.data.get("model"), str) else None
-        )
         self._debug_log_request_payload()
 
         self.data, logging_obj = await self.common_processing_pre_call_logic(
@@ -918,14 +871,6 @@ async def base_process_llm_request(
             if callback_headers:
                 custom_headers.update(callback_headers)
 
-            # Preserve the original client-requested model (pre-alias mapping) for downstream
-            # streaming generators. Pre-call processing can rewrite `self.data["model"]` for
-            # aliasing/routing, but the OpenAI-compatible response `model` field should reflect
-            # what the client sent.
-            if requested_model_from_client:
-                self.data[
-                    "_litellm_client_requested_model"
-                ] = requested_model_from_client
             if route_type == "allm_passthrough_route":
                 # Check if response is an async generator
                 if self._is_streaming_response(response):
@@ -985,14 +930,11 @@ async def base_process_llm_request(
             data=self.data, user_api_key_dict=user_api_key_dict, response=response
         )
 
-        # Always return the client-requested model name (not provider-prefixed internal identifiers)
-        # for OpenAI-compatible responses.
-        if requested_model_from_client:
-            _override_openai_response_model(
-                response_obj=response,
-                requested_model=requested_model_from_client,
-                log_context=f"litellm_call_id={logging_obj.litellm_call_id}",
-            )
+        # Strip any internal provider prefixes from the response model field.
+        _override_openai_response_model(
+            response_obj=response,
+            log_context=f"litellm_call_id={logging_obj.litellm_call_id}",
+        )
 
         hidden_params = (
             getattr(response, "_hidden_params", {}) or {}
diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py
index 82cfd455be6..2e554f8241c 100644
--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@@ -98,6 +98,7 @@
     ModelResponseStream,
     TextCompletionResponse,
     TokenCountResponse,
+    LlmProvidersSet,
 )
 from litellm.utils import (
     _invalidate_model_cost_lowercase_map,
@@ -5229,64 +5230,43 @@ async def async_assistants_data_generator(
         yield f"data: {error_returned}\n\n"
 
 
-def _get_client_requested_model_for_streaming(request_data: dict) -> str:
-    """
-    Prefer the original client-requested model (pre-alias mapping) when available.
-
-    Pre-call processing can rewrite `request_data["model"]` for aliasing/routing purposes.
-    The OpenAI-compatible public `model` field should reflect what the client sent.
-    """
-    requested_model = request_data.get("_litellm_client_requested_model")
-    if isinstance(requested_model, str):
-        return requested_model
-
-    requested_model = request_data.get("model")
-    return requested_model if isinstance(requested_model, str) else ""
-
-
 def _restamp_streaming_chunk_model(
     *,
     chunk: Any,
-    requested_model_from_client: str,
     request_data: dict,
     model_mismatch_logged: bool,
 ) -> Tuple[Any, bool]:
-    # Always return the client-requested model name (not provider-prefixed internal identifiers)
-    # on streaming chunks.
-    #
-    # Note: This warning is intentionally verbose. A mismatch is a useful signal that an
-    # internal provider/deployment identifier is leaking into the public API, and helps
-    # maintainers/operators catch regressions while preserving OpenAI-compatible output.
-    if not requested_model_from_client or not isinstance(chunk, (BaseModel, dict)):
+    """Strip known provider prefixes from streaming chunk model field."""
+    if not isinstance(chunk, (BaseModel, dict)):
         return chunk, model_mismatch_logged
 
     downstream_model = (
         chunk.get("model") if isinstance(chunk, dict) else getattr(chunk, "model", None)
     )
-    if not model_mismatch_logged and downstream_model != requested_model_from_client:
+
+    if not downstream_model or not isinstance(downstream_model, str) or "/" not in downstream_model:
+        return chunk, model_mismatch_logged
+
+    prefix = downstream_model.split("/", 1)[0]
+    if prefix not in LlmProvidersSet:
+        return chunk, model_mismatch_logged
+
+    stripped = downstream_model.split("/", 1)[1]
+
+    if not model_mismatch_logged:
         verbose_proxy_logger.debug(
-            "litellm_call_id=%s: streaming chunk model mismatch - requested=%r downstream=%r. Overriding model to requested.",
-            request_data.get("litellm_call_id"),
-            requested_model_from_client,
-            downstream_model,
+            "litellm_call_id=%s: stripping provider prefix %r from chunk model %r",
+            request_data.get("litellm_call_id"), prefix, downstream_model,
         )
         model_mismatch_logged = True
 
     if isinstance(chunk, dict):
-        chunk["model"] = requested_model_from_client
-        return chunk, model_mismatch_logged
-
-    try:
-        setattr(chunk, "model", requested_model_from_client)
-    except Exception as e:
-        verbose_proxy_logger.error(
-            "litellm_call_id=%s: failed to override chunk.model=%r on chunk_type=%s. error=%s",
-            request_data.get("litellm_call_id"),
-            requested_model_from_client,
-            type(chunk),
-            str(e),
-            exc_info=True,
-        )
+        chunk["model"] = stripped
+    else:
+        try:
+            chunk.model = stripped
+        except Exception:
+            pass
 
     return chunk, model_mismatch_logged
 
@@ -5299,9 +5279,6 @@ async def async_data_generator(
         # Use a list to accumulate response segments to avoid O(n^2) string concatenation
         str_so_far_parts: list[str] = []
         error_message: Optional[str] = None
-        requested_model_from_client = _get_client_requested_model_for_streaming(
-            request_data=request_data
-        )
         model_mismatch_logged = False
         async for chunk in proxy_logging_obj.async_post_call_streaming_iterator_hook(
             user_api_key_dict=user_api_key_dict,
@@ -5322,7 +5299,6 @@ async def async_data_generator(
 
             chunk, model_mismatch_logged = _restamp_streaming_chunk_model(
                 chunk=chunk,
-                requested_model_from_client=requested_model_from_client,
                 request_data=request_data,
                 model_mismatch_logged=model_mismatch_logged,
             )
diff --git a/tests/test_litellm/proxy/test_response_model_sanitization.py b/tests/test_litellm/proxy/test_response_model_sanitization.py
index b1bb8d0ed39..2a9e1e395fe 100644
--- a/tests/test_litellm/proxy/test_response_model_sanitization.py
+++ b/tests/test_litellm/proxy/test_response_model_sanitization.py
@@ -213,5 +213,52 @@ async def _iterator_hook(
     assert first.startswith("data: ")
 
     payload = json.loads(first[len("data: ") :].strip())
-    assert payload["model"] == client_model_alias
+    assert payload["model"] == canonical_model
     assert not payload["model"].startswith("hosted_vllm/")
+
+
+def test_proxy_chat_completion_returns_actual_model_not_alias(tmp_path, monkeypatch):
+    """
+    Regression test for GitHub issue #21665:
+
+    Proxy should return actual model name, not the model_list alias.
+    """
+    alias_model = "default"
+    actual_model = "global.anthropic.claude-sonnet-4-5-20250929-v1:0"
+    internal_model = f"bedrock/{actual_model}"
+
+    client = _initialize_proxy_with_config(
+        config={
+            "general_settings": {"master_key": "sk-1234"},
+            "model_list": [
+                {
+                    "model_name": alias_model,
+                    "litellm_params": {"model": internal_model},
+                }
+            ],
+        },
+        tmp_path=tmp_path,
+    )
+
+    from litellm.proxy import proxy_server
+
+    monkeypatch.setattr(
+        proxy_server.llm_router,
+        "acompletion",
+        AsyncMock(return_value=_make_minimal_chat_completion_response(model=actual_model)),
+    )
+    monkeypatch.setattr(proxy_server.proxy_logging_obj, "during_call_hook", AsyncMock(return_value=None))
+    monkeypatch.setattr(proxy_server.proxy_logging_obj, "update_request_status", AsyncMock(return_value=None))
+    monkeypatch.setattr(proxy_server.proxy_logging_obj, "post_call_success_hook", AsyncMock(side_effect=lambda **kwargs: kwargs["response"]))
+
+    resp = client.post(
+        "/v1/chat/completions",
+        headers={"Authorization": "Bearer sk-1234"},
+        json={"model": alias_model, "messages": [{"role": "user", "content": "hi"}]},
+    )
+
+    assert resp.status_code == 200, resp.text
+    body = resp.json()
+    # Actual model name should be preserved, NOT the alias
+    assert body["model"] == actual_model
+    assert body["model"] != alias_model

From 746cb262c5385b2e2c2a898cb7924143a30c75a1 Mon Sep 17 00:00:00 2001
From: Prakhar Naval <prakhar@rockfish.ai>
Date: Sat, 21 Feb 2026 22:58:17 -0500
Subject: [PATCH 2/4] fix mypy attr-defined error on BaseModel.model

---
 litellm/proxy/common_request_processing.py | 2 +-
 litellm/proxy/proxy_server.py              | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/litellm/proxy/common_request_processing.py b/litellm/proxy/common_request_processing.py
index 8bbe9772353..1e5126c09f6 100644
--- a/litellm/proxy/common_request_processing.py
+++ b/litellm/proxy/common_request_processing.py
@@ -280,7 +280,7 @@ def _override_openai_response_model(
         response_obj["model"] = stripped
     else:
         try:
-            response_obj.model = stripped
+            setattr(response_obj, "model", stripped)
         except Exception:
             pass
 
diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py
index 2e554f8241c..251dc4f4ff5 100644
--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@@ -5264,7 +5264,7 @@ def _restamp_streaming_chunk_model(
         chunk["model"] = stripped
     else:
         try:
-            chunk.model = stripped
+            setattr(chunk, "model", stripped)
         except Exception:
             pass
 

From 3356a7648798237628b1ac4c25b58cfe341b686e Mon Sep 17 00:00:00 2001
From: Prakhar Naval <prakhar@rockfish.ai>
Date: Sat, 21 Feb 2026 23:05:59 -0500
Subject: [PATCH 3/4] update tests for new prefix stripping behavior

---
 .../proxy/test_common_request_processing.py   | 232 ++----------------
 .../proxy/test_response_model_sanitization.py |   7 +-
 2 files changed, 19 insertions(+), 220 deletions(-)

diff --git a/tests/test_litellm/proxy/test_common_request_processing.py b/tests/test_litellm/proxy/test_common_request_processing.py
index bf794478f10..0917efc2c6f 100644
--- a/tests/test_litellm/proxy/test_common_request_processing.py
+++ b/tests/test_litellm/proxy/test_common_request_processing.py
@@ -1131,225 +1131,29 @@ def test_extract_error_from_sse_chunk_with_minimal_error(self):
 
 
 class TestOverrideOpenAIResponseModel:
-    """Tests for _override_openai_response_model function"""
+    """Tests for _override_openai_response_model provider prefix stripping."""
 
-    def test_override_model_preserves_fallback_model_when_fallback_occurred_object(
-        self,
-    ):
-        """
-        Test that when a fallback occurred (x-litellm-attempted-fallbacks > 0),
-        the actual model used (fallback model) is preserved instead of being
-        overridden with the requested model.
-
-        This is the regression test to ensure the model being called is properly
-        displayed when a fallback happens.
-        """
-        requested_model = "gpt-4"
-        fallback_model = "gpt-3.5-turbo"
-
-        # Create a mock object response with fallback model
-        # _hidden_params is an attribute (not a dict key) accessed via getattr
+    def test_strips_known_provider_prefix_from_object(self):
         response_obj = MagicMock()
-        response_obj.model = fallback_model
-        response_obj._hidden_params = {
-            "additional_headers": {"x-litellm-attempted-fallbacks": 1}
-        }
-
-        # Call the function - should preserve fallback model
-        _override_openai_response_model(
-            response_obj=response_obj,
-            requested_model=requested_model,
-            log_context="test_context",
-        )
+        response_obj.model = "hosted_vllm/my-model"
+        _override_openai_response_model(response_obj=response_obj, log_context="test")
+        assert response_obj.model == "my-model"
 
-        # Verify the model was NOT overridden - should still be the fallback model
-        assert response_obj.model == fallback_model
-        assert response_obj.model != requested_model
+    def test_strips_known_provider_prefix_from_dict(self):
+        response_obj = {"model": "bedrock/anthropic.claude-v2"}
+        _override_openai_response_model(response_obj=response_obj, log_context="test")
+        assert response_obj["model"] == "anthropic.claude-v2"
 
-    def test_override_model_preserves_fallback_model_multiple_fallbacks(self):
-        """
-        Test that when multiple fallbacks occurred, the actual model used
-        (fallback model) is preserved.
-        """
-        requested_model = "gpt-4"
-        fallback_model = "claude-haiku-4-5-20251001"
-
-        # Create a mock object response with fallback model
+    def test_leaves_model_without_prefix_alone(self):
         response_obj = MagicMock()
-        response_obj.model = fallback_model
-        response_obj._hidden_params = {
-            "additional_headers": {
-                "x-litellm-attempted-fallbacks": 2  # Multiple fallbacks
-            }
-        }
-
-        # Call the function - should preserve fallback model
-        _override_openai_response_model(
-            response_obj=response_obj,
-            requested_model=requested_model,
-            log_context="test_context",
-        )
-
-        # Verify the model was NOT overridden - should still be the fallback model
-        assert response_obj.model == fallback_model
-        assert response_obj.model != requested_model
-
-    def test_override_model_overrides_when_no_fallback_dict(self):
-        """
-        Test that when no fallback occurred, the model is overridden
-        to match the requested model (dict response).
-        """
-        requested_model = "gpt-4"
-        downstream_model = "gpt-3.5-turbo"
-
-        # Create a dict response without fallback
-        # For dict responses, _hidden_params won't be found via getattr,
-        # so the fallback check won't trigger and model will be overridden
-        response_obj = {"model": downstream_model}
-
-        # Call the function - should override to requested model
-        _override_openai_response_model(
-            response_obj=response_obj,
-            requested_model=requested_model,
-            log_context="test_context",
-        )
-
-        # Verify the model WAS overridden to requested model
-        assert response_obj["model"] == requested_model
-
-    def test_override_model_overrides_when_no_fallback_object(self):
-        """
-        Test that when no fallback occurred (object response), the model is overridden
-        to match the requested model.
-        """
-        requested_model = "gpt-4"
-        downstream_model = "gpt-3.5-turbo"
-
-        # Create a mock object response without fallback
-        response_obj = MagicMock()
-        response_obj.model = downstream_model
-        response_obj._hidden_params = {
-            "additional_headers": {}  # No attempted_fallbacks header
-        }
-
-        # Call the function - should override to requested model
-        _override_openai_response_model(
-            response_obj=response_obj,
-            requested_model=requested_model,
-            log_context="test_context",
-        )
-
-        # Verify the model WAS overridden to requested model
-        assert response_obj.model == requested_model
-
-    def test_override_model_overrides_when_attempted_fallbacks_is_zero(self):
-        """
-        Test that when attempted_fallbacks is 0 (no fallback occurred),
-        the model is overridden to match the requested model.
-        """
-        requested_model = "gpt-4"
-        downstream_model = "gpt-3.5-turbo"
-
-        # Create a mock object response
-        response_obj = MagicMock()
-        response_obj.model = downstream_model
-        response_obj._hidden_params = {
-            "additional_headers": {
-                "x-litellm-attempted-fallbacks": 0  # Zero means no fallback occurred
-            }
-        }
-
-        # Call the function - should override to requested model
-        _override_openai_response_model(
-            response_obj=response_obj,
-            requested_model=requested_model,
-            log_context="test_context",
-        )
-
-        # Verify the model WAS overridden to requested model
-        assert response_obj.model == requested_model
-
-    def test_override_model_overrides_when_attempted_fallbacks_is_none(self):
-        """
-        Test that when attempted_fallbacks is None (not set),
-        the model is overridden to match the requested model.
-        """
-        requested_model = "gpt-4"
-        downstream_model = "gpt-3.5-turbo"
-
-        # Create a mock object response
-        response_obj = MagicMock()
-        response_obj.model = downstream_model
-        response_obj._hidden_params = {
-            "additional_headers": {"x-litellm-attempted-fallbacks": None}
-        }
-
-        # Call the function - should override to requested model
-        _override_openai_response_model(
-            response_obj=response_obj,
-            requested_model=requested_model,
-            log_context="test_context",
-        )
-
-        # Verify the model WAS overridden to requested model
-        assert response_obj.model == requested_model
-
-    def test_override_model_no_hidden_params(self):
-        """
-        Test that when _hidden_params is not present, the model is overridden
-        to match the requested model.
-        """
-        requested_model = "gpt-4"
-        downstream_model = "gpt-3.5-turbo"
-
-        # Create a mock object response without _hidden_params
-        response_obj = MagicMock()
-        response_obj.model = downstream_model
-        # Don't set _hidden_params - getattr will return {}
-
-        # Call the function - should override to requested model
-        _override_openai_response_model(
-            response_obj=response_obj,
-            requested_model=requested_model,
-            log_context="test_context",
-        )
-
-        # Verify the model WAS overridden to requested model
-        assert response_obj.model == requested_model
-
-    def test_override_model_no_requested_model(self):
-        """
-        Test that when requested_model is None or empty, the function returns early
-        without modifying the response.
-        """
-        fallback_model = "gpt-3.5-turbo"
-
-        # Create a mock object response
-        response_obj = MagicMock()
-        response_obj.model = fallback_model
-        response_obj._hidden_params = {
-            "additional_headers": {"x-litellm-attempted-fallbacks": 1}
-        }
-
-        # Call the function with None requested_model
-        _override_openai_response_model(
-            response_obj=response_obj,
-            requested_model=None,
-            log_context="test_context",
-        )
-
-        # Verify the model was not changed
-        assert response_obj.model == fallback_model
-
-        # Call with empty string
-        _override_openai_response_model(
-            response_obj=response_obj,
-            requested_model="",
-            log_context="test_context",
-        )
-
-        # Verify the model was not changed
-        assert response_obj.model == fallback_model
+        response_obj.model = "gpt-4"
+        _override_openai_response_model(response_obj=response_obj, log_context="test")
+        assert response_obj.model == "gpt-4"
+
+    def test_leaves_unknown_prefix_alone(self):
+        response_obj = {"model": "my-company/custom-model"}
+        _override_openai_response_model(response_obj=response_obj, log_context="test")
+        assert response_obj["model"] == "my-company/custom-model"
 
 
 class TestStreamingOverheadHeader:
diff --git a/tests/test_litellm/proxy/test_response_model_sanitization.py b/tests/test_litellm/proxy/test_response_model_sanitization.py
index 2a9e1e395fe..bdaa8159c55 100644
--- a/tests/test_litellm/proxy/test_response_model_sanitization.py
+++ b/tests/test_litellm/proxy/test_response_model_sanitization.py
@@ -166,11 +166,7 @@ async def _iterator_hook(
 @pytest.mark.asyncio
 async def test_proxy_streaming_chunks_use_client_requested_model_before_alias_mapping(monkeypatch):
     """
-    Regression test for alias mapping on streaming:
-
-    - `common_processing_pre_call_logic` can rewrite `request_data["model"]` via model_alias_map / key-specific aliases.
-    - Non-streaming responses are restamped using the original client-requested model (captured before the rewrite).
-    - Streaming chunks must do the same to avoid mismatched `model` values between streaming and non-streaming.
+    With alias mapping in play, streaming chunks report the prefix-stripped downstream model, not the client-requested alias.
     """
     client_model_alias = "alias-model"
     canonical_model = "vllm-model"
@@ -200,7 +196,6 @@ async def _iterator_hook(
         user_api_key_dict=user_api_key_dict,
         request_data={
             "model": canonical_model,
-            "_litellm_client_requested_model": client_model_alias,
         },
     )
 

From 423fcc22416acf605d11e1f6e63fdfc04cb1be68 Mon Sep 17 00:00:00 2001
From: Prakhar Naval <prakhar@rockfish.ai>
Date: Sat, 21 Feb 2026 23:17:52 -0500
Subject: [PATCH 4/4] add nested-slash test and debug logging on strip failures

---
 litellm/proxy/common_request_processing.py                 | 7 +++++--
 litellm/proxy/proxy_server.py                              | 7 +++++--
 tests/test_litellm/proxy/test_common_request_processing.py | 4 ++++
 3 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/litellm/proxy/common_request_processing.py b/litellm/proxy/common_request_processing.py
index 1e5126c09f6..2c7bb0c7414 100644
--- a/litellm/proxy/common_request_processing.py
+++ b/litellm/proxy/common_request_processing.py
@@ -281,8 +281,11 @@ def _override_openai_response_model(
     else:
         try:
             setattr(response_obj, "model", stripped)
-        except Exception:
-            pass
+        except Exception as e:
+            verbose_proxy_logger.debug(
+                "%s: failed to strip provider prefix on response.model, error=%s",
+                log_context, str(e),
+            )
 
 
 def _get_cost_breakdown_from_logging_obj(
diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py
index 251dc4f4ff5..961d78425f9 100644
--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@@ -5265,8 +5265,11 @@ def _restamp_streaming_chunk_model(
     else:
         try:
             setattr(chunk, "model", stripped)
-        except Exception:
-            pass
+        except Exception as e:
+            verbose_proxy_logger.debug(
+                "litellm_call_id=%s: failed to strip provider prefix on chunk.model, error=%s",
+                request_data.get("litellm_call_id"), str(e),
+            )
 
     return chunk, model_mismatch_logged
 
diff --git a/tests/test_litellm/proxy/test_common_request_processing.py b/tests/test_litellm/proxy/test_common_request_processing.py
index 0917efc2c6f..7cf5aa32981 100644
--- a/tests/test_litellm/proxy/test_common_request_processing.py
+++ b/tests/test_litellm/proxy/test_common_request_processing.py
@@ -1155,6 +1155,10 @@ def test_leaves_unknown_prefix_alone(self):
         _override_openai_response_model(response_obj=response_obj, log_context="test")
         assert response_obj["model"] == "my-company/custom-model"
 
+    def test_strips_prefix_preserving_nested_slashes(self):
+        response_obj = {"model": "groq/meta-llama/llama-4-maverick-17b-128e-instruct"}
+        _override_openai_response_model(response_obj=response_obj, log_context="test")
+        assert response_obj["model"] == "meta-llama/llama-4-maverick-17b-128e-instruct"
 
 class TestStreamingOverheadHeader:
     """