Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 62 additions & 0 deletions litellm/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -829,3 +829,65 @@ def __init__(
self.guardrail_name = guardrail_name
self.message = f"Blocked entity detected: {entity_type} by Guardrail: {guardrail_name}. This entity is not allowed to be used in this request."
super().__init__(self.message)


class MidStreamFallbackError(ServiceUnavailableError):  # type: ignore
    """Raised when a streaming response fails mid-stream (or just before it).

    Subclasses ServiceUnavailableError (HTTP 503) and additionally carries the
    content generated so far, so a fallback deployment can continue the
    response instead of restarting it from scratch.
    """

    def __init__(
        self,
        message: str,
        model: str,
        llm_provider: str,
        original_exception: Optional[Exception] = None,
        response: Optional[httpx.Response] = None,
        litellm_debug_info: Optional[str] = None,
        max_retries: Optional[int] = None,
        num_retries: Optional[int] = None,
        generated_content: str = "",
        is_pre_first_chunk: bool = False,
    ):
        # 503 so this error is routed like any other service outage.
        self.status_code = 503  # Service Unavailable
        self.message = f"litellm.MidStreamFallbackError: {message}"
        self.model = model
        self.llm_provider = llm_provider
        self.original_exception = original_exception
        self.litellm_debug_info = litellm_debug_info
        self.max_retries = max_retries
        self.num_retries = num_retries
        # Text that was already streamed to the caller before the failure.
        self.generated_content = generated_content
        # True when the failure happened before any chunk had been emitted.
        self.is_pre_first_chunk = is_pre_first_chunk

        # Synthesize a placeholder httpx.Response when none was supplied.
        if response is not None:
            self.response = response
        else:
            placeholder_request = httpx.Request(
                method="POST",
                url=f"https://{llm_provider}.com/v1/",
            )
            self.response = httpx.Response(
                status_code=self.status_code,
                request=placeholder_request,
            )

        # Delegate the remaining setup to ServiceUnavailableError.
        super().__init__(
            message=self.message,
            llm_provider=llm_provider,
            model=model,
            response=self.response,
            litellm_debug_info=self.litellm_debug_info,
            max_retries=self.max_retries,
            num_retries=self.num_retries,
        )

    def __str__(self):
        # Assemble the display string from the optional fragments.
        pieces = [self.message]
        if self.num_retries:
            pieces.append(f" LiteLLM Retried: {self.num_retries} times")
        if self.max_retries:
            pieces.append(f", LiteLLM Max Retries: {self.max_retries}")
        if self.original_exception:
            pieces.append(
                f" Original exception: {type(self.original_exception).__name__}: {str(self.original_exception)}"
            )
        return "".join(pieces)

    def __repr__(self):
        return self.__str__()
38 changes: 38 additions & 0 deletions litellm/litellm_core_utils/prompt_templates/common_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -822,3 +822,41 @@ def set_last_user_message(
messages.reverse()
messages.append({"role": "user", "content": content})
return messages


def convert_prefix_message_to_non_prefix_messages(
    messages: List[AllMessageValues],
) -> List[AllMessageValues]:
    """
    For models that don't support {prefix: true} in messages, convert each
    prefix message into a (system, assistant) pair that instructs the model
    to continue the previously generated content.

    {"role": "assistant", "content": "value", "prefix": true} -> [
        {
            "role": "system",
            "content": "You are a helpful assistant. ... continuation of this text: ",
        },
        {
            "role": "assistant",
            "content": "value",
        },
    ]

    Returns a NEW list; the input ``messages`` list is not mutated.
    Messages without a truthy "prefix" key are passed through unchanged.
    """
    new_messages: List[AllMessageValues] = []
    for message in messages:
        if message.get("prefix"):
            # Instruction telling the model to continue the prior content.
            new_messages.append(
                {
                    "role": "system",
                    "content": "You are a helpful assistant. You are given a message and you need to respond to it. You are also given a generated content. You need to respond to the message in continuation of the generated content. Do not repeat the same content. Your response should be in continuation of this text: ",
                }
            )
            # Re-emit the original message without the unsupported "prefix" key.
            new_messages.append(
                {k: v for k, v in message.items() if k != "prefix"}  # type: ignore
            )
        else:
            new_messages.append(message)
    return new_messages
30 changes: 21 additions & 9 deletions litellm/litellm_core_utils/streaming_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -940,8 +940,8 @@ def _optional_combine_thinking_block_in_choices(
and not self.sent_last_thinking_block
and model_response.choices[0].delta.content
):
model_response.choices[0].delta.content = (
"</think>" + (model_response.choices[0].delta.content or "")
model_response.choices[0].delta.content = "</think>" + (
model_response.choices[0].delta.content or ""
)
self.sent_last_thinking_block = True

Expand Down Expand Up @@ -1841,13 +1841,25 @@ async def __anext__(self): # noqa: PLR0915
self.logging_obj.async_failure_handler(e, traceback_exception) # type: ignore
)
## Map to OpenAI Exception
raise exception_type(
model=self.model,
custom_llm_provider=self.custom_llm_provider,
original_exception=e,
completion_kwargs={},
extra_kwargs={},
)
try:
exception_type(
model=self.model,
custom_llm_provider=self.custom_llm_provider,
original_exception=e,
completion_kwargs={},
extra_kwargs={},
)
except Exception as e:
from litellm.exceptions import MidStreamFallbackError

raise MidStreamFallbackError(
message=str(e),
model=self.model,
llm_provider=self.custom_llm_provider or "anthropic",
original_exception=e,
generated_content=self.response_uptil_now,
is_pre_first_chunk=not self.sent_first_chunk,
)
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: Exception Handling Fails to Map Errors

The exception handling logic is incorrect. The exception_type() call, which previously raised a mapped exception, is now merely called within a try block without being raised. This prevents the intended exception mapping and can mask the original error. Additionally, the inner except Exception as e: shadows the original exception variable, causing MidStreamFallbackError to be instantiated with the wrong original_exception and message (referring to the inner exception instead of the original one).

Locations (1)
Fix in Cursor Fix in Web

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: Exception Handling Refactor Causes Errors

The refactored exception handling introduces two issues:

  1. The exception_type() function is called, but its returned exception object is not raised. This can lead to the original exception being swallowed if exception_type() returns an exception instead of raising one.
  2. The inner except Exception as e: clause shadows the outer e variable, causing MidStreamFallbackError to incorrectly reference the exception from the exception_type() call (if it failed) as original_exception instead of the true original exception, obscuring the root cause.
Locations (1)
Fix in Cursor Fix in Web


@staticmethod
def _strip_sse_data_from_chunk(chunk: Optional[str]) -> Optional[str]:
Expand Down
2 changes: 1 addition & 1 deletion litellm/proxy/_new_secret_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,4 @@ model_list:
api_key: os.environ/OPENAI_API_KEY

router_settings:
model_group_alias: {"gpt-4o": "gpt-4o-mini-openai"}
model_group_alias: {"gpt-4o": "gpt-4o-mini-openai"}
Loading
Loading