Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 62 additions & 0 deletions litellm/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -829,3 +829,65 @@ def __init__(
self.guardrail_name = guardrail_name
self.message = f"Blocked entity detected: {entity_type} by Guardrail: {guardrail_name}. This entity is not allowed to be used in this request."
super().__init__(self.message)


class MidStreamFallbackError(ServiceUnavailableError):  # type: ignore
    """Raised when a streaming response fails mid-stream (or just before it).

    Subclasses ServiceUnavailableError (HTTP 503) and additionally carries the
    content generated so far, so a fallback deployment can continue the
    response instead of restarting it from scratch.
    """

    def __init__(
        self,
        message: str,
        model: str,
        llm_provider: str,
        original_exception: Optional[Exception] = None,
        response: Optional[httpx.Response] = None,
        litellm_debug_info: Optional[str] = None,
        max_retries: Optional[int] = None,
        num_retries: Optional[int] = None,
        generated_content: str = "",
        is_pre_first_chunk: bool = False,
    ):
        # 503 so this error is routed like any other service outage.
        self.status_code = 503  # Service Unavailable
        self.message = f"litellm.MidStreamFallbackError: {message}"
        self.model = model
        self.llm_provider = llm_provider
        self.original_exception = original_exception
        self.litellm_debug_info = litellm_debug_info
        self.max_retries = max_retries
        self.num_retries = num_retries
        # Text that was already streamed to the caller before the failure.
        self.generated_content = generated_content
        # True when the failure happened before any chunk had been emitted.
        self.is_pre_first_chunk = is_pre_first_chunk

        # Synthesize a placeholder httpx.Response when none was supplied.
        if response is not None:
            self.response = response
        else:
            placeholder_request = httpx.Request(
                method="POST",
                url=f"https://{llm_provider}.com/v1/",
            )
            self.response = httpx.Response(
                status_code=self.status_code,
                request=placeholder_request,
            )

        # Delegate the remaining setup to ServiceUnavailableError.
        super().__init__(
            message=self.message,
            llm_provider=llm_provider,
            model=model,
            response=self.response,
            litellm_debug_info=self.litellm_debug_info,
            max_retries=self.max_retries,
            num_retries=self.num_retries,
        )

    def __str__(self):
        # Assemble the display string from the optional fragments.
        pieces = [self.message]
        if self.num_retries:
            pieces.append(f" LiteLLM Retried: {self.num_retries} times")
        if self.max_retries:
            pieces.append(f", LiteLLM Max Retries: {self.max_retries}")
        if self.original_exception:
            pieces.append(
                f" Original exception: {type(self.original_exception).__name__}: {str(self.original_exception)}"
            )
        return "".join(pieces)

    def __repr__(self):
        return self.__str__()
38 changes: 38 additions & 0 deletions litellm/litellm_core_utils/prompt_templates/common_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -822,3 +822,41 @@ def set_last_user_message(
messages.reverse()
messages.append({"role": "user", "content": content})
return messages


def convert_prefix_message_to_non_prefix_messages(
    messages: List[AllMessageValues],
) -> List[AllMessageValues]:
    """
    For models that don't support {prefix: true} in messages, convert each
    prefix message into a (system, assistant) pair that instructs the model
    to continue the previously generated content.

    {"role": "assistant", "content": "value", "prefix": true} -> [
        {
            "role": "system",
            "content": "You are a helpful assistant. ... continuation of this text: ",
        },
        {
            "role": "assistant",
            "content": "value",
        },
    ]

    Returns a NEW list; the input ``messages`` list is not mutated.
    Messages without a truthy "prefix" key are passed through unchanged.
    """
    new_messages: List[AllMessageValues] = []
    for message in messages:
        if message.get("prefix"):
            # Instruction telling the model to continue the prior content.
            new_messages.append(
                {
                    "role": "system",
                    "content": "You are a helpful assistant. You are given a message and you need to respond to it. You are also given a generated content. You need to respond to the message in continuation of the generated content. Do not repeat the same content. Your response should be in continuation of this text: ",
                }
            )
            # Re-emit the original message without the unsupported "prefix" key.
            new_messages.append(
                {k: v for k, v in message.items() if k != "prefix"}  # type: ignore
            )
        else:
            new_messages.append(message)
    return new_messages
30 changes: 21 additions & 9 deletions litellm/litellm_core_utils/streaming_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -940,8 +940,8 @@ def _optional_combine_thinking_block_in_choices(
and not self.sent_last_thinking_block
and model_response.choices[0].delta.content
):
model_response.choices[0].delta.content = (
"</think>" + (model_response.choices[0].delta.content or "")
model_response.choices[0].delta.content = "</think>" + (
model_response.choices[0].delta.content or ""
)
self.sent_last_thinking_block = True

Expand Down Expand Up @@ -1841,13 +1841,25 @@ async def __anext__(self): # noqa: PLR0915
self.logging_obj.async_failure_handler(e, traceback_exception) # type: ignore
)
## Map to OpenAI Exception
raise exception_type(
model=self.model,
custom_llm_provider=self.custom_llm_provider,
original_exception=e,
completion_kwargs={},
extra_kwargs={},
)
try:
exception_type(
model=self.model,
custom_llm_provider=self.custom_llm_provider,
original_exception=e,
completion_kwargs={},
extra_kwargs={},
)
except Exception as e:
from litellm.exceptions import MidStreamFallbackError

raise MidStreamFallbackError(
message=str(e),
model=self.model,
llm_provider=self.custom_llm_provider or "anthropic",
original_exception=e,
generated_content=self.response_uptil_now,
is_pre_first_chunk=not self.sent_first_chunk,
)
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: Exception Handling Fails to Map Errors

The exception handling logic is incorrect. The exception_type() call, which previously raised a mapped exception, is now merely called within a try block without being raised. This prevents the intended exception mapping and can mask the original error. Additionally, the inner except Exception as e: shadows the original exception variable, causing MidStreamFallbackError to be instantiated with the wrong original_exception and message (referring to the inner exception instead of the original one).

Locations (1)
Fix in Cursor Fix in Web

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: Exception Handling Refactor Causes Errors

The refactored exception handling introduces two issues:

  1. The exception_type() function is called, but its returned exception object is not raised. This can lead to the original exception being swallowed if exception_type() returns an exception instead of raising one.
  2. The inner except Exception as e: clause shadows the outer e variable, causing MidStreamFallbackError to incorrectly reference the exception from the exception_type() call (if it failed) as original_exception instead of the true original exception, obscuring the root cause.
Locations (1)
Fix in Cursor Fix in Web


@staticmethod
def _strip_sse_data_from_chunk(chunk: Optional[str]) -> Optional[str]:
Expand Down
2 changes: 1 addition & 1 deletion litellm/proxy/_new_secret_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,4 @@ model_list:
api_key: os.environ/OPENAI_API_KEY

router_settings:
model_group_alias: {"gpt-4o": "gpt-4o-mini-openai"}
model_group_alias: {"gpt-4o": "gpt-4o-mini-openai"}
Loading
Loading