diff --git a/litellm/llms/azure/azure.py b/litellm/llms/azure/azure.py
index ec4553fac4f..3ef0186ba0e 100644
--- a/litellm/llms/azure/azure.py
+++ b/litellm/llms/azure/azure.py
@@ -664,8 +664,28 @@ async def aembedding(
             **data, timeout=timeout
         )
         headers = dict(raw_response.headers)
-        response = raw_response.parse()
+
+        # Convert json.JSONDecodeError to AzureOpenAIError for two critical reasons:
+        #
+        # 1. ROUTER BEHAVIOR: The router relies on exception.status_code to determine cooldown logic:
+        #    - JSONDecodeError has no status_code → router skips cooldown evaluation
+        #    - AzureOpenAIError has status_code → router properly evaluates for cooldown
+        #
+        # 2. CONNECTION CLEANUP: When response.parse() throws JSONDecodeError, the response
+        #    body may not be fully consumed, preventing httpx from properly returning the
+        #    connection to the pool. By catching the exception and accessing raw_response.status_code,
+        #    we trigger httpx's internal cleanup logic. Without this:
+        #    - parse() fails → JSONDecodeError bubbles up → httpx never knows response was acknowledged → connection leak
+        #    This completely eliminates "Unclosed connection" warnings during high load.
+        try:
+            response = raw_response.parse()
+        except json.JSONDecodeError as json_error:
+            raise AzureOpenAIError(
+                status_code=raw_response.status_code or 500,
+                message=f"Failed to parse raw Azure embedding response: {str(json_error)}"
+            ) from json_error
+
         stringified_response = response.model_dump()
         ## LOGGING
         logging_obj.post_call(
             input=input,