Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 41 additions & 0 deletions docs/my-website/docs/proxy/call_hooks.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import Image from '@theme/IdealImage';
| `async_post_call_success_hook` | Modify outgoing response (non-streaming) | After successful LLM API call, for non-streaming responses |
| `async_post_call_failure_hook` | Transform error responses sent to clients | After failed LLM API call |
| `async_post_call_streaming_hook` | Modify outgoing response (streaming) | After successful LLM API call, for streaming responses |
| `async_post_call_response_headers_hook` | Inject custom HTTP response headers | After LLM API call (both success and failure) |

See a complete example with our [parallel request rate limiter](https://github.com/BerriAI/litellm/blob/main/litellm/proxy/hooks/parallel_request_limiter.py)

Expand Down Expand Up @@ -115,6 +116,18 @@ class MyCustomHandler(CustomLogger): # https://docs.litellm.ai/docs/observabilit
async for item in response:
yield item

async def async_post_call_response_headers_hook(
self,
data: dict,
user_api_key_dict: UserAPIKeyAuth,
response: Any,
request_headers: Optional[Dict[str, str]] = None,
) -> Optional[Dict[str, str]]:
"""
Inject custom headers into HTTP response (runs for both success and failure).
"""
return {"x-custom-header": "custom-value"}

proxy_handler_instance = MyCustomHandler()
```

Expand Down Expand Up @@ -389,3 +402,31 @@ proxy_handler_instance = MyErrorTransformer()
```

**Result:** Clients receive `"Your prompt is too long..."` instead of `"ContextWindowExceededError: Prompt exceeds context window"`.

## Advanced - Inject Custom HTTP Response Headers

Use `async_post_call_response_headers_hook` to inject custom HTTP headers into responses. This hook runs for **both successful and failed** LLM API calls.

```python
from litellm.integrations.custom_logger import CustomLogger
from litellm.proxy.proxy_server import UserAPIKeyAuth
from typing import Any, Dict, Optional

class CustomHeaderLogger(CustomLogger):
def __init__(self):
super().__init__()

async def async_post_call_response_headers_hook(
self,
data: dict,
user_api_key_dict: UserAPIKeyAuth,
response: Any,
request_headers: Optional[Dict[str, str]] = None,
) -> Optional[Dict[str, str]]:
"""
Inject custom headers into all responses (success and failure).
"""
return {"x-custom-header": "custom-value"}

proxy_handler_instance = CustomHeaderLogger()
```
22 changes: 22 additions & 0 deletions litellm/integrations/custom_logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,28 @@ async def async_pre_call_hook(
]: # raise exception if invalid, return a str for the user to receive - if rejected, or return a modified dictionary for passing into litellm
pass

async def async_post_call_response_headers_hook(
    self,
    data: dict,
    user_api_key_dict: UserAPIKeyAuth,
    response: Any,
    request_headers: Optional[Dict[str, str]] = None,
) -> Optional[Dict[str, str]]:
    """
    Hook invoked after every LLM API call — success or failure — so a custom
    logger can attach extra HTTP headers to the proxy's outgoing response.

    Args:
        data: The incoming request payload.
        user_api_key_dict: Auth/metadata for the calling API key.
        response: The LLM response object; ``None`` when the call failed.
        request_headers: Headers from the original client request, if available.

    Returns:
        A mapping of header name -> value to merge into the HTTP response,
        or ``None`` to leave the response headers untouched.
    """
    # Base implementation is a deliberate no-op; subclasses override this
    # to inject headers.
    return None

async def async_post_call_failure_hook(
self,
request_data: dict,
Expand Down
24 changes: 19 additions & 5 deletions litellm/model_prices_and_context_window_backup.json
Original file line number Diff line number Diff line change
Expand Up @@ -3726,9 +3726,9 @@
"cache_read_input_token_cost": 1.75e-07,
"input_cost_per_token": 1.75e-06,
"litellm_provider": "azure",
"max_input_tokens": 128000,
"max_output_tokens": 16384,
"max_tokens": 16384,
"max_input_tokens": 272000,
"max_output_tokens": 128000,
"max_tokens": 128000,
"mode": "responses",
"output_cost_per_token": 1.4e-05,
"supported_endpoints": [
Expand Down Expand Up @@ -18799,7 +18799,7 @@
"input_cost_per_token": 1.75e-06,
"input_cost_per_token_priority": 3.5e-06,
"litellm_provider": "openai",
"max_input_tokens": 400000,
"max_input_tokens": 272000,
"max_output_tokens": 128000,
"max_tokens": 128000,
"mode": "responses",
Expand Down Expand Up @@ -23790,6 +23790,20 @@
"output_cost_per_token": 6.5e-07,
"supports_tool_choice": true
},
"openrouter/moonshotai/kimi-k2.5": {
"cache_read_input_token_cost": 1e-07,
"input_cost_per_token": 6e-07,
"litellm_provider": "openrouter",
"max_input_tokens": 262144,
"max_output_tokens": 262144,
"max_tokens": 262144,
"mode": "chat",
"output_cost_per_token": 3e-06,
"source": "https://openrouter.ai/moonshotai/kimi-k2.5",
"supports_function_calling": true,
"supports_tool_choice": true,
"supports_vision": true
},
"openrouter/nousresearch/nous-hermes-llama2-13b": {
"input_cost_per_token": 2e-07,
"litellm_provider": "openrouter",
Expand Down Expand Up @@ -24003,7 +24017,7 @@
"cache_read_input_token_cost": 1.75e-07,
"input_cost_per_token": 1.75e-06,
"litellm_provider": "openrouter",
"max_input_tokens": 400000,
"max_input_tokens": 272000,
"max_output_tokens": 128000,
"max_tokens": 128000,
"mode": "responses",
Expand Down
31 changes: 31 additions & 0 deletions litellm/proxy/common_request_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -804,6 +804,15 @@ async def base_process_llm_request(
**additional_headers,
)

# Call response headers hook for streaming success
callback_headers = await proxy_logging_obj.post_call_response_headers_hook(
data=self.data,
user_api_key_dict=user_api_key_dict,
response=response,
)
if callback_headers:
custom_headers.update(callback_headers)

# Preserve the original client-requested model (pre-alias mapping) for downstream
# streaming generators. Pre-call processing can rewrite `self.data["model"]` for
# aliasing/routing, but the OpenAI-compatible response `model` field should reflect
Expand Down Expand Up @@ -900,6 +909,16 @@ async def base_process_llm_request(
**additional_headers,
)
)

# Call response headers hook for non-streaming success
callback_headers = await proxy_logging_obj.post_call_response_headers_hook(
data=self.data,
user_api_key_dict=user_api_key_dict,
response=response,
)
if callback_headers:
fastapi_response.headers.update(callback_headers)

await check_response_size_is_safe(response=response)

return response
Expand Down Expand Up @@ -1058,6 +1077,18 @@ async def _handle_llm_api_exception(
headers = get_response_headers(dict(_response_headers))
headers.update(custom_headers)

# Call response headers hook for failure
try:
callback_headers = await proxy_logging_obj.post_call_response_headers_hook(
data=self.data,
user_api_key_dict=user_api_key_dict,
response=None,
)
if callback_headers:
headers.update(callback_headers)
except Exception:
pass

if isinstance(e, HTTPException):
raise ProxyException(
message=getattr(e, "detail", str(e)),
Expand Down
40 changes: 40 additions & 0 deletions litellm/proxy/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1808,6 +1808,46 @@ async def post_call_success_hook(
raise e
return response

async def post_call_response_headers_hook(
    self,
    data: dict,
    user_api_key_dict: UserAPIKeyAuth,
    response: Any,
    request_headers: Optional[Dict[str, str]] = None,
) -> Dict[str, str]:
    """
    Calls async_post_call_response_headers_hook on all CustomLogger callbacks.
    Merges all returned header dicts (later callbacks override earlier ones).

    A failure in one callback is logged and skipped, so it cannot prevent the
    remaining callbacks from contributing their headers.

    Args:
        data: The request data.
        user_api_key_dict: The user API key dictionary.
        response: The response object (None for failure cases).
        request_headers: The original client request headers, if available.

    Returns:
        Dict[str, str]: Merged headers from all callbacks.
    """
    merged_headers: Dict[str, str] = {}
    for callback in litellm.callbacks:
        # Exception handling is per-callback (not around the whole loop) so a
        # single misbehaving hook does not drop headers from later callbacks.
        try:
            _callback: Optional[CustomLogger] = None
            if isinstance(callback, str):
                # String entries (e.g. "langfuse") resolve to their
                # CustomLogger-compatible class instance.
                _callback = litellm.litellm_core_utils.litellm_logging.get_custom_logger_compatible_class(
                    cast(_custom_logger_compatible_callbacks_literal, callback)
                )
            else:
                _callback = callback  # type: ignore

            if _callback is not None and isinstance(_callback, CustomLogger):
                result = await _callback.async_post_call_response_headers_hook(
                    data=data,
                    user_api_key_dict=user_api_key_dict,
                    response=response,
                    request_headers=request_headers,
                )
                if result is not None:
                    merged_headers.update(result)
        except Exception as e:
            verbose_proxy_logger.exception(
                "Error in post_call_response_headers_hook: %s", str(e)
            )
    return merged_headers

async def async_post_call_streaming_hook(
self,
data: dict,
Expand Down
Loading
Loading