Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 65 additions & 0 deletions litellm/integrations/prometheus.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,18 @@ def __init__( # noqa: PLR0915
labelnames=self.get_labels_for_metric("litellm_deployment_state"),
)

self.litellm_deployment_tpm_limit = self._gauge_factory(
"litellm_deployment_tpm_limit",
"Deployment TPM limit found in config",
labelnames=self.get_labels_for_metric("litellm_deployment_tpm_limit"),
)

self.litellm_deployment_rpm_limit = self._gauge_factory(
"litellm_deployment_rpm_limit",
"Deployment RPM limit found in config",
labelnames=self.get_labels_for_metric("litellm_deployment_rpm_limit"),
)

self.litellm_deployment_cooled_down = self._counter_factory(
"litellm_deployment_cooled_down",
"LLM Deployment Analytics - Number of times a deployment has been cooled down by LiteLLM load balancing logic. exception_status is the status of the exception that caused the deployment to be cooled down",
Expand Down Expand Up @@ -1778,6 +1790,49 @@ def set_llm_deployment_failure_metrics(self, request_kwargs: dict):
)
)

def _set_deployment_tpm_rpm_limit_metrics(
    self,
    model_info: dict,
    litellm_params: dict,
    litellm_model_name: Optional[str],
    model_id: Optional[str],
    api_base: Optional[str],
    llm_provider: Optional[str],
):
    """
    Publish the configured TPM/RPM limits for a deployment as gauge metrics.

    Each limit is read from ``model_info`` first, falling back to
    ``litellm_params``; a gauge is only emitted when a limit is present.
    """
    # (config key, prometheus metric name, gauge instance) — one row per limit kind.
    limit_specs = (
        ("tpm", "litellm_deployment_tpm_limit", self.litellm_deployment_tpm_limit),
        ("rpm", "litellm_deployment_rpm_limit", self.litellm_deployment_rpm_limit),
    )

    for config_key, metric_name, gauge in limit_specs:
        limit_value = model_info.get(config_key) or litellm_params.get(config_key)
        if limit_value is None:
            continue
        label_values = prometheus_label_factory(
            supported_enum_labels=self.get_labels_for_metric(
                metric_name=metric_name
            ),
            enum_values=UserAPIKeyLabelValues(
                litellm_model_name=litellm_model_name,
                model_id=model_id,
                api_base=api_base,
                api_provider=llm_provider,
            ),
        )
        gauge.labels(**label_values).set(limit_value)

def set_llm_deployment_success_metrics(
self,
request_kwargs: dict,
Expand Down Expand Up @@ -1811,6 +1866,16 @@ def set_llm_deployment_success_metrics(
_model_info = _metadata.get("model_info") or {}
model_id = _model_info.get("id", None)

if _model_info or _litellm_params:
self._set_deployment_tpm_rpm_limit_metrics(
model_info=_model_info,
litellm_params=_litellm_params,
litellm_model_name=litellm_model_name,
model_id=model_id,
api_base=api_base,
llm_provider=llm_provider,
)

remaining_requests: Optional[int] = None
remaining_tokens: Optional[int] = None
if additional_headers := standard_logging_payload["hidden_params"][
Expand Down
9 changes: 7 additions & 2 deletions litellm/litellm_core_utils/get_litellm_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,11 @@ def get_litellm_params(
"text_completion": text_completion,
"azure_ad_token_provider": azure_ad_token_provider,
"user_continue_message": user_continue_message,
"base_model": base_model or (
_get_base_model_from_litellm_call_metadata(metadata=metadata) if metadata else None
"base_model": base_model
or (
_get_base_model_from_litellm_call_metadata(metadata=metadata)
if metadata
else None
),
"litellm_trace_id": litellm_trace_id,
"litellm_session_id": litellm_session_id,
Expand Down Expand Up @@ -139,5 +142,7 @@ def get_litellm_params(
"aws_sts_endpoint": kwargs.get("aws_sts_endpoint"),
"aws_external_id": kwargs.get("aws_external_id"),
"aws_bedrock_runtime_endpoint": kwargs.get("aws_bedrock_runtime_endpoint"),
"tpm": kwargs.get("tpm"),
"rpm": kwargs.get("rpm"),
}
return litellm_params
38 changes: 19 additions & 19 deletions litellm/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@
validate_and_fix_openai_messages,
validate_and_fix_openai_tools,
validate_chat_completion_tool_choice,
validate_openai_optional_params
validate_openai_optional_params,
)

from ._logging import verbose_logger
Expand Down Expand Up @@ -368,7 +368,7 @@ async def create(self, messages, model=None, **kwargs):

@tracer.wrap()
@client
async def acompletion( # noqa: PLR0915
async def acompletion( # noqa: PLR0915
model: str,
# Optional OpenAI params: see https://platform.openai.com/docs/api-reference/chat/create
messages: List = [],
Expand Down Expand Up @@ -603,12 +603,11 @@ async def acompletion( # noqa: PLR0915
if timeout is not None and isinstance(timeout, (int, float)):
timeout_value = float(timeout)
init_response = await asyncio.wait_for(
loop.run_in_executor(None, func_with_context),
timeout=timeout_value
loop.run_in_executor(None, func_with_context), timeout=timeout_value
)
else:
init_response = await loop.run_in_executor(None, func_with_context)

if isinstance(init_response, dict) or isinstance(
init_response, ModelResponse
): ## CACHING SCENARIO
Expand Down Expand Up @@ -640,6 +639,7 @@ async def acompletion( # noqa: PLR0915
except asyncio.TimeoutError:
custom_llm_provider = custom_llm_provider or "openai"
from litellm.exceptions import Timeout

raise Timeout(
message=f"Request timed out after {timeout} seconds",
model=model,
Expand Down Expand Up @@ -1118,7 +1118,6 @@ def completion( # type: ignore # noqa: PLR0915
# validate optional params
stop = validate_openai_optional_params(stop=stop)


######### unpacking kwargs #####################
args = locals()

Expand All @@ -1135,7 +1134,9 @@ def completion( # type: ignore # noqa: PLR0915
# Check if MCP tools are present (following responses pattern)
# Cast tools to Optional[Iterable[ToolParam]] for type checking
tools_for_mcp = cast(Optional[Iterable[ToolParam]], tools)
if LiteLLM_Proxy_MCP_Handler._should_use_litellm_mcp_gateway(tools=tools_for_mcp):
if LiteLLM_Proxy_MCP_Handler._should_use_litellm_mcp_gateway(
tools=tools_for_mcp
):
# Return coroutine - acompletion will await it
# completion() can return a coroutine when MCP tools are present, which acompletion() awaits
return acompletion_with_mcp( # type: ignore[return-value]
Expand Down Expand Up @@ -1536,6 +1537,8 @@ def completion( # type: ignore # noqa: PLR0915
max_retries=max_retries,
timeout=timeout,
litellm_request_debug=kwargs.get("litellm_request_debug", False),
tpm=kwargs.get("tpm"),
rpm=kwargs.get("rpm"),
)
cast(LiteLLMLoggingObj, logging).update_environment_variables(
model=model,
Expand Down Expand Up @@ -2361,11 +2364,7 @@ def completion( # type: ignore # noqa: PLR0915
input=messages, api_key=api_key, original_response=response
)
elif custom_llm_provider == "minimax":
api_key = (
api_key
or get_secret_str("MINIMAX_API_KEY")
or litellm.api_key
)
api_key = api_key or get_secret_str("MINIMAX_API_KEY") or litellm.api_key

api_base = (
api_base
Expand Down Expand Up @@ -2413,7 +2412,9 @@ def completion( # type: ignore # noqa: PLR0915
or custom_llm_provider == "wandb"
or custom_llm_provider == "clarifai"
or custom_llm_provider in litellm.openai_compatible_providers
or JSONProviderRegistry.exists(custom_llm_provider) # JSON-configured providers
or JSONProviderRegistry.exists(
custom_llm_provider
) # JSON-configured providers
or "ft:gpt-3.5-turbo" in model # finetune gpt-3.5-turbo
): # allow user to make an openai call with a custom base
# note: if a user sets a custom base - we should ensure this works
Expand Down Expand Up @@ -4724,7 +4725,7 @@ def embedding( # noqa: PLR0915

if headers is not None and headers != {}:
optional_params["extra_headers"] = headers

if encoding_format is not None:
optional_params["encoding_format"] = encoding_format
else:
Expand Down Expand Up @@ -6759,9 +6760,7 @@ def speech( # noqa: PLR0915
if text_to_speech_provider_config is None:
text_to_speech_provider_config = MinimaxTextToSpeechConfig()

minimax_config = cast(
MinimaxTextToSpeechConfig, text_to_speech_provider_config
)
minimax_config = cast(MinimaxTextToSpeechConfig, text_to_speech_provider_config)

if api_base is not None:
litellm_params_dict["api_base"] = api_base
Expand Down Expand Up @@ -6901,7 +6900,7 @@ async def ahealth_check(
custom_llm_provider_from_params = model_params.get("custom_llm_provider", None)
api_base_from_params = model_params.get("api_base", None)
api_key_from_params = model_params.get("api_key", None)

model, custom_llm_provider, _, _ = get_llm_provider(
model=model,
custom_llm_provider=custom_llm_provider_from_params,
Expand Down Expand Up @@ -7275,8 +7274,9 @@ def __getattr__(name: str) -> Any:
_encoding = tiktoken.get_encoding("cl100k_base")
# Cache it in the module's __dict__ for subsequent accesses
import sys

sys.modules[__name__].__dict__["encoding"] = _encoding
global _encoding_cache
_encoding_cache = _encoding
return _encoding
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
11 changes: 11 additions & 0 deletions litellm/types/integrations/prometheus.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,8 @@ class UserAPIKeyLabelNames(Enum):
"litellm_cache_hits_metric",
"litellm_cache_misses_metric",
"litellm_cached_tokens_metric",
"litellm_deployment_tpm_limit",
"litellm_deployment_rpm_limit",
"litellm_remaining_api_key_requests_for_model",
"litellm_remaining_api_key_tokens_for_model",
"litellm_llm_api_failed_requests_metric",
Expand Down Expand Up @@ -406,6 +408,15 @@ class PrometheusMetricLabels:
UserAPIKeyLabelNames.API_PROVIDER.value,
]

litellm_deployment_tpm_limit = [
UserAPIKeyLabelNames.v2_LITELLM_MODEL_NAME.value,
UserAPIKeyLabelNames.MODEL_ID.value,
UserAPIKeyLabelNames.API_BASE.value,
UserAPIKeyLabelNames.API_PROVIDER.value,
]

litellm_deployment_rpm_limit = litellm_deployment_tpm_limit

litellm_deployment_cooled_down = [
UserAPIKeyLabelNames.v2_LITELLM_MODEL_NAME.value,
UserAPIKeyLabelNames.MODEL_ID.value,
Expand Down
Loading