diff --git a/litellm/integrations/prometheus.py b/litellm/integrations/prometheus.py
index abae36bf55..6772520bd5 100644
--- a/litellm/integrations/prometheus.py
+++ b/litellm/integrations/prometheus.py
@@ -316,6 +316,18 @@ def __init__( # noqa: PLR0915
             labelnames=self.get_labels_for_metric("litellm_deployment_state"),
         )
 
+        self.litellm_deployment_tpm_limit = self._gauge_factory(
+            "litellm_deployment_tpm_limit",
+            "Deployment TPM limit found in config",
+            labelnames=self.get_labels_for_metric("litellm_deployment_tpm_limit"),
+        )
+
+        self.litellm_deployment_rpm_limit = self._gauge_factory(
+            "litellm_deployment_rpm_limit",
+            "Deployment RPM limit found in config",
+            labelnames=self.get_labels_for_metric("litellm_deployment_rpm_limit"),
+        )
+
         self.litellm_deployment_cooled_down = self._counter_factory(
             "litellm_deployment_cooled_down",
             "LLM Deployment Analytics - Number of times a deployment has been cooled down by LiteLLM load balancing logic. exception_status is the status of the exception that caused the deployment to be cooled down",
@@ -1778,6 +1790,60 @@ def set_llm_deployment_failure_metrics(self, request_kwargs: dict):
                 )
             )
 
+    def _set_deployment_tpm_rpm_limit_metrics(
+        self,
+        model_info: dict,
+        litellm_params: dict,
+        litellm_model_name: Optional[str],
+        model_id: Optional[str],
+        api_base: Optional[str],
+        llm_provider: Optional[str],
+    ):
+        """
+        Publish the deployment's configured TPM and RPM limits as gauges.
+
+        Each limit is read from `model_info` first and falls back to
+        `litellm_params` only when `model_info` does not define it. A limit
+        that is None in both places leaves its gauge untouched.
+
+        Args:
+            model_info: deployment `model_info`; may carry "tpm" / "rpm".
+            litellm_params: deployment `litellm_params`; fallback source.
+            litellm_model_name, model_id, api_base, llm_provider: passed to
+                `UserAPIKeyLabelValues` to build the gauge labels
+                (`llm_provider` is sent as `api_provider`).
+        """
+        # Tolerate a missing dict so an absent source cannot raise here.
+        model_info = model_info or {}
+        litellm_params = litellm_params or {}
+
+        # Each gauge attribute on self is named after its metric.
+        limit_metrics = (
+            ("tpm", "litellm_deployment_tpm_limit"),
+            ("rpm", "litellm_deployment_rpm_limit"),
+        )
+        for limit_key, metric_name in limit_metrics:
+            # Compare against None rather than using `or`: an explicit 0 is
+            # a configured value and must not be treated as unset.
+            limit = model_info.get(limit_key)
+            if limit is None:
+                limit = litellm_params.get(limit_key)
+            if limit is None:
+                continue
+
+            _labels = prometheus_label_factory(
+                supported_enum_labels=self.get_labels_for_metric(
+                    metric_name=metric_name
+                ),
+                enum_values=UserAPIKeyLabelValues(
+                    litellm_model_name=litellm_model_name,
+                    model_id=model_id,
+                    api_base=api_base,
+                    api_provider=llm_provider,
+                ),
+            )
+            getattr(self, metric_name).labels(**_labels).set(limit)
+
     def set_llm_deployment_success_metrics(
         self,
         request_kwargs: dict,
@@ -1811,6 +1877,16 @@ def set_llm_deployment_success_metrics(
         _model_info = _metadata.get("model_info") or {}
         model_id = _model_info.get("id", None)
 
+        if _model_info or _litellm_params:
+            self._set_deployment_tpm_rpm_limit_metrics(
+                model_info=_model_info,
+                litellm_params=_litellm_params,
+                litellm_model_name=litellm_model_name,
+                model_id=model_id,
+                api_base=api_base,
+                llm_provider=llm_provider,
+            )
+
         remaining_requests: Optional[int] = None
         remaining_tokens: Optional[int] = None
         if additional_headers := standard_logging_payload["hidden_params"][
diff --git a/litellm/litellm_core_utils/get_litellm_params.py b/litellm/litellm_core_utils/get_litellm_params.py
index e290101f8b..060e98fd49 100644
--- a/litellm/litellm_core_utils/get_litellm_params.py
+++ b/litellm/litellm_core_utils/get_litellm_params.py
@@ -93,8 +93,11 @@ def get_litellm_params(
         "text_completion": text_completion,
         "azure_ad_token_provider": azure_ad_token_provider,
         "user_continue_message": user_continue_message,
-        "base_model": base_model or (
-            _get_base_model_from_litellm_call_metadata(metadata=metadata) if metadata else None
+        "base_model": base_model
+        or (
+            _get_base_model_from_litellm_call_metadata(metadata=metadata)
+            if metadata
+            else None
         ),
         "litellm_trace_id": litellm_trace_id,
         "litellm_session_id": litellm_session_id,
@@ -139,5 +142,7 @@ def get_litellm_params(
         "aws_sts_endpoint": kwargs.get("aws_sts_endpoint"),
         "aws_external_id": kwargs.get("aws_external_id"),
         "aws_bedrock_runtime_endpoint": kwargs.get("aws_bedrock_runtime_endpoint"),
+        "tpm": kwargs.get("tpm"),
+        "rpm": kwargs.get("rpm"),
     }
     return
litellm_params diff --git a/litellm/main.py b/litellm/main.py index ce84c8988e..a4bcfdec81 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -148,7 +148,7 @@ validate_and_fix_openai_messages, validate_and_fix_openai_tools, validate_chat_completion_tool_choice, - validate_openai_optional_params + validate_openai_optional_params, ) from ._logging import verbose_logger @@ -368,7 +368,7 @@ async def create(self, messages, model=None, **kwargs): @tracer.wrap() @client -async def acompletion( # noqa: PLR0915 +async def acompletion( # noqa: PLR0915 model: str, # Optional OpenAI params: see https://platform.openai.com/docs/api-reference/chat/create messages: List = [], @@ -603,12 +603,11 @@ async def acompletion( # noqa: PLR0915 if timeout is not None and isinstance(timeout, (int, float)): timeout_value = float(timeout) init_response = await asyncio.wait_for( - loop.run_in_executor(None, func_with_context), - timeout=timeout_value + loop.run_in_executor(None, func_with_context), timeout=timeout_value ) else: init_response = await loop.run_in_executor(None, func_with_context) - + if isinstance(init_response, dict) or isinstance( init_response, ModelResponse ): ## CACHING SCENARIO @@ -640,6 +639,7 @@ async def acompletion( # noqa: PLR0915 except asyncio.TimeoutError: custom_llm_provider = custom_llm_provider or "openai" from litellm.exceptions import Timeout + raise Timeout( message=f"Request timed out after {timeout} seconds", model=model, @@ -1118,7 +1118,6 @@ def completion( # type: ignore # noqa: PLR0915 # validate optional params stop = validate_openai_optional_params(stop=stop) - ######### unpacking kwargs ##################### args = locals() @@ -1135,7 +1134,9 @@ def completion( # type: ignore # noqa: PLR0915 # Check if MCP tools are present (following responses pattern) # Cast tools to Optional[Iterable[ToolParam]] for type checking tools_for_mcp = cast(Optional[Iterable[ToolParam]], tools) - if 
LiteLLM_Proxy_MCP_Handler._should_use_litellm_mcp_gateway(tools=tools_for_mcp): + if LiteLLM_Proxy_MCP_Handler._should_use_litellm_mcp_gateway( + tools=tools_for_mcp + ): # Return coroutine - acompletion will await it # completion() can return a coroutine when MCP tools are present, which acompletion() awaits return acompletion_with_mcp( # type: ignore[return-value] @@ -1536,6 +1537,8 @@ def completion( # type: ignore # noqa: PLR0915 max_retries=max_retries, timeout=timeout, litellm_request_debug=kwargs.get("litellm_request_debug", False), + tpm=kwargs.get("tpm"), + rpm=kwargs.get("rpm"), ) cast(LiteLLMLoggingObj, logging).update_environment_variables( model=model, @@ -2361,11 +2364,7 @@ def completion( # type: ignore # noqa: PLR0915 input=messages, api_key=api_key, original_response=response ) elif custom_llm_provider == "minimax": - api_key = ( - api_key - or get_secret_str("MINIMAX_API_KEY") - or litellm.api_key - ) + api_key = api_key or get_secret_str("MINIMAX_API_KEY") or litellm.api_key api_base = ( api_base @@ -2413,7 +2412,9 @@ def completion( # type: ignore # noqa: PLR0915 or custom_llm_provider == "wandb" or custom_llm_provider == "clarifai" or custom_llm_provider in litellm.openai_compatible_providers - or JSONProviderRegistry.exists(custom_llm_provider) # JSON-configured providers + or JSONProviderRegistry.exists( + custom_llm_provider + ) # JSON-configured providers or "ft:gpt-3.5-turbo" in model # finetune gpt-3.5-turbo ): # allow user to make an openai call with a custom base # note: if a user sets a custom base - we should ensure this works @@ -4724,7 +4725,7 @@ def embedding( # noqa: PLR0915 if headers is not None and headers != {}: optional_params["extra_headers"] = headers - + if encoding_format is not None: optional_params["encoding_format"] = encoding_format else: @@ -6759,9 +6760,7 @@ def speech( # noqa: PLR0915 if text_to_speech_provider_config is None: text_to_speech_provider_config = MinimaxTextToSpeechConfig() - minimax_config = cast( - 
MinimaxTextToSpeechConfig, text_to_speech_provider_config - ) + minimax_config = cast(MinimaxTextToSpeechConfig, text_to_speech_provider_config) if api_base is not None: litellm_params_dict["api_base"] = api_base @@ -6901,7 +6900,7 @@ async def ahealth_check( custom_llm_provider_from_params = model_params.get("custom_llm_provider", None) api_base_from_params = model_params.get("api_base", None) api_key_from_params = model_params.get("api_key", None) - + model, custom_llm_provider, _, _ = get_llm_provider( model=model, custom_llm_provider=custom_llm_provider_from_params, @@ -7275,8 +7274,9 @@ def __getattr__(name: str) -> Any: _encoding = tiktoken.get_encoding("cl100k_base") # Cache it in the module's __dict__ for subsequent accesses import sys + sys.modules[__name__].__dict__["encoding"] = _encoding global _encoding_cache _encoding_cache = _encoding return _encoding - raise AttributeError(f"module {__name__!r} has no attribute {name!r}") \ No newline at end of file + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/litellm/types/integrations/prometheus.py b/litellm/types/integrations/prometheus.py index ea9c9bd325..ee49ba1a19 100644 --- a/litellm/types/integrations/prometheus.py +++ b/litellm/types/integrations/prometheus.py @@ -199,6 +199,8 @@ class UserAPIKeyLabelNames(Enum): "litellm_cache_hits_metric", "litellm_cache_misses_metric", "litellm_cached_tokens_metric", + "litellm_deployment_tpm_limit", + "litellm_deployment_rpm_limit", "litellm_remaining_api_key_requests_for_model", "litellm_remaining_api_key_tokens_for_model", "litellm_llm_api_failed_requests_metric", @@ -406,6 +408,15 @@ class PrometheusMetricLabels: UserAPIKeyLabelNames.API_PROVIDER.value, ] + litellm_deployment_tpm_limit = [ + UserAPIKeyLabelNames.v2_LITELLM_MODEL_NAME.value, + UserAPIKeyLabelNames.MODEL_ID.value, + UserAPIKeyLabelNames.API_BASE.value, + UserAPIKeyLabelNames.API_PROVIDER.value, + ] + + litellm_deployment_rpm_limit = litellm_deployment_tpm_limit 
+ litellm_deployment_cooled_down = [ UserAPIKeyLabelNames.v2_LITELLM_MODEL_NAME.value, UserAPIKeyLabelNames.MODEL_ID.value,