Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 95 additions & 0 deletions docs/my-website/docs/providers/aws_sagemaker.md
Original file line number Diff line number Diff line change
Expand Up @@ -526,3 +526,98 @@ print(f"response: {response}")
```



## Nova Models on SageMaker

LiteLLM supports Amazon Nova models (Nova Micro, Nova Lite, Nova 2 Lite) deployed on SageMaker Inference real-time endpoints. These custom/fine-tuned Nova models use an OpenAI-compatible API format.

**Reference:** [AWS Blog - Amazon SageMaker Inference for Custom Amazon Nova Models](https://aws.amazon.com/blogs/aws/announcing-amazon-sagemaker-inference-for-custom-amazon-nova-models/)

### Usage

Use the `sagemaker_nova/` prefix with your SageMaker endpoint name:

```python
import litellm
import os

os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = "us-east-1"

# Basic chat completion
response = litellm.completion(
model="sagemaker_nova/my-nova-endpoint",
messages=[{"role": "user", "content": "Hello, how are you?"}],
temperature=0.7,
max_tokens=512,
)
print(response.choices[0].message.content)
```

### Streaming

```python
response = litellm.completion(
model="sagemaker_nova/my-nova-endpoint",
messages=[{"role": "user", "content": "Write a short poem"}],
stream=True,
stream_options={"include_usage": True},
)
for chunk in response:
if chunk.choices[0].delta.content:
print(chunk.choices[0].delta.content, end="")
```

### Multimodal (Images)

Nova models on SageMaker support image inputs using base64 data URIs:

```python
response = litellm.completion(
model="sagemaker_nova/my-nova-endpoint",
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "What's in this image?"},
{"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,..."}}
]
}
],
)
```

### Proxy Config

```yaml
model_list:
- model_name: nova-micro
litellm_params:
model: sagemaker_nova/my-nova-micro-endpoint
aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID
aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY
aws_region_name: us-east-1
```

### Supported Parameters

All standard OpenAI parameters are supported, plus these Nova-specific parameters:

| Parameter | Type | Description |
|-----------|------|-------------|
| `top_k` | integer | Limits token selection to top K most likely tokens |
| `reasoning_effort` | `"low"` \| `"high"` | Reasoning effort level (Nova 2 Lite custom models only) |
| `allowed_token_ids` | array[int] | Restrict output to specified token IDs |
| `truncate_prompt_tokens` | integer | Truncate the prompt to N tokens if it exceeds the limit |

```python
response = litellm.completion(
model="sagemaker_nova/my-nova-endpoint",
messages=[{"role": "user", "content": "Think step by step: what is 2+2?"}],
top_k=40,
reasoning_effort="low",
logprobs=True,
top_logprobs=2,
)
```
9 changes: 3 additions & 6 deletions litellm/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1464,12 +1464,9 @@ def set_global_gitlab_config(config: Dict[str, Any]) -> None:
from .llms.petals.completion.transformation import PetalsConfig as PetalsConfig
from .llms.ollama.chat.transformation import OllamaChatConfig as OllamaChatConfig
from .llms.ollama.completion.transformation import OllamaConfig as OllamaConfig
from .llms.sagemaker.completion.transformation import (
SagemakerConfig as SagemakerConfig,
)
from .llms.sagemaker.chat.transformation import (
SagemakerChatConfig as SagemakerChatConfig,
)
from .llms.sagemaker.completion.transformation import SagemakerConfig as SagemakerConfig
from .llms.sagemaker.chat.transformation import SagemakerChatConfig as SagemakerChatConfig
from .llms.sagemaker.nova.transformation import SagemakerNovaConfig as SagemakerNovaConfig
from .llms.cohere.chat.transformation import CohereChatConfig as CohereChatConfig
from .llms.anthropic.experimental_pass_through.messages.transformation import (
AnthropicMessagesConfig as AnthropicMessagesConfig,
Expand Down
5 changes: 5 additions & 0 deletions litellm/_lazy_imports_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,7 @@
"OllamaConfig",
"SagemakerConfig",
"SagemakerChatConfig",
"SagemakerNovaConfig",
"CohereChatConfig",
"AnthropicMessagesConfig",
"AmazonAnthropicClaudeMessagesConfig",
Expand Down Expand Up @@ -701,6 +702,10 @@
".llms.sagemaker.chat.transformation",
"SagemakerChatConfig",
),
"SagemakerNovaConfig": (
".llms.sagemaker.nova.transformation",
"SagemakerNovaConfig",
),
"CohereChatConfig": (".llms.cohere.chat.transformation", "CohereChatConfig"),
"AnthropicMessagesConfig": (
".llms.anthropic.experimental_pass_through.messages.transformation",
Expand Down
1 change: 1 addition & 0 deletions litellm/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -505,6 +505,7 @@
"azure_ai",
"sagemaker",
"sagemaker_chat",
"sagemaker_nova",
"bedrock",
"vllm",
"nlp_cloud",
Expand Down
10 changes: 7 additions & 3 deletions litellm/llms/sagemaker/chat/transformation.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ def get_sync_custom_stream_wrapper(
streaming_response = CustomStreamWrapper(
completion_stream=completion_stream,
model=model,
custom_llm_provider="sagemaker_chat",
custom_llm_provider=custom_llm_provider,
logging_obj=logging_obj,
)
return streaming_response
Expand All @@ -180,8 +180,12 @@ async def get_async_custom_stream_wrapper(
signed_json_body: Optional[bytes] = None,
) -> CustomStreamWrapper:
if client is None or isinstance(client, HTTPHandler):
try:
llm_provider = LlmProviders(custom_llm_provider)
except ValueError:
llm_provider = LlmProviders.SAGEMAKER_CHAT
client = get_async_httpx_client(
llm_provider=LlmProviders.SAGEMAKER_CHAT, params={}
llm_provider=llm_provider, params={}
)

try:
Expand Down Expand Up @@ -210,7 +214,7 @@ async def get_async_custom_stream_wrapper(
streaming_response = CustomStreamWrapper(
completion_stream=completion_stream,
model=model,
custom_llm_provider="sagemaker_chat",
custom_llm_provider=custom_llm_provider,
logging_obj=logging_obj,
)
return streaming_response
1 change: 1 addition & 0 deletions litellm/llms/sagemaker/nova/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .transformation import SagemakerNovaConfig # noqa: F401
70 changes: 70 additions & 0 deletions litellm/llms/sagemaker/nova/transformation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
"""
Translate from OpenAI's `/v1/chat/completions` to SageMaker Nova Inference endpoints.

Nova models on SageMaker use OpenAI-compatible request/response format with
additional Nova-specific parameters (top_k, reasoning_effort, etc.).

Docs: https://docs.aws.amazon.com/nova/latest/nova2-userguide/nova-sagemaker-inference-api-reference.html
"""

from typing import List

from litellm.types.llms.openai import AllMessageValues

from ..chat.transformation import SagemakerChatConfig


class SagemakerNovaConfig(SagemakerChatConfig):
    """
    Config for Amazon Nova models deployed on SageMaker Inference endpoints.

    Nova uses OpenAI-compatible format (same as sagemaker_chat / HF Messages API)
    but with additional Nova-specific parameters and requires `stream: true` in
    the request body for streaming.

    NOTE: `reasoning_effort` is only supported by Nova 2 Lite custom models —
    passing it to a Nova Micro / Nova Lite endpoint will result in an API error.
    SageMaker endpoint names are opaque strings, so support cannot be looked up
    statically here; callers should only send it to endpoints that accept it.

    Usage:
        model="sagemaker_nova/<endpoint-name>"
    """

    @property
    def supports_stream_param_in_request_body(self) -> bool:
        """Nova expects `stream: true` in the request body for streaming."""
        return True

    def get_supported_openai_params(self, model: str) -> List:
        """
        Extend parent params with Nova-specific parameters.

        `reasoning_effort` is advertised unconditionally because the endpoint
        name alone does not reveal the underlying model; see class docstring
        for the Nova 2 Lite restriction.
        """
        params = super().get_supported_openai_params(model)
        nova_params = [
            "top_k",
            "reasoning_effort",  # Nova 2 Lite custom models only — see class docstring
            "allowed_token_ids",
            "truncate_prompt_tokens",
        ]
        for p in nova_params:
            if p not in params:
                params.append(p)
        return params

    def transform_request(
        self,
        model: str,
        messages: List[AllMessageValues],
        optional_params: dict,
        litellm_params: dict,
        headers: dict,
    ) -> dict:
        """
        Nova SageMaker endpoints do not accept 'model' in the request body.
        Only supported fields: messages, max_tokens, max_completion_tokens,
        temperature, top_p, top_k, stream, stream_options, logprobs,
        top_logprobs, reasoning_effort, allowed_token_ids, truncate_prompt_tokens.
        """
        request_body = super().transform_request(
            model=model,
            messages=messages,
            optional_params=optional_params,
            litellm_params=litellm_params,
            headers=headers,
        )
        # The endpoint already identifies the model; sending 'model' is rejected.
        request_body.pop("model", None)
        return request_body
6 changes: 4 additions & 2 deletions litellm/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -3712,8 +3712,10 @@ def completion( # type: ignore # noqa: PLR0915
):
return _model_response
response = _model_response
elif custom_llm_provider == "sagemaker_chat":
elif custom_llm_provider in ("sagemaker_chat", "sagemaker_nova"):
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sagemaker_nova not added to exception mapping

litellm/litellm_core_utils/exception_mapping_utils.py maps AWS-specific error strings (e.g. "Unable to locate credentials", context-window exceeded) to proper LiteLLM exception types, but its condition only covers "sagemaker" and "sagemaker_chat":

elif (
    custom_llm_provider == "sagemaker"
    or custom_llm_provider == "sagemaker_chat"
):

Because "sagemaker_nova" is missing, credential errors and context-window exceeded errors from a Nova endpoint will not be translated into BadRequestError / ContextWindowExceededError — they'll surface as raw, unmapped exceptions instead. The fix is to add sagemaker_nova to that condition (in exception_mapping_utils.py):

elif (
    custom_llm_provider == "sagemaker"
    or custom_llm_provider == "sagemaker_chat"
    or custom_llm_provider == "sagemaker_nova"
):

# boto3 reads keys from .env
# sagemaker_chat: HF Messages API endpoints
# sagemaker_nova: Nova models on SageMaker (OpenAI-compatible)
model_response = base_llm_http_handler.completion(
model=model,
stream=stream,
Expand All @@ -3723,7 +3725,7 @@ def completion( # type: ignore # noqa: PLR0915
model_response=model_response,
optional_params=optional_params,
litellm_params=litellm_params,
custom_llm_provider="sagemaker_chat",
custom_llm_provider=custom_llm_provider,
timeout=timeout,
headers=headers,
encoding=_get_encoding(),
Expand Down
1 change: 1 addition & 0 deletions litellm/types/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3131,6 +3131,7 @@ class LlmProviders(str, Enum):
AZURE_AI = "azure_ai"
SAGEMAKER = "sagemaker"
SAGEMAKER_CHAT = "sagemaker_chat"
SAGEMAKER_NOVA = "sagemaker_nova"
BEDROCK = "bedrock"
VLLM = "vllm"
NLP_CLOUD = "nlp_cloud"
Expand Down
1 change: 1 addition & 0 deletions litellm/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7945,6 +7945,7 @@ def _build_provider_config_map() -> dict[LlmProviders, tuple[Callable, bool]]:
LlmProviders.VERTEX_AI_BETA: (lambda: litellm.VertexGeminiConfig(), False),
LlmProviders.CLOUDFLARE: (lambda: litellm.CloudflareChatConfig(), False),
LlmProviders.SAGEMAKER_CHAT: (lambda: litellm.SagemakerChatConfig(), False),
LlmProviders.SAGEMAKER_NOVA: (lambda: litellm.SagemakerNovaConfig(), False),
LlmProviders.SAGEMAKER: (lambda: litellm.SagemakerConfig(), False),
LlmProviders.FIREWORKS_AI: (lambda: litellm.FireworksAIConfig(), False),
LlmProviders.FRIENDLIAI: (lambda: litellm.FriendliaiChatConfig(), False),
Expand Down
Loading
Loading