diff --git a/deploy/inference-gateway/README.md b/deploy/inference-gateway/README.md index 42cf255b06..fd63874d22 100644 --- a/deploy/inference-gateway/README.md +++ b/deploy/inference-gateway/README.md @@ -196,7 +196,7 @@ You can configure the plugin by setting environment vars in your [values-epp-awa - Set `DYNAMO_ROUTER_REPLICA_SYNC=true` to enable a background watcher to keep multiple router instances in sync (important if you run more than one KV router per component). - By default the Dynamo plugin uses KV routing. You can expose `DYNAMO_USE_KV_ROUTING=false` in your [values-epp-aware.yaml] if you prefer to route in the round-robin fashion. - If using kv-routing: - - Overwrite the `DYNAMO_KV_BLOCK_SIZE` in your [values-epp-aware.yaml](./values-epp-aware.yaml) to match your model's block size.The `DYNAMO_KV_BLOCK_SIZE` env var is ***MANDATORY*** to prevent silent KV routing failures. + - Overwrite the `DYNAMO_KV_BLOCK_SIZE` in your [values-epp-aware.yaml](./values-epp-aware.yaml) to match your inference engine's block size. The default is 16. You can change the default in your model deployment file i.e. ag.yaml in the cli `"python3 -m dynamo.vllm --block-size 128"`. The `DYNAMO_KV_BLOCK_SIZE` env var is ***MANDATORY*** to prevent silent KV routing failures. - Set `DYNAMO_OVERLAP_SCORE_WEIGHT` to weigh how heavily the score uses token overlap (predicted KV cache hits) versus other factors (load, historical hit rate). Higher weight biases toward reusing workers with similar cached prefixes. - Set `DYNAMO_ROUTER_TEMPERATURE` to soften or sharpen the selection curve when combining scores. Low temperature makes the router pick the top candidate deterministically; higher temperature lets lower-scoring workers through more often (exploration). 
- Set `DYNAMO_USE_KV_EVENTS=false` if you want to disable KV event tracking while using kv-routing @@ -218,7 +218,9 @@ Key configurations include: - A service for the inference gateway - Required RBAC roles and bindings - RBAC permissions -- values-epp-aware.yaml sets eppAware.dynamoNamespace=vllm-agg for the bundled example. Point it at your actual Dynamo namespace by editing that file or adding --set eppAware.dynamoNamespace= (and likewise for dynamoComponent, dynamoKvBlockSize if they differ). + +- values-epp-aware.yaml sets `eppAware.dynamoNamespace=vllm-agg` for the bundled example. Point it at your actual Dynamo namespace by editing that file or adding --set eppAware.dynamoNamespace= +- `--set-string eppAware.dynamoKvBlockSize=128` can be overwritten to match your inference engine config. You can also use env var KV_BLOCK_SIZE ### 5. Verify Installation ### diff --git a/deploy/inference-gateway/helm/dynamo-gaie/templates/dynamo-epp.yaml b/deploy/inference-gateway/helm/dynamo-gaie/templates/dynamo-epp.yaml index 826725a586..665c1405a3 100644 --- a/deploy/inference-gateway/helm/dynamo-gaie/templates/dynamo-epp.yaml +++ b/deploy/inference-gateway/helm/dynamo-gaie/templates/dynamo-epp.yaml @@ -70,6 +70,7 @@ spec: {{- $ns := required "set eppAware.dynamoNamespace via values" .Values.eppAware.dynamoNamespace -}} {{- $comp := default "backend" .Values.eppAware.dynamoComponent -}} {{- $kv := default "16" .Values.eppAware.dynamoKvBlockSize -}} + {{- $kv = tpl $kv . 
| trim -}} {{- if .Values.eppAware.enabled }} volumeMounts: diff --git a/deploy/inference-gateway/values-epp-aware.yaml b/deploy/inference-gateway/values-epp-aware.yaml index 626c379020..ad92ba33d1 100644 --- a/deploy/inference-gateway/values-epp-aware.yaml +++ b/deploy/inference-gateway/values-epp-aware.yaml @@ -18,7 +18,7 @@ eppAware: eppImage: nvcr.io/nvstaging/ai-dynamo/gaie-epp-dynamo:v0.6.0-1 dynamoNamespace: vllm-agg dynamoComponent: backend - dynamoKvBlockSize: "16" + dynamoKvBlockSize: '{{ env "KV_BLOCK_SIZE" | default "16" }}' imagePullSecrets: - docker-imagepullsecret diff --git a/recipes/README.md b/recipes/README.md index 81125d07c6..281a631dc2 100644 --- a/recipes/README.md +++ b/recipes/README.md @@ -70,6 +70,15 @@ Example: ./run.sh --model llama-3-70b --framework vllm --deployment-type agg ``` +## If deploying with Gateway API Inference extension GAIE + +```bash +# Match the block size to the cli value in your deployment file deploy.yaml: - "python3 -m dynamo.vllm ... --block-size 128" +export DYNAMO_KV_BLOCK_SIZE=128 +export EPP_IMAGE=nvcr.io/you/epp:tag +# Add --gaie argument to the script i.e.: +./recipes/run.sh --model llama-3-70b --framework vllm --gaie agg +``` ## Dry run mode diff --git a/recipes/llama-3-70b/vllm/agg/deploy.yaml b/recipes/llama-3-70b/vllm/agg/deploy.yaml index c6afcc21ac..0a1ff973d3 100644 --- a/recipes/llama-3-70b/vllm/agg/deploy.yaml +++ b/recipes/llama-3-70b/vllm/agg/deploy.yaml @@ -12,7 +12,6 @@ spec: services: Frontend: componentType: frontend - dynamoNamespace: llama3-70b-agg volumeMounts: - name: model-cache mountPoint: /root/.cache/huggingface diff --git a/recipes/llama-3-70b/vllm/agg/gaie/deploy.sh b/recipes/llama-3-70b/vllm/agg/gaie/deploy.sh new file mode 100755 index 0000000000..d670dea81e --- /dev/null +++ b/recipes/llama-3-70b/vllm/agg/gaie/deploy.sh @@ -0,0 +1,85 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -Eeuo pipefail + +# ===== Config (env overridable) ===== +: "${NAMESPACE:=dynamo}" +: "${RELEASE:=dynamo-gaie}" +: "${EPP_IMAGE:?EPP_IMAGE must be set, e.g. nvcr.io/your/epp:tag}" + +# Per-recipe values +: "${RECIPE_VALUES_1:=model-gaie.yaml}" +: "${RECIPE_VALUES_2:=values-epp-aware.yaml}" + +# ===== Paths ===== +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + +# Find repo root +if GIT_TOP=$(git -C "$SCRIPT_DIR" rev-parse --show-toplevel 2>/dev/null); then + REPO_ROOT="$GIT_TOP" +else + REPO_ROOT="$(cd "$SCRIPT_DIR/../../../../../" && pwd)" +fi + +CHART_DIR="$REPO_ROOT/deploy/inference-gateway/helm/dynamo-gaie" + +if [[ ! -d "$CHART_DIR" ]]; then + echo "ERROR: GAIE chart not found at: $CHART_DIR" + exit 1 +fi + +# ===== Pre-flight checks ===== +command -v helm >/dev/null 2>&1 || { echo "ERROR: helm not found"; exit 1; } +command -v kubectl >/dev/null 2>&1 || { echo "ERROR: kubectl not found"; exit 1; } + +# ===== Namespace ensure ===== +if ! 
kubectl get ns "$NAMESPACE" >/dev/null 2>&1; then + kubectl create namespace "$NAMESPACE" +fi + +# ===== Build chart deps (if any) ===== +helm dependency build "$CHART_DIR" >/dev/null + +# ===== Compose -f args from local files if present ===== +VALUES_ARGS=() +if [[ -f "$SCRIPT_DIR/$RECIPE_VALUES_1" ]]; then + VALUES_ARGS+=(-f "$SCRIPT_DIR/$RECIPE_VALUES_1") +fi +if [[ -f "$SCRIPT_DIR/$RECIPE_VALUES_2" ]]; then + VALUES_ARGS+=(-f "$SCRIPT_DIR/$RECIPE_VALUES_2") +fi + +# Allow caller to add more -f/--set/etc via passthrough args +# Example: +# ./deploy.sh --set eppAware.extraEnv[0].name=FOO --set eppAware.extraEnv[0].value=bar +EXTRA_ARGS=( "$@" ) + +# ===== Install/upgrade ===== +echo "==> Deploying GAIE chart" +echo " Release: $RELEASE" +echo " Namespace: $NAMESPACE" +echo " Chart: $CHART_DIR" +echo " EPP_IMAGE: $EPP_IMAGE" +helm upgrade --install "$RELEASE" "$CHART_DIR" \ + -n "$NAMESPACE" \ + "${VALUES_ARGS[@]}" \ + --set eppAware.enabled=true \ + --set-string eppAware.eppImage="$EPP_IMAGE" \ + "${EXTRA_ARGS[@]}" + +echo "Done." + diff --git a/recipes/llama-3-70b/vllm/agg/gaie/model-gaie.yaml b/recipes/llama-3-70b/vllm/agg/gaie/model-gaie.yaml new file mode 100644 index 0000000000..880b0a730e --- /dev/null +++ b/recipes/llama-3-70b/vllm/agg/gaie/model-gaie.yaml @@ -0,0 +1,19 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Default values for dynamo-gaie. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. + +# This is the Dynamo namespace where the dynamo model is deployed +dynamoNamespace: "llama3-70b-agg" + +# This is the port on which the model is exposed +model: + # This is the model name that will be used to route traffic to the dynamo model + # for example, if the model name is Qwen/Qwen3-0.6B, then the modelShortName should be qwen + # This is used for the modelName in InferencePool GAIE CR. 
+ # The Gateway API matches incoming requests whose model field equals that identifier and then looks up the bound pool. + identifier: "llama3-70b-agg" + # This is the short name of the model that will be used to generate the resource names in kubernetes. + shortName: "llama3" diff --git a/recipes/llama-3-70b/vllm/agg/gaie/values-epp-aware.yaml b/recipes/llama-3-70b/vllm/agg/gaie/values-epp-aware.yaml new file mode 100644 index 0000000000..f832e7e36b --- /dev/null +++ b/recipes/llama-3-70b/vllm/agg/gaie/values-epp-aware.yaml @@ -0,0 +1,33 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +eppAware: + enabled: true + eppImage: nvcr.io/nvstaging/ai-dynamo/gaie-epp-dynamo:v0.6.0-1 + dynamoNamespace: llama3-70b-agg + dynamoComponent: backend + # This must match your cli parameter in --block-size 128 in your VLLM deploy file deploy.yaml. 
+ dynamoKvBlockSize: '{{ env "DYNAMO_KV_BLOCK_SIZE" | default "128" }}' + +imagePullSecrets: + - docker-imagepullsecret + +platformReleaseName: dynamo-platform +platformNamespace: "dynamo" + +epp: + extraEnv: + - name: USE_STREAMING + value: "true" diff --git a/recipes/run.sh b/recipes/run.sh index e611d39711..b5c80a766d 100755 --- a/recipes/run.sh +++ b/recipes/run.sh @@ -19,6 +19,7 @@ RECIPES_DIR="$( cd "$( dirname "$0" )" && pwd )" NAMESPACE="${NAMESPACE:-dynamo}" DOWNLOAD_MODEL=true DEPLOY_TYPE="" +GAIE="${GAIE:-false}" MODEL="" FRAMEWORK="" DRY_RUN="" @@ -98,6 +99,10 @@ while [[ $# -gt 0 ]]; do missing_requirement "$1" fi ;; + --gaie) + GAIE=true + shift + ;; -h|--help) usage ;; @@ -142,6 +147,9 @@ fi MODEL_DIR="$RECIPES_DIR/$MODEL" FRAMEWORK_DIR="$MODEL_DIR/${FRAMEWORK,,}" DEPLOY_PATH="$FRAMEWORK_DIR/$DEPLOY_TYPE" +INTEGRATION="$([[ "$GAIE" == "true" ]] && echo gaie || echo "")" +INTEGRATION_PATH="$DEPLOY_PATH/$INTEGRATION" +INTEG_DEPLOY_SCRIPT="$INTEGRATION_PATH/deploy.sh" # Check if model directory exists if [[ ! -d "$MODEL_DIR" ]]; then @@ -212,6 +220,18 @@ fi echo "Deploying $MODEL ${FRAMEWORK,,} $DEPLOY_TYPE configuration..." $DRY_RUN kubectl apply -n $NAMESPACE -f $DEPLOY_FILE +if [[ "$INTEGRATION" == "gaie" ]]; then + if [[ -x "$INTEG_DEPLOY_SCRIPT" ]]; then + $DRY_RUN "$INTEG_DEPLOY_SCRIPT" + else + echo "Error: Expected executable '$INTEG_DEPLOY_SCRIPT' for GAIE integration." + echo "Hint: create $INTEG_DEPLOY_SCRIPT and make it executable (chmod +x)." + exit 1 + fi + # For now do not run the benchmark + exit +fi + # Launch the benchmark job echo "Launching benchmark job..." $DRY_RUN kubectl apply -n $NAMESPACE -f $PERF_FILE