6 changes: 4 additions & 2 deletions deploy/inference-gateway/README.md
@@ -196,7 +196,7 @@ You can configure the plugin by setting environment vars in your [values-epp-awa
- Set `DYNAMO_ROUTER_REPLICA_SYNC=true` to enable a background watcher to keep multiple router instances in sync (important if you run more than one KV router per component).
- By default the Dynamo plugin uses KV routing. You can set `DYNAMO_USE_KV_ROUTING=false` in your [values-epp-aware.yaml] if you prefer round-robin routing instead.
- If using kv-routing:
- Overwrite the `DYNAMO_KV_BLOCK_SIZE` in your [values-epp-aware.yaml](./values-epp-aware.yaml) to match your model's block size.The `DYNAMO_KV_BLOCK_SIZE` env var is ***MANDATORY*** to prevent silent KV routing failures.
- Overwrite the `DYNAMO_KV_BLOCK_SIZE` in your [values-epp-aware.yaml](./values-epp-aware.yaml) to match your inference engine's block size. The default is 16. You can change the engine's block size in your model deployment file (e.g. deploy.yaml) via the CLI, e.g. `python3 -m dynamo.vllm --block-size 128`. The `DYNAMO_KV_BLOCK_SIZE` env var is ***MANDATORY*** to prevent silent KV routing failures.
- Set `DYNAMO_OVERLAP_SCORE_WEIGHT` to weigh how heavily the score uses token overlap (predicted KV cache hits) versus other factors (load, historical hit rate). Higher weight biases toward reusing workers with similar cached prefixes.
- Set `DYNAMO_ROUTER_TEMPERATURE` to soften or sharpen the selection curve when combining scores. Low temperature makes the router pick the top candidate deterministically; higher temperature lets lower-scoring workers through more often (exploration).
- Set `DYNAMO_USE_KV_EVENTS=false` if you want to disable KV event tracking while using kv-routing (see the example values snippet below).
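
As a rough sketch, these variables could be supplied through the chart's `epp.extraEnv` list in [values-epp-aware.yaml](./values-epp-aware.yaml), the same pattern the bundled recipe uses for `USE_STREAMING`. The numbers below are illustrative only, and the KV block size is normally set through the dedicated `eppAware.dynamoKvBlockSize` value rather than here:

```yaml
# Illustrative only: tune these values for your own workload
epp:
  extraEnv:
    - name: DYNAMO_ROUTER_REPLICA_SYNC   # keep multiple KV router instances in sync
      value: "true"
    - name: DYNAMO_OVERLAP_SCORE_WEIGHT  # bias toward predicted KV cache hits
      value: "1.0"
    - name: DYNAMO_ROUTER_TEMPERATURE    # lower = more deterministic worker selection
      value: "0.5"
    - name: DYNAMO_USE_KV_EVENTS         # set "false" to disable KV event tracking
      value: "true"
```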
@@ -218,7 +218,9 @@ Key configurations include:
- A service for the inference gateway
- Required RBAC roles and bindings
- RBAC permissions
- values-epp-aware.yaml sets eppAware.dynamoNamespace=vllm-agg for the bundled example. Point it at your actual Dynamo namespace by editing that file or adding --set eppAware.dynamoNamespace=<namespace> (and likewise for dynamoComponent, dynamoKvBlockSize if they differ).

- values-epp-aware.yaml sets `eppAware.dynamoNamespace=vllm-agg` for the bundled example. Point it at your actual Dynamo namespace by editing that file or adding `--set eppAware.dynamoNamespace=<namespace>`.
- `--set-string eppAware.dynamoKvBlockSize=128` can be overridden to match your inference engine's configuration; you can also set it via the `KV_BLOCK_SIZE` environment variable (see the example invocation below).
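
For example, a sketch of a helm invocation, assuming the chart is installed from `deploy/inference-gateway/helm/dynamo-gaie` into the `dynamo` namespace; adjust the release name, paths, and values to your setup:

```bash
# Sketch only: release name, chart path, and namespace are placeholders
helm upgrade --install dynamo-gaie ./helm/dynamo-gaie \
  -n dynamo \
  -f values-epp-aware.yaml \
  --set eppAware.enabled=true \
  --set eppAware.dynamoNamespace=<your-dynamo-namespace> \
  --set-string eppAware.dynamoKvBlockSize=128
```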

### 5. Verify Installation ###

@@ -70,6 +70,7 @@ spec:
{{- $ns := required "set eppAware.dynamoNamespace via values" .Values.eppAware.dynamoNamespace -}}
{{- $comp := default "backend" .Values.eppAware.dynamoComponent -}}
{{- $kv := default "16" .Values.eppAware.dynamoKvBlockSize -}}
{{- /* Render with tpl so dynamoKvBlockSize can itself be a template string (e.g. an env lookup) */ -}}
{{- $kv = tpl $kv . | trim -}}

{{- if .Values.eppAware.enabled }}
volumeMounts:
2 changes: 1 addition & 1 deletion deploy/inference-gateway/values-epp-aware.yaml
@@ -18,7 +18,7 @@ eppAware:
  eppImage: nvcr.io/nvstaging/ai-dynamo/gaie-epp-dynamo:v0.6.0-1
  dynamoNamespace: vllm-agg
  dynamoComponent: backend
  dynamoKvBlockSize: "16"
  dynamoKvBlockSize: '{{ env "KV_BLOCK_SIZE" | default "16" }}'

imagePullSecrets:
- docker-imagepullsecret
9 changes: 9 additions & 0 deletions recipes/README.md
@@ -70,6 +70,15 @@ Example:
./run.sh --model llama-3-70b --framework vllm --deployment-type agg
```

## Deploying with the Gateway API Inference Extension (GAIE)

```bash
# Match the block size to the CLI value in your deployment file deploy.yaml: - "python3 -m dynamo.vllm ... --block-size 128"
export DYNAMO_KV_BLOCK_SIZE=128
export EPP_IMAGE=nvcr.io/you/epp:tag
# Add the --gaie flag to the script invocation, e.g.:
./recipes/run.sh --model llama-3-70b --framework vllm --deployment-type agg --gaie
```

## Dry run mode

1 change: 0 additions & 1 deletion recipes/llama-3-70b/vllm/agg/deploy.yaml
@@ -12,7 +12,6 @@ spec:
  services:
    Frontend:
      componentType: frontend
      dynamoNamespace: llama3-70b-agg
      volumeMounts:
        - name: model-cache
          mountPoint: /root/.cache/huggingface
85 changes: 85 additions & 0 deletions recipes/llama-3-70b/vllm/agg/gaie/deploy.sh
@@ -0,0 +1,85 @@
#!/usr/bin/env bash
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -Eeuo pipefail

# ===== Config (env overridable) =====
: "${NAMESPACE:=dynamo}"
: "${RELEASE:=dynamo-gaie}"
: "${EPP_IMAGE:?EPP_IMAGE must be set, e.g. nvcr.io/your/epp:tag}"

# Per-recipe values
: "${RECIPE_VALUES_1:=model-gaie.yaml}"
: "${RECIPE_VALUES_2:=values-epp-aware.yaml}"

# ===== Paths =====
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

# Find repo root
if GIT_TOP=$(git -C "$SCRIPT_DIR" rev-parse --show-toplevel 2>/dev/null); then
  REPO_ROOT="$GIT_TOP"
else
  REPO_ROOT="$(cd "$SCRIPT_DIR/../../../../../" && pwd)"
fi

CHART_DIR="$REPO_ROOT/deploy/inference-gateway/helm/dynamo-gaie"

if [[ ! -d "$CHART_DIR" ]]; then
echo "ERROR: GAIE chart not found at: $CHART_DIR"
exit 1
fi

# ===== Pre-flight checks =====
command -v helm >/dev/null 2>&1 || { echo "ERROR: helm not found"; exit 1; }
command -v kubectl >/dev/null 2>&1 || { echo "ERROR: kubectl not found"; exit 1; }

# ===== Namespace ensure =====
if ! kubectl get ns "$NAMESPACE" >/dev/null 2>&1; then
  kubectl create namespace "$NAMESPACE"
fi

# ===== Build chart deps (if any) =====
helm dependency build "$CHART_DIR" >/dev/null

# ===== Compose -f args from local files if present =====
VALUES_ARGS=()
if [[ -f "$SCRIPT_DIR/$RECIPE_VALUES_1" ]]; then
VALUES_ARGS+=(-f "$SCRIPT_DIR/$RECIPE_VALUES_1")
fi
if [[ -f "$SCRIPT_DIR/$RECIPE_VALUES_2" ]]; then
VALUES_ARGS+=(-f "$SCRIPT_DIR/$RECIPE_VALUES_2")
fi

# Allow caller to add more -f/--set/etc via passthrough args
# Example:
# ./deploy.sh --set eppAware.extraEnv[0].name=FOO --set eppAware.extraEnv[0].value=bar
EXTRA_ARGS=( "$@" )

# ===== Install/upgrade =====
echo "==> Deploying GAIE chart"
echo " Release: $RELEASE"
echo " Namespace: $NAMESPACE"
echo " Chart: $CHART_DIR"
echo " EPP_IMAGE: $EPP_IMAGE"
helm upgrade --install "$RELEASE" "$CHART_DIR" \
  -n "$NAMESPACE" \
  "${VALUES_ARGS[@]}" \
  --set eppAware.enabled=true \
  --set-string eppAware.eppImage="$EPP_IMAGE" \
  "${EXTRA_ARGS[@]}"

echo "Done."

19 changes: 19 additions & 0 deletions recipes/llama-3-70b/vllm/agg/gaie/model-gaie.yaml
@@ -0,0 +1,19 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Default values for dynamo-gaie.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.

# This is the Dynamo namespace where the dynamo model is deployed
dynamoNamespace: "llama3-70b-agg"

# This section identifies the model that the gateway routes traffic to
model:
  # This is the model name that will be used to route traffic to the dynamo model.
  # It is used as the modelName in the InferencePool GAIE CR.
  # The Gateway API matches incoming requests whose model field equals this identifier and then looks up the bound pool.
  identifier: "llama3-70b-agg"
  # This is the short name of the model, used to generate the Kubernetes resource names
  # (for example, for Qwen/Qwen3-0.6B the shortName could be qwen).
  shortName: "llama3"
33 changes: 33 additions & 0 deletions recipes/llama-3-70b/vllm/agg/gaie/values-epp-aware.yaml
@@ -0,0 +1,33 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

eppAware:
  enabled: true
  eppImage: nvcr.io/nvstaging/ai-dynamo/gaie-epp-dynamo:v0.6.0-1
  dynamoNamespace: llama3-70b-agg
  dynamoComponent: backend
  # This must match the --block-size CLI parameter in your vLLM deploy file deploy.yaml (e.g. --block-size 128).
  dynamoKvBlockSize: '{{ env "DYNAMO_KV_BLOCK_SIZE" | default "128" }}'

imagePullSecrets:
- docker-imagepullsecret

platformReleaseName: dynamo-platform
platformNamespace: "dynamo"

epp:
  extraEnv:
    - name: USE_STREAMING
      value: "true"
20 changes: 20 additions & 0 deletions recipes/run.sh
@@ -19,6 +19,7 @@ RECIPES_DIR="$( cd "$( dirname "$0" )" && pwd )"
NAMESPACE="${NAMESPACE:-dynamo}"
DOWNLOAD_MODEL=true
DEPLOY_TYPE=""
GAIE="${GAIE:-false}"
MODEL=""
FRAMEWORK=""
DRY_RUN=""
@@ -98,6 +99,10 @@ while [[ $# -gt 0 ]]; do
            missing_requirement "$1"
        fi
        ;;
    --gaie)
        GAIE=true
        shift
        ;;
    -h|--help)
        usage
        ;;
@@ -142,6 +147,9 @@ fi
MODEL_DIR="$RECIPES_DIR/$MODEL"
FRAMEWORK_DIR="$MODEL_DIR/${FRAMEWORK,,}"
DEPLOY_PATH="$FRAMEWORK_DIR/$DEPLOY_TYPE"
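# When --gaie is passed, expect an executable deploy.sh under the recipe's gaie/ subdirectory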
INTEGRATION="$([[ "$GAIE" == "true" ]] && echo gaie || echo "")"
INTEGRATION_PATH="$DEPLOY_PATH/$INTEGRATION"
INTEG_DEPLOY_SCRIPT="$INTEGRATION_PATH/deploy.sh"

# Check if model directory exists
if [[ ! -d "$MODEL_DIR" ]]; then
@@ -212,6 +220,18 @@ fi
echo "Deploying $MODEL ${FRAMEWORK,,} $DEPLOY_TYPE configuration..."
$DRY_RUN kubectl apply -n $NAMESPACE -f $DEPLOY_FILE

if [[ "$INTEGRATION" == "gaie" ]]; then
if [[ -x "$INTEG_DEPLOY_SCRIPT" ]]; then
$DRY_RUN "$INTEG_DEPLOY_SCRIPT"
else
echo "Error: Expected executable '$INTEG_DEPLOY_SCRIPT' for GAIE integration."
echo "Hint: create $INTEG_DEPLOY_SCRIPT and make it executable (chmod +x)."
exit 1
fi
# For now do not run the benchmark
exit
fi

# Launch the benchmark job
echo "Launching benchmark job..."
$DRY_RUN kubectl apply -n $NAMESPACE -f $PERF_FILE