From 865c8ef8a4675c4d841244df38a1fc53c2b585a8 Mon Sep 17 00:00:00 2001
From: Anna Tchernych <atchernych@nvidia.com>
Date: Fri, 10 Oct 2025 12:03:28 -0700
Subject: [PATCH 1/3] move the EPP build docker file

Signed-off-by: Anna Tchernych <atchernych@nvidia.com>
---
 container/Dockerfile.epp                      |   70 ++
 deploy/inference-gateway/build-epp-dynamo.sh  |   39 +-
 .../epp-v0.5.1-2/epp-v0.5.1-dyn2.patch        | 1013 +++++++++++++++--
 3 files changed, 1033 insertions(+), 89 deletions(-)
 create mode 100644 container/Dockerfile.epp

diff --git a/container/Dockerfile.epp b/container/Dockerfile.epp
new file mode 100644
index 0000000000..d7d7a6c2d9
--- /dev/null
+++ b/container/Dockerfile.epp
@@ -0,0 +1,70 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Dockerfile.epp - Custom Dockerfile for the GAIE EPP image. deploy/inference-gateway/build-epp-dynamo.sh copies it into the GAIE checkout before building.
+
+ARG BUILDER_IMAGE=golang:1.24
+ARG BASE_IMAGE=ubuntu:22.04
+
+############################
+# Builder
+############################
+FROM ${BUILDER_IMAGE} AS builder
+
+ENV CGO_ENABLED=1
+ENV GOOS=linux
+ENV GOARCH=amd64
+# be explicit; helps cgo when linking libstdc++
+ENV CC=gcc
+ENV CXX=g++
+
+# C/C++ toolchain for cgo, and libstdc++ for link-time
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    gcc g++ \
+    libc6-dev \
+    ca-certificates \
+ && rm -rf /var/lib/apt/lists/*
+
+ARG COMMIT_SHA=unknown
+ARG BUILD_REF
+
+WORKDIR /src
+
+# deps first (cache)
+COPY go.mod go.sum ./
+RUN go mod download
+
+# source
+COPY cmd/epp ./cmd/epp
+COPY pkg/epp ./pkg/epp
+COPY internal ./internal
+COPY api ./api
+
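+# sanity check (non-fatal): list the Dynamo header and static-library directories; a missing directory only prints a message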
+RUN ls -la pkg/epp/scheduling/plugins/dynamo_kv_scorer/include/ || echo "Headers not found"
+RUN ls -la pkg/epp/scheduling/plugins/dynamo_kv_scorer/lib/ || echo "Library not found"
+
+# build
+WORKDIR /src/cmd/epp
+RUN go build \
+  -ldflags="-X sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics.CommitSHA=${COMMIT_SHA} -X sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics.BuildRef=${BUILD_REF}" \
+  -o /epp
+
+############################
+# Runtime
+############################
+FROM ${BASE_IMAGE} AS runtime
+
+# Minimal runtime deps; include libstdc++ runtime for -lstdc++
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    ca-certificates \
+    libstdc++6 \
+ && rm -rf /var/lib/apt/lists/* \
+ && groupadd -r nonroot && useradd -r -g nonroot nonroot
+
+WORKDIR /
+COPY --from=builder /epp /epp
+
+USER nonroot:nonroot
+ENTRYPOINT ["/epp"]
diff --git a/deploy/inference-gateway/build-epp-dynamo.sh b/deploy/inference-gateway/build-epp-dynamo.sh
index 40caa4cc6c..55fc2fb898 100755
--- a/deploy/inference-gateway/build-epp-dynamo.sh
+++ b/deploy/inference-gateway/build-epp-dynamo.sh
@@ -23,23 +23,23 @@ if [[ -z "${DYNAMO_DIR}" ]]; then
     exit 1
 fi
 
-if [[ -z "${EPP_DIR}" ]]; then
-    echo "EPP_DIR environment variable must be set"
-    echo "   Example: export EPP_DIR=/path/to/gateway-api-inference-extension-dynamo"
+if [[ -z "${GAIE_DIR}" ]]; then
+    echo "GAIE_DIR environment variable must be set"
+    echo "   Example: export GAIE_DIR=/path/to/gateway-api-inference-extension"
     exit 1
 fi
-DYNAMO_LIB_DIR="${EPP_DIR}/pkg/epp/scheduling/plugins/dynamo_kv_scorer/lib"
-DYNAMO_INCLUDE_DIR="${EPP_DIR}/pkg/epp/scheduling/plugins/dynamo_kv_scorer/include"
+DYNAMO_LIB_DIR="${GAIE_DIR}/pkg/epp/scheduling/plugins/dynamo_kv_scorer/lib"
+DYNAMO_INCLUDE_DIR="${GAIE_DIR}/pkg/epp/scheduling/plugins/dynamo_kv_scorer/include"
 
-echo "🏗️  Building Dynamo KV Router C Library..."
+echo "Building Dynamo KV Router C Library..."
 
 # Step 1: Build the static library
-echo "📦 Building static library..."
+echo "Building static library..."
 cd "${DYNAMO_DIR}"
 cargo build --release -p libdynamo_llm
 
 # Step 2: Generate header file (with fallback)
-echo "📝 Generating C header..."
+echo "Generating C header..."
 HEADER_OUTPUT="${DYNAMO_DIR}/lib/bindings/c/include/nvidia/dynamo_llm/llm_engine.h"
 
 if ! cbindgen --config lib/bindings/c/cbindgen.toml --crate libdynamo_llm --output "${HEADER_OUTPUT}"; then
@@ -47,15 +47,16 @@ if ! cbindgen --config lib/bindings/c/cbindgen.toml --crate libdynamo_llm --outp
     cp "${DYNAMO_DIR}/lib/bindings/c/src/fallback_header.h" "${HEADER_OUTPUT}"
 fi
 
-# Step 3: Ensure EPP directories exist
-echo "Preparing EPP directories..."
+# Step 3: Ensure directories exist
+echo "Preparing directories..."
 mkdir -p "${DYNAMO_LIB_DIR}"
 mkdir -p "${DYNAMO_INCLUDE_DIR}"
 
-# Step 4: Copy files to EPP
-echo "Copying files to EPP..."
+# Step 4: Copy files to GAIE project
+echo "Copying files to the GAIE project..."
 cp "${HEADER_OUTPUT}" "${DYNAMO_INCLUDE_DIR}/"
 cp "${DYNAMO_DIR}/target/release/libdynamo_llm_capi.a" "${DYNAMO_LIB_DIR}/"
+cp "${DYNAMO_DIR}/container/Dockerfile.epp" "${GAIE_DIR}"
 
 # Verify files were copied
 if [[ ! -f "${DYNAMO_INCLUDE_DIR}/llm_engine.h" ]]; then
@@ -68,13 +69,19 @@ if [[ ! -f "${DYNAMO_LIB_DIR}/libdynamo_llm_capi.a" ]]; then
     exit 1
 fi
 
+if [[ ! -f "${GAIE_DIR}/Dockerfile.epp" ]]; then
+    echo "Dockerfile.epp file copy failed!"
+    exit 1
+fi
+
 echo "Files copied successfully:"
 echo "   Header: ${DYNAMO_INCLUDE_DIR}/llm_engine.h"
 echo "   Library: ${DYNAMO_LIB_DIR}/libdynamo_llm_capi.a"
+echo "   Docker: ${GAIE_DIR}/Dockerfile.epp"
 
 # Step 5: Apply Dynamo patch (if it exists)
-echo "🔧 Applying Dynamo patch..."
-cd "${EPP_DIR}"
+echo "Applying Dynamo patch..."
+cd "${GAIE_DIR}"
 
 PATCH_FILE="${DYNAMO_DIR}/deploy/inference-gateway/epp-patches/v0.5.1-2/epp-v0.5.1-dyn2.patch"
 if [[ -f "${PATCH_FILE}" ]]; then
@@ -89,7 +96,7 @@ else
 fi
 
 # Step 6: Build the EPP image
-echo "Building the EPP image..."
+echo "Building the custom EPP image for GAIE..."
 make dynamo-image-local-load
 
-echo "EPP with Dynamo KV routing built"
+echo "EPP image with Dynamo KV routing built"
diff --git a/deploy/inference-gateway/epp-patches/epp-v0.5.1-2/epp-v0.5.1-dyn2.patch b/deploy/inference-gateway/epp-patches/epp-v0.5.1-2/epp-v0.5.1-dyn2.patch
index a6b92ce376..91a8b09ec6 100644
--- a/deploy/inference-gateway/epp-patches/epp-v0.5.1-2/epp-v0.5.1-dyn2.patch
+++ b/deploy/inference-gateway/epp-patches/epp-v0.5.1-2/epp-v0.5.1-dyn2.patch
@@ -1,75 +1,3 @@
-diff --git a/Dockerfile.dynamo b/Dockerfile.dynamo
-new file mode 100644
-index 0000000..3f0e0a0
---- /dev/null
-+++ b/Dockerfile.dynamo
-@@ -0,0 +1,66 @@
-+# Dockerfile.dynamo - Custom Dockerfile for Dynamo FFI plugin
-+ARG BUILDER_IMAGE=golang:1.24
-+ARG BASE_IMAGE=ubuntu:22.04
-+
-+############################
-+# Builder
-+############################
-+FROM ${BUILDER_IMAGE} AS builder
-+
-+ENV CGO_ENABLED=1
-+ENV GOOS=linux
-+ENV GOARCH=amd64
-+# be explicit; helps cgo when linking libstdc++
-+ENV CC=gcc
-+ENV CXX=g++
-+
-+# C/C++ toolchain for cgo, and libstdc++ for link-time
-+RUN apt-get update && apt-get install -y --no-install-recommends \
-+    build-essential \
-+    gcc g++ \
-+    libc6-dev \
-+    ca-certificates \
-+ && rm -rf /var/lib/apt/lists/*
-+
-+ARG COMMIT_SHA=unknown
-+ARG BUILD_REF
-+
-+WORKDIR /src
-+
-+# deps first (cache)
-+COPY go.mod go.sum ./
-+RUN go mod download
-+
-+# source
-+COPY cmd/epp ./cmd/epp
-+COPY pkg/epp ./pkg/epp
-+COPY internal ./internal
-+COPY api ./api
-+
-+# sanity (optional)
-+RUN ls -la pkg/epp/scheduling/plugins/dynamo_kv_scorer/include/ || echo "Headers not found"
-+RUN ls -la pkg/epp/scheduling/plugins/dynamo_kv_scorer/lib/ || echo "Library not found"
-+
-+# build
-+WORKDIR /src/cmd/epp
-+RUN go build \
-+  -ldflags="-X sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics.CommitSHA=${COMMIT_SHA} -X sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics.BuildRef=${BUILD_REF}" \
-+  -o /epp
-+
-+############################
-+# Runtime
-+############################
-+FROM ${BASE_IMAGE} AS runtime
-+
-+# Minimal runtime deps; include libstdc++ runtime for -lstdc++
-+RUN apt-get update && apt-get install -y --no-install-recommends \
-+    ca-certificates \
-+    libstdc++6 \
-+ && rm -rf /var/lib/apt/lists/* \
-+ && groupadd -r nonroot && useradd -r -g nonroot nonroot
-+
-+WORKDIR /
-+COPY --from=builder /epp /epp
-+
-+USER nonroot:nonroot
-+ENTRYPOINT ["/epp"]
 diff --git a/Makefile b/Makefile
 index dee7e99..4679ce2 100644
 --- a/Makefile
@@ -149,6 +77,509 @@ index b5e0617..8592735 100644
  	if err := runner.NewRunner().Run(ctrl.SetupSignalHandler()); err != nil {
  		os.Exit(1)
  	}
+diff --git a/cmd/epp/runner/runner.go b/cmd/epp/runner/runner.go
+index f4a2c9b..692d2e4 100644
+--- a/cmd/epp/runner/runner.go
++++ b/cmd/epp/runner/runner.go
+@@ -18,8 +18,10 @@ package runner
+ 
+ import (
+ 	"context"
++	"crypto/tls"
+ 	"flag"
+ 	"fmt"
++	"net/http"
+ 	"net/http/pprof"
+ 	"os"
+ 
+@@ -136,7 +138,9 @@ var (
+ 
+ 	modelServerMetricsPort = flag.Int("modelServerMetricsPort", 0, "Port to scrape metrics from pods. "+
+ 		"Default value will be set to InferencePool.Spec.TargetPortNumber if not set.")
+-	modelServerMetricsPath = flag.String("modelServerMetricsPath", "/metrics", "Path to scrape metrics from pods")
++	modelServerMetricsPath                    = flag.String("modelServerMetricsPath", "/metrics", "Path to scrape metrics from pods")
++	modelServerMetricsScheme                  = flag.String("modelServerMetricsScheme", "http", "Scheme to scrape metrics from pods")
++	modelServerMetricsHttpsInsecureSkipVerify = flag.Bool("modelServerMetricsHttpsInsecureSkipVerify", true, "When using 'https' scheme for 'modelServerMetricsScheme', configure 'InsecureSkipVerify' (default to true)")
+ 
+ 	setupLog = ctrl.Log.WithName("setup")
+ )
+@@ -167,13 +171,15 @@ func (r *Runner) WithSchedulerConfig(schedulerConfig *scheduling.SchedulerConfig
+ func bindEnvToFlags() {
+ 	// map[ENV_VAR]flagName   – add more as needed
+ 	for env, flg := range map[string]string{
+-		"GRPC_PORT":                     "grpcPort",
+-		"GRPC_HEALTH_PORT":              "grpcHealthPort",
+-		"MODEL_SERVER_METRICS_PORT":     "modelServerMetricsPort",
+-		"MODEL_SERVER_METRICS_PATH":     "modelServerMetricsPath",
+-		"DESTINATION_ENDPOINT_HINT_KEY": "destinationEndpointHintKey",
+-		"POOL_NAME":                     "poolName",
+-		"POOL_NAMESPACE":                "poolNamespace",
++		"GRPC_PORT":                                       "grpcPort",
++		"GRPC_HEALTH_PORT":                                "grpcHealthPort",
++		"MODEL_SERVER_METRICS_PORT":                       "modelServerMetricsPort",
++		"MODEL_SERVER_METRICS_PATH":                       "modelServerMetricsPath",
++		"DESTINATION_ENDPOINT_HINT_KEY":                   "destinationEndpointHintKey",
++		"MODEL_SERVER_METRICS_SCHEME":                     "modelServerMetricsScheme",
++		"MODEL_SERVER_METRICS_HTTPS_INSECURE_SKIP_VERIFY": "modelServerMetricsHttpsInsecureSkipVerify",
++		"POOL_NAME":                                       "poolName",
++		"POOL_NAMESPACE":                                  "poolNamespace",
+ 		// durations & bools work too; flag.Set expects the *string* form
+ 		"REFRESH_METRICS_INTERVAL": "refreshMetricsInterval",
+ 		"SECURE_SERVING":           "secureServing",
+@@ -231,10 +237,26 @@ func (r *Runner) Run(ctx context.Context) error {
+ 		return err
+ 	}
+ 	verifyMetricMapping(*mapping, setupLog)
++
++	var metricsHttpClient *http.Client
++	if *modelServerMetricsScheme == "https" {
++		metricsHttpClient = &http.Client{
++			Transport: &http.Transport{
++				TLSClientConfig: &tls.Config{
++					InsecureSkipVerify: *modelServerMetricsHttpsInsecureSkipVerify,
++				},
++			},
++		}
++	} else {
++		metricsHttpClient = http.DefaultClient
++	}
++
+ 	pmf := backendmetrics.NewPodMetricsFactory(&backendmetrics.PodMetricsClientImpl{
+-		MetricMapping:          mapping,
+-		ModelServerMetricsPort: int32(*modelServerMetricsPort),
+-		ModelServerMetricsPath: *modelServerMetricsPath,
++		MetricMapping:            mapping,
++		ModelServerMetricsPort:   int32(*modelServerMetricsPort),
++		ModelServerMetricsPath:   *modelServerMetricsPath,
++		ModelServerMetricsScheme: *modelServerMetricsScheme,
++		Client:                   metricsHttpClient,
+ 	}, *refreshMetricsInterval)
+ 
+ 	datastore := datastore.NewDatastore(ctx, pmf)
+@@ -348,6 +370,8 @@ func (r *Runner) parsePluginsConfiguration(ctx context.Context) error {
+ 		return fmt.Errorf("failed to load the configuration - %w", err)
+ 	}
+ 
++	setupLog.Info("Configuration file loaded", "config", config)
++
+ 	r.schedulerConfig, err = loader.LoadSchedulerConfig(config.SchedulingProfiles, handle)
+ 	if err != nil {
+ 		return fmt.Errorf("failed to create Scheduler configuration - %w", err)
+@@ -410,6 +434,9 @@ func validateFlags() error {
+ 	if *configText != "" && *configFile != "" {
+ 		return fmt.Errorf("both the %q and %q flags can not be set at the same time", "configText", "configFile")
+ 	}
++	if *modelServerMetricsScheme != "http" && *modelServerMetricsScheme != "https" {
++		return fmt.Errorf("unexpected %q value for %q flag, it can only be set to 'http' or 'https'", *modelServerMetricsScheme, "modelServerMetricsScheme")
++	}
+ 
+ 	return nil
+ }
+diff --git a/config/charts/body-based-routing/values.yaml b/config/charts/body-based-routing/values.yaml
+index 0b88dc4..caccbc9 100644
+--- a/config/charts/body-based-routing/values.yaml
++++ b/config/charts/body-based-routing/values.yaml
+@@ -3,8 +3,8 @@ bbr:
+   replicas: 1
+   image:
+     name: bbr
+-    hub: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension
+-    tag: main
++    hub: registry.k8s.io/gateway-api-inference-extension
++    tag: v0.5.1
+     pullPolicy: Always
+   port: 9004
+   healthCheckPort: 9005
+diff --git a/config/charts/inferencepool/README.md b/config/charts/inferencepool/README.md
+index bed4f33..b8a8d0a 100644
+--- a/config/charts/inferencepool/README.md
++++ b/config/charts/inferencepool/README.md
+@@ -24,26 +24,44 @@ Note that the provider name is needed to deploy provider-specific resources. If
+ 
+ ### Install with Custom Environment Variables
+ 
+-To set custom environment variables for the EndpointPicker deployment:
++To set custom environment variables for the EndpointPicker deployment, you can define them as free-form YAML in the `values.yaml` file:
++
++```yaml
++inferenceExtension:
++  env:
++    - name: FEATURE_FLAG_ENABLED
++      value: "true"
++    - name: CUSTOM_ENV_VAR
++      value: "custom_value"
++    - name: POD_IP
++      valueFrom:
++        fieldRef:
++          fieldPath: status.podIP
++```
++
++Then apply it with:
+ 
+ ```txt
+-$ helm install vllm-llama3-8b-instruct \
+-  --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \
+-  --set provider.name=[none|gke] \
+-  --set inferenceExtension.env.FEATURE_FLAG_ENABLED=true \
+-  oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool --version v0
++$ helm install vllm-llama3-8b-instruct ./config/charts/inferencepool -f values.yaml
+ ```
+ 
+-Alternatively, you can define environment variables in a values file:
++### Install with Additional Ports
++
++To expose additional ports (e.g., for ZMQ), you can define them in the `values.yaml` file:
+ 
+ ```yaml
+-# values.yaml
+ inferenceExtension:
+-  env:
+-    FEATURE_FLAG_ENABLED: "true"
++  extraContainerPorts:
++    - name: zmq
++      containerPort: 5557
++      protocol: TCP
++  extraServicePorts: # if you need to expose the port for external communication
++    - name: zmq
++      port: 5557
++      protocol: TCP
+ ```
+ 
+-And apply it with:
++Then apply it with:
+ 
+ ```txt
+ $ helm install vllm-llama3-8b-instruct ./config/charts/inferencepool -f values.yaml
+@@ -84,7 +102,10 @@ The following table list the configurable parameters of the chart.
+ | `inferenceExtension.image.tag`              | Image tag of the endpoint picker.                                                                                      |
+ | `inferenceExtension.image.pullPolicy`       | Image pull policy for the container. Possible values: `Always`, `IfNotPresent`, or `Never`. Defaults to `Always`.      |
+ | `inferenceExtension.extProcPort`            | Port where the endpoint picker service is served for external processing. Defaults to `9002`.                          |
+-| `inferenceExtension.env`                    | Map of environment variables to set in the endpoint picker container. Defaults to `{}`.                                |
++| `inferenceExtension.env`                    | List of environment variables to set in the endpoint picker container as free-form YAML. Defaults to `[]`.             |
++| `inferenceExtension.extraContainerPorts`    | List of additional container ports to expose. Defaults to `[]`.                                                       |
++| `inferenceExtension.extraServicePorts`      | List of additional service ports to expose. Defaults to `[]`.                                                         |
++| `inferenceExtension.logVerbosity`           | Logging verbosity level for the endpoint picker. Defaults to `"3"`.                                                   |
+ | `provider.name`                             | Name of the Inference Gateway implementation being used. Possible values: `gke`. Defaults to `none`.                   |
+ 
+ ## Notes
+diff --git a/config/charts/inferencepool/templates/epp-config.yaml b/config/charts/inferencepool/templates/epp-config.yaml
+new file mode 100644
+index 0000000..12cbd58
+--- /dev/null
++++ b/config/charts/inferencepool/templates/epp-config.yaml
+@@ -0,0 +1,85 @@
++apiVersion: v1
++kind: ConfigMap
++metadata:
++  name: {{ include "gateway-api-inference-extension.name" . }}
++  namespace: {{ .Release.Namespace }}
++data:
++  default-plugins.yaml: |
++    apiVersion: inference.networking.x-k8s.io/v1alpha1
++    kind: EndpointPickerConfig
++    plugins:
++    - type: low-queue-filter
++      parameters:
++        threshold: 128
++    - type: lora-affinity-filter
++      parameters:
++        threshold: 0.999
++    - type: least-queue-filter
++    - type: least-kv-cache-filter
++    - type: decision-tree-filter
++      name: low-latency-filter
++      parameters:
++        current:
++          pluginRef: low-queue-filter
++        nextOnSuccess:
++          decisionTree:
++            current:
++              pluginRef: lora-affinity-filter
++            nextOnSuccessOrFailure:
++              decisionTree:
++                current:
++                  pluginRef: least-queue-filter
++                nextOnSuccessOrFailure:
++                  decisionTree:
++                    current:
++                      pluginRef: least-kv-cache-filter
++        nextOnFailure:
++          decisionTree:
++            current:
++              pluginRef: least-queue-filter
++            nextOnSuccessOrFailure:
++              decisionTree:
++                current:
++                  pluginRef: lora-affinity-filter
++                nextOnSuccessOrFailure:
++                  decisionTree:
++                    current:
++                      pluginRef: least-kv-cache-filter
++    - type: random-picker
++      parameters:
++        maxNumOfEndpoints: 1
++    - type: single-profile-handler
++    schedulingProfiles:
++    - name: default
++      plugins:
++      - pluginRef: low-latency-filter
++      - pluginRef: random-picker
++  plugins-v2.yaml: |
++    apiVersion: inference.networking.x-k8s.io/v1alpha1
++    kind: EndpointPickerConfig
++    plugins:
++    - type: queue-scorer
++    - type: kv-cache-scorer
++    - type: prefix-cache-scorer
++      parameters:
++        hashBlockSize: 64
++        maxPrefixBlocksToMatch: 256
++        lruCapacityPerServer: 31250
++    - type: max-score-picker
++      parameters:
++        maxNumOfEndpoints: 1
++    - type: single-profile-handler
++    schedulingProfiles:
++    - name: default
++      plugins:
++      - pluginRef: queue-scorer
++        weight: 1
++      - pluginRef: kv-cache-scorer
++        weight: 1
++      - pluginRef: prefix-cache-scorer
++        weight: 1
++      - pluginRef: max-score-picker
++  {{- if (hasKey .Values.inferenceExtension "pluginsCustomConfig") }}
++  {{- .Values.inferenceExtension.pluginsCustomConfig | toYaml | nindent 2 }}
++  {{- end }}
++  
+diff --git a/config/charts/inferencepool/templates/epp-deployment.yaml b/config/charts/inferencepool/templates/epp-deployment.yaml
+index fec91e4..7edc6a3 100644
+--- a/config/charts/inferencepool/templates/epp-deployment.yaml
++++ b/config/charts/inferencepool/templates/epp-deployment.yaml
+@@ -27,16 +27,21 @@ spec:
+         - {{ .Release.Name }}
+         - -poolNamespace
+         - {{ .Release.Namespace }}
+-        - -v
+-        - "3"
+-        - -grpcPort
++        - --v
++        - "{{ .Values.inferenceExtension.logVerbosity | default "3" }}"
++        - --grpcPort
+         - "9002"
+         - -grpcHealthPort
+         - "9003"
+         - -metricsPort
+         - "9090"
++        - -configFile
++        - "config/{{ .Values.inferenceExtension.pluginsConfigFile }}"
+         # https://pkg.go.dev/flag#hdr-Command_line_flag_syntax; space is only for non-bool flags
+-        - "-enablePprof={{ .Values.inferenceExtension.enablePprof }}"
++        - "--enablePprof={{ .Values.inferenceExtension.enablePprof }}"
++        - "--modelServerMetricsPath={{ .Values.inferenceExtension.modelServerMetricsPath }}"
++        - "--modelServerMetricsScheme={{ .Values.inferenceExtension.modelServerMetricsScheme }}"
++        - "--modelServerMetricsHttpsInsecureSkipVerify={{ .Values.inferenceExtension.modelServerMetricsHttpsInsecureSkipVerify }}"
+         {{- if eq (.Values.inferencePool.modelServerType | default "vllm") "triton-tensorrt-llm" }}
+         - -totalQueuedRequestsMetric
+         - "nv_trt_llm_request_metrics{request_type=waiting}"
+@@ -52,6 +57,9 @@ spec:
+           containerPort: 9003
+         - name: metrics
+           containerPort: 9090
++        {{- with .Values.inferenceExtension.extraContainerPorts }}
++        {{- toYaml . | nindent 8 }}
++        {{- end }}
+         livenessProbe:
+           grpc:
+             port: 9003
+@@ -64,8 +72,14 @@ spec:
+             service: inference-extension
+           initialDelaySeconds: 5
+           periodSeconds: 10
++        {{- with .Values.inferenceExtension.env }}
+         env:
+-        {{- range $key, $value := .Values.inferenceExtension.env }}
+-        - name: {{ $key }}
+-          value: {{ $value | quote }}
++        {{- toYaml . | nindent 8 }}
+         {{- end }}
++        volumeMounts:
++        - name: plugins-config-volume
++          mountPath: "/config"
++      volumes:
++      - name: plugins-config-volume
++        configMap:
++          name: {{ include "gateway-api-inference-extension.name" . }}
+diff --git a/config/charts/inferencepool/templates/epp-service.yaml b/config/charts/inferencepool/templates/epp-service.yaml
+index ed23db1..b1a48df 100644
+--- a/config/charts/inferencepool/templates/epp-service.yaml
++++ b/config/charts/inferencepool/templates/epp-service.yaml
+@@ -15,4 +15,7 @@ spec:
+     - name: http-metrics
+       protocol: TCP
+       port: {{ .Values.inferenceExtension.metricsPort | default 9090 }}
++    {{- with .Values.inferenceExtension.extraServicePorts }}
++    {{- toYaml . | nindent 4 }}
++    {{- end }}
+   type: ClusterIP
+diff --git a/config/charts/inferencepool/values.yaml b/config/charts/inferencepool/values.yaml
+index 2b4e800..1541863 100644
+--- a/config/charts/inferencepool/values.yaml
++++ b/config/charts/inferencepool/values.yaml
+@@ -2,16 +2,44 @@ inferenceExtension:
+   replicas: 1
+   image:
+     name: epp
+-    hub: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension
+-    tag: main
++    hub: registry.k8s.io/gateway-api-inference-extension
++    tag: v0.5.1
+     pullPolicy: Always
+   extProcPort: 9002
+-  env: {}
++  env: []
+   enablePprof: true # Enable pprof handlers for profiling and debugging
++  modelServerMetricsPath: "/metrics"
++  modelServerMetricsScheme: "http"
++  modelServerMetricsHttpsInsecureSkipVerify: true
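++  # Name of the plugins configuration file; it must match a key in the EPP ConfigMap (e.g. default-plugins.yaml, plugins-v2.yaml, or a pluginsCustomConfig entry).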
++  pluginsConfigFile: "default-plugins.yaml"
++  # pluginsCustomConfig:
++  #   custom-plugins.yaml: |
++  #     apiVersion: inference.networking.x-k8s.io/v1alpha1
++  #     kind: EndpointPickerConfig
++  #     plugins:
++  #     - type: custom-scorer
++  #       parameters:
++  #         custom-threshold: 64
++  #     - type: max-score-picker
++  #     - type: single-profile-handler
++  #     schedulingProfiles:
++  #     - name: default
++  #       plugins:
++  #       - pluginRef: custom-scorer
++  #         weight: 1
++  #       - pluginRef: max-score-picker
++  #         weight: 1
++
+   # Example environment variables:
+   # env:
+   #   KV_CACHE_SCORE_WEIGHT: "1"
+ 
++  # Define additional container ports
++  extraContainerPorts: []
++  # Define additional service ports
++  extraServicePorts: []
++
+ inferencePool:
+   targetPortNumber: 8000
+   modelServerType: vllm # vllm, triton-tensorrt-llm
+diff --git a/config/manifests/inferencepool-resources.yaml b/config/manifests/inferencepool-resources.yaml
+index 9bb3ea1..cbe3885 100644
+--- a/config/manifests/inferencepool-resources.yaml
++++ b/config/manifests/inferencepool-resources.yaml
+@@ -1,6 +1,8 @@
+-# Note: If you change this file, please also change the file used for e2e tests!
+-# 
+-# https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/test/testdata/inferencepool-e2e.yaml
++# Note: If you change this file, please also change:
++#  - ./test/testdata/inferencepool-e2e.yaml
++#  - ./conformance/resources/manifests/manifests.yaml
++#  - ./site-src/guides/inferencepool-rollout.md
++---
+ apiVersion: inference.networking.x-k8s.io/v1alpha2
+ kind: InferencePool
+ metadata:
+@@ -48,8 +50,8 @@ spec:
+       terminationGracePeriodSeconds: 130
+       containers:
+       - name: epp
+-        image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main
+-        imagePullPolicy: Always
++        image: registry.k8s.io/gateway-api-inference-extension/epp:v0.5.1
++        imagePullPolicy: IfNotPresent
+         args:
+         - -poolName
+         - "vllm-llama3-8b-instruct"
+diff --git a/config/manifests/vllm/cpu-deployment.yaml b/config/manifests/vllm/cpu-deployment.yaml
+index 485d44a..376b0f1 100644
+--- a/config/manifests/vllm/cpu-deployment.yaml
++++ b/config/manifests/vllm/cpu-deployment.yaml
+@@ -14,8 +14,8 @@ spec:
+     spec:
+       containers:
+         - name: lora
+-          image: "public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v0.9.1" # formal images can be found in https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo
+-          imagePullPolicy: Always
++          image: "public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v0.9.2" # formal images can be found in https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo
++          imagePullPolicy: IfNotPresent
+           command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+           args:
+           - "--model"
+diff --git a/config/manifests/vllm/gpu-deployment.yaml b/config/manifests/vllm/gpu-deployment.yaml
+index 16f9388..5664df0 100644
+--- a/config/manifests/vllm/gpu-deployment.yaml
++++ b/config/manifests/vllm/gpu-deployment.yaml
+@@ -14,8 +14,8 @@ spec:
+     spec:
+       containers:
+         - name: vllm
+-          image: "vllm/vllm-openai:latest"
+-          imagePullPolicy: Always
++          image: "vllm/vllm-openai:v0.9.2"
++          imagePullPolicy: IfNotPresent
+           command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+           args:
+           - "--model"
+diff --git a/config/manifests/vllm/sim-deployment.yaml b/config/manifests/vllm/sim-deployment.yaml
+index 196fe86..7021db9 100644
+--- a/config/manifests/vllm/sim-deployment.yaml
++++ b/config/manifests/vllm/sim-deployment.yaml
+@@ -15,7 +15,7 @@ spec:
+       containers:
+       - name: vllm-sim
+         image: ghcr.io/llm-d/llm-d-inference-sim:v0.1.2
+-        imagePullPolicy: Always
++        imagePullPolicy: IfNotPresent
+         args:
+         - --model
+         - meta-llama/Llama-3.1-8B-Instruct
+diff --git a/conformance/resources/manifests/manifests.yaml b/conformance/resources/manifests/manifests.yaml
+index 5fbcfdc..d1341c4 100644
+--- a/conformance/resources/manifests/manifests.yaml
++++ b/conformance/resources/manifests/manifests.yaml
+@@ -196,8 +196,8 @@ spec:
+       terminationGracePeriodSeconds: 130
+       containers:
+       - name: epp
+-        image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main
+-        imagePullPolicy: Always
++        image: registry.k8s.io/gateway-api-inference-extension/epp:v0.5.1
++        imagePullPolicy: IfNotPresent
+         args:
+         - -poolName
+         - "primary-inference-pool"
+@@ -293,8 +293,8 @@ spec:
+       terminationGracePeriodSeconds: 130
+       containers:
+       - name: epp
+-        image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main
+-        imagePullPolicy: Always
++        image: registry.k8s.io/gateway-api-inference-extension/epp:v0.5.1
++        imagePullPolicy: IfNotPresent
+         args:
+         - -poolName
+         - "secondary-inference-pool"
+@@ -342,7 +342,7 @@ apiVersion: v1
+ kind: ConfigMap
+ metadata:
+   name: plugins-config
+-  namespace: default
++  namespace: gateway-conformance-app-backend
+ data:
+   conformance-plugins.yaml: |
+     apiVersion: inference.networking.x-k8s.io/v1alpha1
 diff --git a/pkg/bbr/handlers/request.go b/pkg/bbr/handlers/request.go
 index 32fffc0..1aa1b85 100644
 --- a/pkg/bbr/handlers/request.go
@@ -375,6 +806,71 @@ index a580380..eb2893f 100644
  }
  
  func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error {
+diff --git a/pkg/epp/backend/metrics/metrics.go b/pkg/epp/backend/metrics/metrics.go
+index 590685c..6a50faa 100644
+--- a/pkg/epp/backend/metrics/metrics.go
++++ b/pkg/epp/backend/metrics/metrics.go
+@@ -37,9 +37,12 @@ const (
+ )
+ 
+ type PodMetricsClientImpl struct {
+-	MetricMapping          *MetricMapping
+-	ModelServerMetricsPort int32
+-	ModelServerMetricsPath string
++	MetricMapping            *MetricMapping
++	ModelServerMetricsPort   int32
++	ModelServerMetricsPath   string
++	ModelServerMetricsScheme string
++
++	Client *http.Client
+ }
+ 
+ // FetchMetrics fetches metrics from a given pod, clones the existing metrics object and returns an updated one.
+@@ -49,7 +52,7 @@ func (p *PodMetricsClientImpl) FetchMetrics(ctx context.Context, pod *backend.Po
+ 	if err != nil {
+ 		return nil, fmt.Errorf("failed to create request: %v", err)
+ 	}
+-	resp, err := http.DefaultClient.Do(req)
++	resp, err := p.Client.Do(req)
+ 	if err != nil {
+ 		return nil, fmt.Errorf("failed to fetch metrics from %s: %w", pod.NamespacedName, err)
+ 	}
+@@ -73,7 +76,7 @@ func (p *PodMetricsClientImpl) getMetricEndpoint(pod *backend.Pod, targetPortNum
+ 	if p.ModelServerMetricsPort == 0 {
+ 		p.ModelServerMetricsPort = targetPortNumber
+ 	}
+-	return fmt.Sprintf("http://%s:%d%s", pod.Address, p.ModelServerMetricsPort, p.ModelServerMetricsPath)
++	return fmt.Sprintf("%s://%s:%d%s", p.ModelServerMetricsScheme, pod.Address, p.ModelServerMetricsPort, p.ModelServerMetricsPath)
+ }
+ 
+ // promToPodMetrics updates internal pod metrics with scraped Prometheus metrics.
+diff --git a/pkg/epp/backend/metrics/metrics_test.go b/pkg/epp/backend/metrics/metrics_test.go
+index 9f7c2b8..2dd8ca5 100644
+--- a/pkg/epp/backend/metrics/metrics_test.go
++++ b/pkg/epp/backend/metrics/metrics_test.go
+@@ -19,6 +19,7 @@ package metrics
+ import (
+ 	"context"
+ 	"errors"
++	"net/http"
+ 	"reflect"
+ 	"strconv"
+ 	"strings"
+@@ -495,7 +496,13 @@ func TestFetchMetrics(t *testing.T) {
+ 		},
+ 	}
+ 	existing := &MetricsState{}
+-	p := &PodMetricsClientImpl{ModelServerMetricsPort: 9999, ModelServerMetricsPath: "/metrics"} // No MetricMapping needed for this basic test
++	// No MetricMapping needed for this basic test
++	p := &PodMetricsClientImpl{
++		ModelServerMetricsScheme: "http",
++		ModelServerMetricsPort:   9999,
++		ModelServerMetricsPath:   "/metrics",
++		Client:                   http.DefaultClient,
++	}
+ 
+ 	_, err := p.FetchMetrics(ctx, pod, existing, 9999) // Use a port that's unlikely to be in use
+ 	if err == nil {
 diff --git a/pkg/epp/requestcontrol/plugins/dynamo_inject_workerid/plugin.go b/pkg/epp/requestcontrol/plugins/dynamo_inject_workerid/plugin.go
 new file mode 100644
 index 0000000..b6708fa
@@ -450,6 +946,186 @@ index 0000000..b6708fa
 +	}
 +
 +}
+diff --git a/pkg/epp/scheduling/framework/plugins/multi/prefix/indexer.go b/pkg/epp/scheduling/framework/plugins/multi/prefix/indexer.go
+index 716c9f2..1b75f0f 100644
+--- a/pkg/epp/scheduling/framework/plugins/multi/prefix/indexer.go
++++ b/pkg/epp/scheduling/framework/plugins/multi/prefix/indexer.go
+@@ -85,13 +85,14 @@ func (i *indexer) Get(hash BlockHash) podSet {
+ 	i.mu.RLock()
+ 	defer i.mu.RUnlock()
+ 
+-	res := podSet{}
+-	pods, ok := i.hashToPods[hash]
+-	if !ok {
+-		return res
++	pods := i.hashToPods[hash]
++	res := make(podSet, len(pods))
++	for pod := range pods {
++		// Deep copy to avoid race condition.
++		res[pod] = struct{}{}
+ 	}
+ 
+-	return pods
++	return res
+ }
+ 
+ // makeEvictionFn returns a per-pod LRU eviction callback that removes the pod from hashToPods on eviction.
+diff --git a/pkg/epp/scheduling/framework/plugins/multi/prefix/indexer_test.go b/pkg/epp/scheduling/framework/plugins/multi/prefix/indexer_test.go
+index 2409850..a151121 100644
+--- a/pkg/epp/scheduling/framework/plugins/multi/prefix/indexer_test.go
++++ b/pkg/epp/scheduling/framework/plugins/multi/prefix/indexer_test.go
+@@ -41,4 +41,7 @@ func TestIndexer_AddAndGet(t *testing.T) {
+ 	// Add another entry to the cache, which should evict the first one due to max size.
+ 	i.Add([]BlockHash{BlockHash(3)}, server)
+ 	assert.Equal(t, 2, i.podToLRU[server].Len(), "Cache size should still be 2 after adding an entry")
++
++	servers = i.Get(BlockHash(4))
++	assert.Empty(t, servers, "Cache should not contain non-existent hash")
+ }
+diff --git a/pkg/epp/scheduling/framework/plugins/picker/max_score_picker.go b/pkg/epp/scheduling/framework/plugins/picker/max_score_picker.go
+index bf3ca8d..e7ee333 100644
+--- a/pkg/epp/scheduling/framework/plugins/picker/max_score_picker.go
++++ b/pkg/epp/scheduling/framework/plugins/picker/max_score_picker.go
+@@ -20,7 +20,9 @@ import (
+ 	"context"
+ 	"encoding/json"
+ 	"fmt"
++	"math/rand"
+ 	"slices"
++	"time"
+ 
+ 	"sigs.k8s.io/controller-runtime/pkg/log"
+ 
+@@ -58,13 +60,15 @@ func NewMaxScorePicker(maxNumOfEndpoints int) *MaxScorePicker {
+ 	return &MaxScorePicker{
+ 		typedName:         plugins.TypedName{Type: MaxScorePickerType, Name: MaxScorePickerType},
+ 		maxNumOfEndpoints: maxNumOfEndpoints,
++		randomGenerator:   rand.New(rand.NewSource(time.Now().UnixNano())),
+ 	}
+ }
+ 
+ // MaxScorePicker picks pod(s) with the maximum score from the list of candidates.
+ type MaxScorePicker struct {
+ 	typedName         plugins.TypedName
+-	maxNumOfEndpoints int // maximum number of endpoints to pick
++	maxNumOfEndpoints int        // maximum number of endpoints to pick
++	randomGenerator   *rand.Rand // randomGenerator for randomly pick endpoint on tie-break
+ }
+ 
+ // WithName sets the picker's name
+@@ -83,6 +87,11 @@ func (p *MaxScorePicker) Pick(ctx context.Context, cycleState *types.CycleState,
+ 	log.FromContext(ctx).V(logutil.DEBUG).Info(fmt.Sprintf("Selecting maximum '%d' pods from %d candidates sorted by max score: %+v", p.maxNumOfEndpoints,
+ 		len(scoredPods), scoredPods))
+ 
++	// Shuffle in-place - needed for random tie break when scores are equal
++	p.randomGenerator.Shuffle(len(scoredPods), func(i, j int) {
++		scoredPods[i], scoredPods[j] = scoredPods[j], scoredPods[i]
++	})
++
+ 	slices.SortStableFunc(scoredPods, func(i, j *types.ScoredPod) int { // highest score first
+ 		if i.Score > j.Score {
+ 			return -1
+diff --git a/pkg/epp/scheduling/framework/plugins/picker/picker_test.go b/pkg/epp/scheduling/framework/plugins/picker/picker_test.go
+index 2089ed3..2c3aceb 100644
+--- a/pkg/epp/scheduling/framework/plugins/picker/picker_test.go
++++ b/pkg/epp/scheduling/framework/plugins/picker/picker_test.go
+@@ -21,6 +21,7 @@ import (
+ 	"testing"
+ 
+ 	"github.com/google/go-cmp/cmp"
++	"github.com/google/go-cmp/cmp/cmpopts"
+ 	k8stypes "k8s.io/apimachinery/pkg/types"
+ 
+ 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend"
+@@ -34,10 +35,11 @@ func TestPickMaxScorePicker(t *testing.T) {
+ 	pod3 := &types.PodMetrics{Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod3"}}}
+ 
+ 	tests := []struct {
+-		name   string
+-		picker framework.Picker
+-		input  []*types.ScoredPod
+-		output []types.Pod
++		name               string
++		picker             framework.Picker
++		input              []*types.ScoredPod
++		output             []types.Pod
++		tieBreakCandidates int // tie break is random, specify how many candidate with max score
+ 	}{
+ 		{
+ 			name:   "Single max score",
+@@ -63,6 +65,7 @@ func TestPickMaxScorePicker(t *testing.T) {
+ 				&types.ScoredPod{Pod: pod1, Score: 50},
+ 				&types.ScoredPod{Pod: pod2, Score: 50},
+ 			},
++			tieBreakCandidates: 2,
+ 		},
+ 		{
+ 			name:   "Multiple results sorted by highest score, more pods than needed",
+@@ -104,6 +107,7 @@ func TestPickMaxScorePicker(t *testing.T) {
+ 				&types.ScoredPod{Pod: pod3, Score: 30},
+ 				&types.ScoredPod{Pod: pod2, Score: 25},
+ 			},
++			tieBreakCandidates: 2,
+ 		},
+ 	}
+ 
+@@ -112,6 +116,19 @@ func TestPickMaxScorePicker(t *testing.T) {
+ 			result := test.picker.Pick(context.Background(), types.NewCycleState(), test.input)
+ 			got := result.TargetPods
+ 
++			if test.tieBreakCandidates > 0 {
++				testMaxScoredPods := test.output[:test.tieBreakCandidates]
++				gotMaxScoredPods := got[:test.tieBreakCandidates]
++				diff := cmp.Diff(testMaxScoredPods, gotMaxScoredPods, cmpopts.SortSlices(func(a, b types.Pod) bool {
++					return a.String() < b.String() // predictable order within the pods with equal scores
++				}))
++				if diff != "" {
++					t.Errorf("Unexpected output (-want +got): %v", diff)
++				}
++				test.output = test.output[test.tieBreakCandidates:]
++				got = got[test.tieBreakCandidates:]
++			}
++
+ 			if diff := cmp.Diff(test.output, got); diff != "" {
+ 				t.Errorf("Unexpected output (-want +got): %v", diff)
+ 			}
+diff --git a/pkg/epp/scheduling/framework/plugins/picker/random_picker.go b/pkg/epp/scheduling/framework/plugins/picker/random_picker.go
+index bb272f1..eb62c37 100644
+--- a/pkg/epp/scheduling/framework/plugins/picker/random_picker.go
++++ b/pkg/epp/scheduling/framework/plugins/picker/random_picker.go
+@@ -21,6 +21,7 @@ import (
+ 	"encoding/json"
+ 	"fmt"
+ 	"math/rand"
++	"time"
+ 
+ 	"sigs.k8s.io/controller-runtime/pkg/log"
+ 
+@@ -57,6 +58,7 @@ func NewRandomPicker(maxNumOfEndpoints int) *RandomPicker {
+ 	return &RandomPicker{
+ 		typedName:         plugins.TypedName{Type: RandomPickerType, Name: RandomPickerType},
+ 		maxNumOfEndpoints: maxNumOfEndpoints,
++		randomGenerator:   rand.New(rand.NewSource(time.Now().UnixNano())),
+ 	}
+ }
+ 
+@@ -64,6 +66,7 @@ func NewRandomPicker(maxNumOfEndpoints int) *RandomPicker {
+ type RandomPicker struct {
+ 	typedName         plugins.TypedName
+ 	maxNumOfEndpoints int
++	randomGenerator   *rand.Rand // randomGenerator for randomly pick endpoint on tie-break
+ }
+ 
+ // WithName sets the name of the picker.
+@@ -83,7 +86,7 @@ func (p *RandomPicker) Pick(ctx context.Context, _ *types.CycleState, scoredPods
+ 		len(scoredPods), scoredPods))
+ 
+ 	// Shuffle in-place
+-	rand.Shuffle(len(scoredPods), func(i, j int) {
++	p.randomGenerator.Shuffle(len(scoredPods), func(i, j int) {
+ 		scoredPods[i], scoredPods[j] = scoredPods[j], scoredPods[i]
+ 	})
+ 
 diff --git a/pkg/epp/scheduling/plugins/dynamo_kv_scorer/epp-config-dynamo.yaml b/pkg/epp/scheduling/plugins/dynamo_kv_scorer/epp-config-dynamo.yaml
 new file mode 100644
 index 0000000..b689c00
@@ -647,7 +1323,7 @@ index 0000000..1f6a41f
 +)
 +
 +func loadDynamoConfig() {
-+	ffiNamespace = getEnvOrDefault("DYN_NAMESPACE", "vllm-agg")
++	ffiNamespace = getEnvOrDefault("DYNAMO_NAMESPACE", "vllm-agg")
 +	ffiComponent = getEnvOrDefault("DYNAMO_COMPONENT", "backend")
 +	ffiModel = getEnvOrDefault("DYNAMO_MODEL", "Qwen/Qwen3-0.6B")
 +	ffiWorkerID = getEnvInt64OrDefault("DYNAMO_WORKER_ID", 1)
@@ -911,3 +1587,194 @@ index 0000000..1f6a41f
 +	}
 +	return nil
 +}
+diff --git a/site-src/guides/inferencepool-rollout.md b/site-src/guides/inferencepool-rollout.md
+index 89a384a..809fb7f 100644
+--- a/site-src/guides/inferencepool-rollout.md
++++ b/site-src/guides/inferencepool-rollout.md
+@@ -177,7 +177,6 @@ spec:
+       terminationGracePeriodSeconds: 130
+       nodeSelector:
+         cloud.google.com/gke-accelerator: "nvidia-h100-80gb"
+-
+       volumes:
+         - name: data
+           emptyDir: {}
+@@ -250,40 +249,133 @@ spec:
+     spec:
+       terminationGracePeriodSeconds: 130
+       containers:
+-        - name: epp
+-          image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main
+-          imagePullPolicy: Always
+-          args:
+-            - -poolName
+-            - "vllm-llama3-8b-instruct-new"
+-            - "-poolNamespace"
+-            - "default"
+-            - -v
+-            - "4"
+-            - --zap-encoder
+-            - "json"
+-            - -grpcPort
+-            - "9002"
+-            - -grpcHealthPort
+-            - "9003"
+-          ports:
+-            - containerPort: 9002
+-            - containerPort: 9003
+-            - name: metrics
+-              containerPort: 9090
+-          livenessProbe:
+-            grpc:
+-              port: 9003
+-              service: inference-extension
+-            initialDelaySeconds: 5
+-            periodSeconds: 10
+-          readinessProbe:
+-            grpc:
+-              port: 9003
+-              service: inference-extension
+-            initialDelaySeconds: 5
+-            periodSeconds: 10
+-  EOF
++      - name: epp
++        image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main
++        imagePullPolicy: Always
++        args:
++        - -poolName
++        - "vllm-llama3-8b-instruct-new"
++        - -poolNamespace
++        - "default"
++        - -v
++        - "4"
++        - --zap-encoder
++        - "json"
++        - -grpcPort
++        - "9002"
++        - -grpcHealthPort
++        - "9003"
++        - -configFile
++        - "/config/default-plugins.yaml"
++        ports:
++        - containerPort: 9002
++          name: grpc
++        - containerPort: 9003
++          name: grpc-health
++        - containerPort: 9090
++          name: metrics
++        livenessProbe:
++          grpc:
++            port: 9003
++            service: inference-extension
++          initialDelaySeconds: 5
++          periodSeconds: 10
++        readinessProbe:
++          grpc:
++            port: 9003
++            service: inference-extension
++          initialDelaySeconds: 5
++          periodSeconds: 10
++        volumeMounts:
++        - name: plugins-config-volume
++          mountPath: /config
++      volumes:
++      - name: plugins-config-volume
++        configMap:
++          name: plugins-config
++---
++apiVersion: v1
++kind: ConfigMap
++metadata:
++  name: plugins-config
++  namespace: default
++data:
++  default-plugins.yaml: |
++    apiVersion: inference.networking.x-k8s.io/v1alpha1
++    kind: EndpointPickerConfig
++    plugins:
++    - type: low-queue-filter
++      parameters:
++        threshold: 128
++    - type: lora-affinity-filter
++      parameters:
++        threshold: 0.999
++    - type: least-queue-filter
++    - type: least-kv-cache-filter
++    - type: decision-tree-filter
++      name: low-latency-filter
++      parameters:
++        current:
++          pluginRef: low-queue-filter
++        nextOnSuccess:
++          decisionTree:
++            current:
++              pluginRef: lora-affinity-filter
++            nextOnSuccessOrFailure:
++              decisionTree:
++                current:
++                  pluginRef: least-queue-filter
++                nextOnSuccessOrFailure:
++                  decisionTree:
++                    current:
++                      pluginRef: least-kv-cache-filter
++        nextOnFailure:
++          decisionTree:
++            current:
++              pluginRef: least-queue-filter
++            nextOnSuccessOrFailure:
++              decisionTree:
++                current:
++                  pluginRef: lora-affinity-filter
++                nextOnSuccessOrFailure:
++                  decisionTree:
++                    current:
++                      pluginRef: least-kv-cache-filter
++    - type: random-picker
++      parameters:
++        maxNumOfEndpoints: 1
++    - type: single-profile-handler
++    schedulingProfiles:
++    - name: default
++      plugins:
++      - pluginRef: low-latency-filter
++      - pluginRef: random-picker
++  plugins-v2.yaml: |
++    apiVersion: inference.networking.x-k8s.io/v1alpha1
++    kind: EndpointPickerConfig
++    plugins:
++    - type: queue-scorer
++    - type: kv-cache-scorer
++    - type: prefix-cache-scorer
++      parameters:
++        hashBlockSize: 64
++        maxPrefixBlocksToMatch: 256
++        lruCapacityPerServer: 31250
++    - type: max-score-picker
++      parameters:
++        maxNumOfEndpoints: 1
++    - type: single-profile-handler
++    schedulingProfiles:
++    - name: default
++      plugins:
++      - pluginRef: queue-scorer
++        weight: 1
++      - pluginRef: kv-cache-scorer
++        weight: 1
++      - pluginRef: prefix-cache-scorer
++        weight: 1
++      - pluginRef: max-score-picker
++EOF
+ ```
+ 
+ ### Direct traffic to the new inference pool
+diff --git a/version/version.go b/version/version.go
+index 1da42f2..1372ba8 100644
+--- a/version/version.go
++++ b/version/version.go
+@@ -18,5 +18,5 @@ package version
+ 
+ const (
+ 	// BundleVersion is the value used for labeling the version of the gateway-api-inference-extension.
+-	BundleVersion = "v0.4.0-dev"
++	BundleVersion = "v0.5.1"
+ )

From 961532954b8e69ad1fed7dd9a296153aa7bf8ec7 Mon Sep 17 00:00:00 2001
From: Anna Tchernych <atchernych@nvidia.com>
Date: Fri, 10 Oct 2025 12:16:09 -0700
Subject: [PATCH 2/3] update name

Signed-off-by: Anna Tchernych <atchernych@nvidia.com>
---
 deploy/inference-gateway/README.md           | 2 +-
 deploy/inference-gateway/build-epp-dynamo.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/deploy/inference-gateway/README.md b/deploy/inference-gateway/README.md
index ec508110e2..781ff17402 100644
--- a/deploy/inference-gateway/README.md
+++ b/deploy/inference-gateway/README.md
@@ -162,7 +162,7 @@ The script will apply a custom patch to the code with your GAIE repo and build t
 ```bash
 # Use your custom paths
 export DYNAMO_DIR=/path/to/dynamo
-export EPP_DIR=/path/to/gateway-api-inference-extension
+export GAIE_DIR=/path/to/gateway-api-inference-extension
 
 # Run the script
 cd deploy/inference-gateway
diff --git a/deploy/inference-gateway/build-epp-dynamo.sh b/deploy/inference-gateway/build-epp-dynamo.sh
index 55fc2fb898..cb7cbdfcc0 100755
--- a/deploy/inference-gateway/build-epp-dynamo.sh
+++ b/deploy/inference-gateway/build-epp-dynamo.sh
@@ -56,7 +56,7 @@ mkdir -p "${DYNAMO_INCLUDE_DIR}"
 echo "Copying files to the GAIE project..."
 cp "${HEADER_OUTPUT}" "${DYNAMO_INCLUDE_DIR}/"
 cp "${DYNAMO_DIR}/target/release/libdynamo_llm_capi.a" "${DYNAMO_LIB_DIR}/"
-cp "${DYNAMO_DIR}/container/Dockerfile.epp" "${GAIE_DIR}"
+cp "${DYNAMO_DIR}/container/Dockerfile.epp" "${GAIE_DIR}/Dockerfile.dynamo"
 
 # Verify files were copied
 if [[ ! -f "${DYNAMO_INCLUDE_DIR}/llm_engine.h" ]]; then

From a25e420ce62c67c4d613516b2aafaa1e1fb340cb Mon Sep 17 00:00:00 2001
From: Anna Tchernych <atchernych@nvidia.com>
Date: Mon, 13 Oct 2025 10:34:49 -0700
Subject: [PATCH 3/3] cleaned up the patch

Signed-off-by: Anna Tchernych <atchernych@nvidia.com>
---
 container/Dockerfile.epp                      |   2 -
 .../epp-v0.5.1-2/epp-v0.5.1-dyn2.patch        | 939 ------------------
 2 files changed, 941 deletions(-)

diff --git a/container/Dockerfile.epp b/container/Dockerfile.epp
index d7d7a6c2d9..977f1bbf78 100644
--- a/container/Dockerfile.epp
+++ b/container/Dockerfile.epp
@@ -12,8 +12,6 @@ ARG BASE_IMAGE=ubuntu:22.04
 FROM ${BUILDER_IMAGE} AS builder
 
 ENV CGO_ENABLED=1
-ENV GOOS=linux
-ENV GOARCH=amd64
 # be explicit; helps cgo when linking libstdc++
 ENV CC=gcc
 ENV CXX=g++
diff --git a/deploy/inference-gateway/epp-patches/epp-v0.5.1-2/epp-v0.5.1-dyn2.patch b/deploy/inference-gateway/epp-patches/epp-v0.5.1-2/epp-v0.5.1-dyn2.patch
index 91a8b09ec6..f13bce189d 100644
--- a/deploy/inference-gateway/epp-patches/epp-v0.5.1-2/epp-v0.5.1-dyn2.patch
+++ b/deploy/inference-gateway/epp-patches/epp-v0.5.1-2/epp-v0.5.1-dyn2.patch
@@ -77,509 +77,6 @@ index b5e0617..8592735 100644
  	if err := runner.NewRunner().Run(ctrl.SetupSignalHandler()); err != nil {
  		os.Exit(1)
  	}
-diff --git a/cmd/epp/runner/runner.go b/cmd/epp/runner/runner.go
-index f4a2c9b..692d2e4 100644
---- a/cmd/epp/runner/runner.go
-+++ b/cmd/epp/runner/runner.go
-@@ -18,8 +18,10 @@ package runner
- 
- import (
- 	"context"
-+	"crypto/tls"
- 	"flag"
- 	"fmt"
-+	"net/http"
- 	"net/http/pprof"
- 	"os"
- 
-@@ -136,7 +138,9 @@ var (
- 
- 	modelServerMetricsPort = flag.Int("modelServerMetricsPort", 0, "Port to scrape metrics from pods. "+
- 		"Default value will be set to InferencePool.Spec.TargetPortNumber if not set.")
--	modelServerMetricsPath = flag.String("modelServerMetricsPath", "/metrics", "Path to scrape metrics from pods")
-+	modelServerMetricsPath                    = flag.String("modelServerMetricsPath", "/metrics", "Path to scrape metrics from pods")
-+	modelServerMetricsScheme                  = flag.String("modelServerMetricsScheme", "http", "Scheme to scrape metrics from pods")
-+	modelServerMetricsHttpsInsecureSkipVerify = flag.Bool("modelServerMetricsHttpsInsecureSkipVerify", true, "When using 'https' scheme for 'modelServerMetricsScheme', configure 'InsecureSkipVerify' (default to true)")
- 
- 	setupLog = ctrl.Log.WithName("setup")
- )
-@@ -167,13 +171,15 @@ func (r *Runner) WithSchedulerConfig(schedulerConfig *scheduling.SchedulerConfig
- func bindEnvToFlags() {
- 	// map[ENV_VAR]flagName   – add more as needed
- 	for env, flg := range map[string]string{
--		"GRPC_PORT":                     "grpcPort",
--		"GRPC_HEALTH_PORT":              "grpcHealthPort",
--		"MODEL_SERVER_METRICS_PORT":     "modelServerMetricsPort",
--		"MODEL_SERVER_METRICS_PATH":     "modelServerMetricsPath",
--		"DESTINATION_ENDPOINT_HINT_KEY": "destinationEndpointHintKey",
--		"POOL_NAME":                     "poolName",
--		"POOL_NAMESPACE":                "poolNamespace",
-+		"GRPC_PORT":                                       "grpcPort",
-+		"GRPC_HEALTH_PORT":                                "grpcHealthPort",
-+		"MODEL_SERVER_METRICS_PORT":                       "modelServerMetricsPort",
-+		"MODEL_SERVER_METRICS_PATH":                       "modelServerMetricsPath",
-+		"DESTINATION_ENDPOINT_HINT_KEY":                   "destinationEndpointHintKey",
-+		"MODEL_SERVER_METRICS_SCHEME":                     "modelServerMetricsScheme",
-+		"MODEL_SERVER_METRICS_HTTPS_INSECURE_SKIP_VERIFY": "modelServerMetricsHttpsInsecureSkipVerify",
-+		"POOL_NAME":                                       "poolName",
-+		"POOL_NAMESPACE":                                  "poolNamespace",
- 		// durations & bools work too; flag.Set expects the *string* form
- 		"REFRESH_METRICS_INTERVAL": "refreshMetricsInterval",
- 		"SECURE_SERVING":           "secureServing",
-@@ -231,10 +237,26 @@ func (r *Runner) Run(ctx context.Context) error {
- 		return err
- 	}
- 	verifyMetricMapping(*mapping, setupLog)
-+
-+	var metricsHttpClient *http.Client
-+	if *modelServerMetricsScheme == "https" {
-+		metricsHttpClient = &http.Client{
-+			Transport: &http.Transport{
-+				TLSClientConfig: &tls.Config{
-+					InsecureSkipVerify: *modelServerMetricsHttpsInsecureSkipVerify,
-+				},
-+			},
-+		}
-+	} else {
-+		metricsHttpClient = http.DefaultClient
-+	}
-+
- 	pmf := backendmetrics.NewPodMetricsFactory(&backendmetrics.PodMetricsClientImpl{
--		MetricMapping:          mapping,
--		ModelServerMetricsPort: int32(*modelServerMetricsPort),
--		ModelServerMetricsPath: *modelServerMetricsPath,
-+		MetricMapping:            mapping,
-+		ModelServerMetricsPort:   int32(*modelServerMetricsPort),
-+		ModelServerMetricsPath:   *modelServerMetricsPath,
-+		ModelServerMetricsScheme: *modelServerMetricsScheme,
-+		Client:                   metricsHttpClient,
- 	}, *refreshMetricsInterval)
- 
- 	datastore := datastore.NewDatastore(ctx, pmf)
-@@ -348,6 +370,8 @@ func (r *Runner) parsePluginsConfiguration(ctx context.Context) error {
- 		return fmt.Errorf("failed to load the configuration - %w", err)
- 	}
- 
-+	setupLog.Info("Configuration file loaded", "config", config)
-+
- 	r.schedulerConfig, err = loader.LoadSchedulerConfig(config.SchedulingProfiles, handle)
- 	if err != nil {
- 		return fmt.Errorf("failed to create Scheduler configuration - %w", err)
-@@ -410,6 +434,9 @@ func validateFlags() error {
- 	if *configText != "" && *configFile != "" {
- 		return fmt.Errorf("both the %q and %q flags can not be set at the same time", "configText", "configFile")
- 	}
-+	if *modelServerMetricsScheme != "http" && *modelServerMetricsScheme != "https" {
-+		return fmt.Errorf("unexpected %q value for %q flag, it can only be set to 'http' or 'https'", *modelServerMetricsScheme, "model-server-metrics-scheme")
-+	}
- 
- 	return nil
- }
-diff --git a/config/charts/body-based-routing/values.yaml b/config/charts/body-based-routing/values.yaml
-index 0b88dc4..caccbc9 100644
---- a/config/charts/body-based-routing/values.yaml
-+++ b/config/charts/body-based-routing/values.yaml
-@@ -3,8 +3,8 @@ bbr:
-   replicas: 1
-   image:
-     name: bbr
--    hub: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension
--    tag: main
-+    hub: registry.k8s.io/gateway-api-inference-extension
-+    tag: v0.5.1
-     pullPolicy: Always
-   port: 9004
-   healthCheckPort: 9005
-diff --git a/config/charts/inferencepool/README.md b/config/charts/inferencepool/README.md
-index bed4f33..b8a8d0a 100644
---- a/config/charts/inferencepool/README.md
-+++ b/config/charts/inferencepool/README.md
-@@ -24,26 +24,44 @@ Note that the provider name is needed to deploy provider-specific resources. If
- 
- ### Install with Custom Environment Variables
- 
--To set custom environment variables for the EndpointPicker deployment:
-+To set custom environment variables for the EndpointPicker deployment, you can define them as free-form YAML in the `values.yaml` file:
-+
-+```yaml
-+inferenceExtension:
-+  env:
-+    - name: FEATURE_FLAG_ENABLED
-+      value: "true"
-+    - name: CUSTOM_ENV_VAR
-+      value: "custom_value"
-+    - name: POD_IP
-+      valueFrom:
-+        fieldRef:
-+          fieldPath: status.podIP
-+```
-+
-+Then apply it with:
- 
- ```txt
--$ helm install vllm-llama3-8b-instruct \
--  --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \
--  --set provider.name=[none|gke] \
--  --set inferenceExtension.env.FEATURE_FLAG_ENABLED=true \
--  oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool --version v0
-+$ helm install vllm-llama3-8b-instruct ./config/charts/inferencepool -f values.yaml
- ```
- 
--Alternatively, you can define environment variables in a values file:
-+### Install with Additional Ports
-+
-+To expose additional ports (e.g., for ZMQ), you can define them in the `values.yaml` file:
- 
- ```yaml
--# values.yaml
- inferenceExtension:
--  env:
--    FEATURE_FLAG_ENABLED: "true"
-+  extraContainerPorts:
-+    - name: zmq
-+      containerPort: 5557
-+      protocol: TCP
-+  extraServicePorts: # if need to expose the port for external communication
-+    - name: zmq
-+      port: 5557
-+      protocol: TCP
- ```
- 
--And apply it with:
-+Then apply it with:
- 
- ```txt
- $ helm install vllm-llama3-8b-instruct ./config/charts/inferencepool -f values.yaml
-@@ -84,7 +102,10 @@ The following table list the configurable parameters of the chart.
- | `inferenceExtension.image.tag`              | Image tag of the endpoint picker.                                                                                      |
- | `inferenceExtension.image.pullPolicy`       | Image pull policy for the container. Possible values: `Always`, `IfNotPresent`, or `Never`. Defaults to `Always`.      |
- | `inferenceExtension.extProcPort`            | Port where the endpoint picker service is served for external processing. Defaults to `9002`.                          |
--| `inferenceExtension.env`                    | Map of environment variables to set in the endpoint picker container. Defaults to `{}`.                                |
-+| `inferenceExtension.env`                    | List of environment variables to set in the endpoint picker container as free-form YAML. Defaults to `[]`.             |
-+| `inferenceExtension.extraContainerPorts`    | List of additional container ports to expose. Defaults to `[]`.                                                       |
-+| `inferenceExtension.extraServicePorts`      | List of additional service ports to expose. Defaults to `[]`.                                                         |
-+| `inferenceExtension.logVerbosity`           | Logging verbosity level for the endpoint picker. Defaults to `"3"`.                                                   |
- | `provider.name`                             | Name of the Inference Gateway implementation being used. Possible values: `gke`. Defaults to `none`.                   |
- 
- ## Notes
-diff --git a/config/charts/inferencepool/templates/epp-config.yaml b/config/charts/inferencepool/templates/epp-config.yaml
-new file mode 100644
-index 0000000..12cbd58
---- /dev/null
-+++ b/config/charts/inferencepool/templates/epp-config.yaml
-@@ -0,0 +1,85 @@
-+apiVersion: v1
-+kind: ConfigMap
-+metadata:
-+  name: {{ include "gateway-api-inference-extension.name" . }}
-+  namespace: {{ .Release.Namespace }}
-+data:
-+  default-plugins.yaml: |
-+    apiVersion: inference.networking.x-k8s.io/v1alpha1
-+    kind: EndpointPickerConfig
-+    plugins:
-+    - type: low-queue-filter
-+      parameters:
-+        threshold: 128
-+    - type: lora-affinity-filter
-+      parameters:
-+        threshold: 0.999
-+    - type: least-queue-filter
-+    - type: least-kv-cache-filter
-+    - type: decision-tree-filter
-+      name: low-latency-filter
-+      parameters:
-+        current:
-+          pluginRef: low-queue-filter
-+        nextOnSuccess:
-+          decisionTree:
-+            current:
-+              pluginRef: lora-affinity-filter
-+            nextOnSuccessOrFailure:
-+              decisionTree:
-+                current:
-+                  pluginRef: least-queue-filter
-+                nextOnSuccessOrFailure:
-+                  decisionTree:
-+                    current:
-+                      pluginRef: least-kv-cache-filter
-+        nextOnFailure:
-+          decisionTree:
-+            current:
-+              pluginRef: least-queue-filter
-+            nextOnSuccessOrFailure:
-+              decisionTree:
-+                current:
-+                  pluginRef: lora-affinity-filter
-+                nextOnSuccessOrFailure:
-+                  decisionTree:
-+                    current:
-+                      pluginRef: least-kv-cache-filter
-+    - type: random-picker
-+      parameters:
-+        maxNumOfEndpoints: 1
-+    - type: single-profile-handler
-+    schedulingProfiles:
-+    - name: default
-+      plugins:
-+      - pluginRef: low-latency-filter
-+      - pluginRef: random-picker
-+  plugins-v2.yaml: |
-+    apiVersion: inference.networking.x-k8s.io/v1alpha1
-+    kind: EndpointPickerConfig
-+    plugins:
-+    - type: queue-scorer
-+    - type: kv-cache-scorer
-+    - type: prefix-cache-scorer
-+      parameters:
-+        hashBlockSize: 64
-+        maxPrefixBlocksToMatch: 256
-+        lruCapacityPerServer: 31250
-+    - type: max-score-picker
-+      parameters:
-+        maxNumOfEndpoints: 1
-+    - type: single-profile-handler
-+    schedulingProfiles:
-+    - name: default
-+      plugins:
-+      - pluginRef: queue-scorer
-+        weight: 1
-+      - pluginRef: kv-cache-scorer
-+        weight: 1
-+      - pluginRef: prefix-cache-scorer
-+        weight: 1
-+      - pluginRef: max-score-picker
-+  {{- if (hasKey .Values.inferenceExtension "pluginsCustomConfig") }}
-+  {{- .Values.inferenceExtension.pluginsCustomConfig | toYaml | nindent 2 }}
-+  {{- end }}
-+  
-diff --git a/config/charts/inferencepool/templates/epp-deployment.yaml b/config/charts/inferencepool/templates/epp-deployment.yaml
-index fec91e4..7edc6a3 100644
---- a/config/charts/inferencepool/templates/epp-deployment.yaml
-+++ b/config/charts/inferencepool/templates/epp-deployment.yaml
-@@ -27,16 +27,21 @@ spec:
-         - {{ .Release.Name }}
-         - -poolNamespace
-         - {{ .Release.Namespace }}
--        - -v
--        - "3"
--        - -grpcPort
-+        - --v
-+        - "{{ .Values.inferenceExtension.logVerbosity | default "3" }}"
-+        - --grpcPort
-         - "9002"
-         - -grpcHealthPort
-         - "9003"
-         - -metricsPort
-         - "9090"
-+        - -configFile
-+        - "config/{{ .Values.inferenceExtension.pluginsConfigFile }}"
-         # https://pkg.go.dev/flag#hdr-Command_line_flag_syntax; space is only for non-bool flags
--        - "-enablePprof={{ .Values.inferenceExtension.enablePprof }}"
-+        - "--enablePprof={{ .Values.inferenceExtension.enablePprof }}"
-+        - "--modelServerMetricsPath={{ .Values.inferenceExtension.modelServerMetricsPath }}"
-+        - "--modelServerMetricsScheme={{ .Values.inferenceExtension.modelServerMetricsScheme }}"
-+        - "--modelServerMetricsHttpsInsecureSkipVerify={{ .Values.inferenceExtension.modelServerMetricsHttpsInsecureSkipVerify }}"
-         {{- if eq (.Values.inferencePool.modelServerType | default "vllm") "triton-tensorrt-llm" }}
-         - -totalQueuedRequestsMetric
-         - "nv_trt_llm_request_metrics{request_type=waiting}"
-@@ -52,6 +57,9 @@ spec:
-           containerPort: 9003
-         - name: metrics
-           containerPort: 9090
-+        {{- with .Values.inferenceExtension.extraContainerPorts }}
-+        {{- toYaml . | nindent 8 }}
-+        {{- end }}
-         livenessProbe:
-           grpc:
-             port: 9003
-@@ -64,8 +72,14 @@ spec:
-             service: inference-extension
-           initialDelaySeconds: 5
-           periodSeconds: 10
-+        {{- with .Values.inferenceExtension.env }}
-         env:
--        {{- range $key, $value := .Values.inferenceExtension.env }}
--        - name: {{ $key }}
--          value: {{ $value | quote }}
-+        {{- toYaml . | nindent 8 }}
-         {{- end }}
-+        volumeMounts:
-+        - name: plugins-config-volume
-+          mountPath: "/config"
-+      volumes:
-+      - name: plugins-config-volume
-+        configMap:
-+          name: {{ include "gateway-api-inference-extension.name" . }}
-diff --git a/config/charts/inferencepool/templates/epp-service.yaml b/config/charts/inferencepool/templates/epp-service.yaml
-index ed23db1..b1a48df 100644
---- a/config/charts/inferencepool/templates/epp-service.yaml
-+++ b/config/charts/inferencepool/templates/epp-service.yaml
-@@ -15,4 +15,7 @@ spec:
-     - name: http-metrics
-       protocol: TCP
-       port: {{ .Values.inferenceExtension.metricsPort | default 9090 }}
-+    {{- with .Values.inferenceExtension.extraServicePorts }}
-+    {{- toYaml . | nindent 4 }}
-+    {{- end }}
-   type: ClusterIP
-diff --git a/config/charts/inferencepool/values.yaml b/config/charts/inferencepool/values.yaml
-index 2b4e800..1541863 100644
---- a/config/charts/inferencepool/values.yaml
-+++ b/config/charts/inferencepool/values.yaml
-@@ -2,16 +2,44 @@ inferenceExtension:
-   replicas: 1
-   image:
-     name: epp
--    hub: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension
--    tag: main
-+    hub: registry.k8s.io/gateway-api-inference-extension
-+    tag: v0.5.1
-     pullPolicy: Always
-   extProcPort: 9002
--  env: {}
-+  env: []
-   enablePprof: true # Enable pprof handlers for profiling and debugging
-+  modelServerMetricsPath: "/metrics"
-+  modelServerMetricsScheme: "http"
-+  modelServerMetricsHttpsInsecureSkipVerify: true
-+  # This is the plugins configuration file. 
-+  pluginsConfigFile: "default-plugins.yaml"
-+  # pluginsCustomConfig:
-+  #   custom-plugins.yaml: |
-+  #     apiVersion: inference.networking.x-k8s.io/v1alpha1
-+  #     kind: EndpointPickerConfig
-+  #     plugins:
-+  #     - type: custom-scorer
-+  #       parameters:
-+  #         custom-threshold: 64
-+  #     - type: max-score-picker
-+  #     - type: single-profile-handler
-+  #     schedulingProfiles:
-+  #     - name: default
-+  #       plugins:
-+  #       - pluginRef: custom-scorer
-+  #         weight: 1
-+  #       - pluginRef: max-score-picker
-+  #         weight: 1
-+
-   # Example environment variables:
-   # env:
-   #   KV_CACHE_SCORE_WEIGHT: "1"
- 
-+  # Define additional container ports
-+  extraContainerPorts: []
-+  # Define additional service ports
-+  extraServicePorts: []
-+
- inferencePool:
-   targetPortNumber: 8000
-   modelServerType: vllm # vllm, triton-tensorrt-llm
-diff --git a/config/manifests/inferencepool-resources.yaml b/config/manifests/inferencepool-resources.yaml
-index 9bb3ea1..cbe3885 100644
---- a/config/manifests/inferencepool-resources.yaml
-+++ b/config/manifests/inferencepool-resources.yaml
-@@ -1,6 +1,8 @@
--# Note: If you change this file, please also change the file used for e2e tests!
--# 
--# https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/test/testdata/inferencepool-e2e.yaml
-+# Note: If you change this file, please also change:
-+#  - ./test/testdata/inferencepool-e2e.yaml
-+#  - ./conformance/resources/manifests/manifests.yaml
-+#  - ./site-src/guides/inferencepool-rollout.md
-+---
- apiVersion: inference.networking.x-k8s.io/v1alpha2
- kind: InferencePool
- metadata:
-@@ -48,8 +50,8 @@ spec:
-       terminationGracePeriodSeconds: 130
-       containers:
-       - name: epp
--        image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main
--        imagePullPolicy: Always
-+        image: registry.k8s.io/gateway-api-inference-extension/epp:v0.5.1
-+        imagePullPolicy: IfNotPresent
-         args:
-         - -poolName
-         - "vllm-llama3-8b-instruct"
-diff --git a/config/manifests/vllm/cpu-deployment.yaml b/config/manifests/vllm/cpu-deployment.yaml
-index 485d44a..376b0f1 100644
---- a/config/manifests/vllm/cpu-deployment.yaml
-+++ b/config/manifests/vllm/cpu-deployment.yaml
-@@ -14,8 +14,8 @@ spec:
-     spec:
-       containers:
-         - name: lora
--          image: "public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v0.9.1" # formal images can be found in https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo
--          imagePullPolicy: Always
-+          image: "public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v0.9.2" # formal images can be found in https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo
-+          imagePullPolicy: IfNotPresent
-           command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
-           args:
-           - "--model"
-diff --git a/config/manifests/vllm/gpu-deployment.yaml b/config/manifests/vllm/gpu-deployment.yaml
-index 16f9388..5664df0 100644
---- a/config/manifests/vllm/gpu-deployment.yaml
-+++ b/config/manifests/vllm/gpu-deployment.yaml
-@@ -14,8 +14,8 @@ spec:
-     spec:
-       containers:
-         - name: vllm
--          image: "vllm/vllm-openai:latest"
--          imagePullPolicy: Always
-+          image: "vllm/vllm-openai:v0.9.2"
-+          imagePullPolicy: IfNotPresent
-           command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
-           args:
-           - "--model"
-diff --git a/config/manifests/vllm/sim-deployment.yaml b/config/manifests/vllm/sim-deployment.yaml
-index 196fe86..7021db9 100644
---- a/config/manifests/vllm/sim-deployment.yaml
-+++ b/config/manifests/vllm/sim-deployment.yaml
-@@ -15,7 +15,7 @@ spec:
-       containers:
-       - name: vllm-sim
-         image: ghcr.io/llm-d/llm-d-inference-sim:v0.1.2
--        imagePullPolicy: Always
-+        imagePullPolicy: IfNotPresent
-         args:
-         - --model
-         - meta-llama/Llama-3.1-8B-Instruct
-diff --git a/conformance/resources/manifests/manifests.yaml b/conformance/resources/manifests/manifests.yaml
-index 5fbcfdc..d1341c4 100644
---- a/conformance/resources/manifests/manifests.yaml
-+++ b/conformance/resources/manifests/manifests.yaml
-@@ -196,8 +196,8 @@ spec:
-       terminationGracePeriodSeconds: 130
-       containers:
-       - name: epp
--        image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main
--        imagePullPolicy: Always
-+        image: registry.k8s.io/gateway-api-inference-extension/epp:v0.5.1
-+        imagePullPolicy: IfNotPresent
-         args:
-         - -poolName
-         - "primary-inference-pool"
-@@ -293,8 +293,8 @@ spec:
-       terminationGracePeriodSeconds: 130
-       containers:
-       - name: epp
--        image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main
--        imagePullPolicy: Always
-+        image: registry.k8s.io/gateway-api-inference-extension/epp:v0.5.1
-+        imagePullPolicy: IfNotPresent
-         args:
-         - -poolName
-         - "secondary-inference-pool"
-@@ -342,7 +342,7 @@ apiVersion: v1
- kind: ConfigMap
- metadata:
-   name: plugins-config
--  namespace: default
-+  namespace: gateway-conformance-app-backend
- data:
-   conformance-plugins.yaml: |
-     apiVersion: inference.networking.x-k8s.io/v1alpha1
 diff --git a/pkg/bbr/handlers/request.go b/pkg/bbr/handlers/request.go
 index 32fffc0..1aa1b85 100644
 --- a/pkg/bbr/handlers/request.go
@@ -806,71 +303,6 @@ index a580380..eb2893f 100644
  }
  
  func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error {
-diff --git a/pkg/epp/backend/metrics/metrics.go b/pkg/epp/backend/metrics/metrics.go
-index 590685c..6a50faa 100644
---- a/pkg/epp/backend/metrics/metrics.go
-+++ b/pkg/epp/backend/metrics/metrics.go
-@@ -37,9 +37,12 @@ const (
- )
- 
- type PodMetricsClientImpl struct {
--	MetricMapping          *MetricMapping
--	ModelServerMetricsPort int32
--	ModelServerMetricsPath string
-+	MetricMapping            *MetricMapping
-+	ModelServerMetricsPort   int32
-+	ModelServerMetricsPath   string
-+	ModelServerMetricsScheme string
-+
-+	Client *http.Client
- }
- 
- // FetchMetrics fetches metrics from a given pod, clones the existing metrics object and returns an updated one.
-@@ -49,7 +52,7 @@ func (p *PodMetricsClientImpl) FetchMetrics(ctx context.Context, pod *backend.Po
- 	if err != nil {
- 		return nil, fmt.Errorf("failed to create request: %v", err)
- 	}
--	resp, err := http.DefaultClient.Do(req)
-+	resp, err := p.Client.Do(req)
- 	if err != nil {
- 		return nil, fmt.Errorf("failed to fetch metrics from %s: %w", pod.NamespacedName, err)
- 	}
-@@ -73,7 +76,7 @@ func (p *PodMetricsClientImpl) getMetricEndpoint(pod *backend.Pod, targetPortNum
- 	if p.ModelServerMetricsPort == 0 {
- 		p.ModelServerMetricsPort = targetPortNumber
- 	}
--	return fmt.Sprintf("http://%s:%d%s", pod.Address, p.ModelServerMetricsPort, p.ModelServerMetricsPath)
-+	return fmt.Sprintf("%s://%s:%d%s", p.ModelServerMetricsScheme, pod.Address, p.ModelServerMetricsPort, p.ModelServerMetricsPath)
- }
- 
- // promToPodMetrics updates internal pod metrics with scraped Prometheus metrics.
-diff --git a/pkg/epp/backend/metrics/metrics_test.go b/pkg/epp/backend/metrics/metrics_test.go
-index 9f7c2b8..2dd8ca5 100644
---- a/pkg/epp/backend/metrics/metrics_test.go
-+++ b/pkg/epp/backend/metrics/metrics_test.go
-@@ -19,6 +19,7 @@ package metrics
- import (
- 	"context"
- 	"errors"
-+	"net/http"
- 	"reflect"
- 	"strconv"
- 	"strings"
-@@ -495,7 +496,13 @@ func TestFetchMetrics(t *testing.T) {
- 		},
- 	}
- 	existing := &MetricsState{}
--	p := &PodMetricsClientImpl{ModelServerMetricsPort: 9999, ModelServerMetricsPath: "/metrics"} // No MetricMapping needed for this basic test
-+	// No MetricMapping needed for this basic test
-+	p := &PodMetricsClientImpl{
-+		ModelServerMetricsScheme: "http",
-+		ModelServerMetricsPort:   9999,
-+		ModelServerMetricsPath:   "/metrics",
-+		Client:                   http.DefaultClient,
-+	}
- 
- 	_, err := p.FetchMetrics(ctx, pod, existing, 9999) // Use a port that's unlikely to be in use
- 	if err == nil {
 diff --git a/pkg/epp/requestcontrol/plugins/dynamo_inject_workerid/plugin.go b/pkg/epp/requestcontrol/plugins/dynamo_inject_workerid/plugin.go
 new file mode 100644
 index 0000000..b6708fa
@@ -946,186 +378,6 @@ index 0000000..b6708fa
 +	}
 +
 +}
-diff --git a/pkg/epp/scheduling/framework/plugins/multi/prefix/indexer.go b/pkg/epp/scheduling/framework/plugins/multi/prefix/indexer.go
-index 716c9f2..1b75f0f 100644
---- a/pkg/epp/scheduling/framework/plugins/multi/prefix/indexer.go
-+++ b/pkg/epp/scheduling/framework/plugins/multi/prefix/indexer.go
-@@ -85,13 +85,14 @@ func (i *indexer) Get(hash BlockHash) podSet {
- 	i.mu.RLock()
- 	defer i.mu.RUnlock()
- 
--	res := podSet{}
--	pods, ok := i.hashToPods[hash]
--	if !ok {
--		return res
-+	pods := i.hashToPods[hash]
-+	res := make(podSet, len(pods))
-+	for pod := range pods {
-+		// Deep copy to avoid race condition.
-+		res[pod] = struct{}{}
- 	}
- 
--	return pods
-+	return res
- }
- 
- // makeEvictionFn returns a per-pod LRU eviction callback that removes the pod from hashToPods on eviction.
-diff --git a/pkg/epp/scheduling/framework/plugins/multi/prefix/indexer_test.go b/pkg/epp/scheduling/framework/plugins/multi/prefix/indexer_test.go
-index 2409850..a151121 100644
---- a/pkg/epp/scheduling/framework/plugins/multi/prefix/indexer_test.go
-+++ b/pkg/epp/scheduling/framework/plugins/multi/prefix/indexer_test.go
-@@ -41,4 +41,7 @@ func TestIndexer_AddAndGet(t *testing.T) {
- 	// Add another entry to the cache, which should evict the first one due to max size.
- 	i.Add([]BlockHash{BlockHash(3)}, server)
- 	assert.Equal(t, 2, i.podToLRU[server].Len(), "Cache size should still be 2 after adding an entry")
-+
-+	servers = i.Get(BlockHash(4))
-+	assert.Empty(t, servers, "Cache should not contain non-existent hash")
- }
-diff --git a/pkg/epp/scheduling/framework/plugins/picker/max_score_picker.go b/pkg/epp/scheduling/framework/plugins/picker/max_score_picker.go
-index bf3ca8d..e7ee333 100644
---- a/pkg/epp/scheduling/framework/plugins/picker/max_score_picker.go
-+++ b/pkg/epp/scheduling/framework/plugins/picker/max_score_picker.go
-@@ -20,7 +20,9 @@ import (
- 	"context"
- 	"encoding/json"
- 	"fmt"
-+	"math/rand"
- 	"slices"
-+	"time"
- 
- 	"sigs.k8s.io/controller-runtime/pkg/log"
- 
-@@ -58,13 +60,15 @@ func NewMaxScorePicker(maxNumOfEndpoints int) *MaxScorePicker {
- 	return &MaxScorePicker{
- 		typedName:         plugins.TypedName{Type: MaxScorePickerType, Name: MaxScorePickerType},
- 		maxNumOfEndpoints: maxNumOfEndpoints,
-+		randomGenerator:   rand.New(rand.NewSource(time.Now().UnixNano())),
- 	}
- }
- 
- // MaxScorePicker picks pod(s) with the maximum score from the list of candidates.
- type MaxScorePicker struct {
- 	typedName         plugins.TypedName
--	maxNumOfEndpoints int // maximum number of endpoints to pick
-+	maxNumOfEndpoints int        // maximum number of endpoints to pick
-+	randomGenerator   *rand.Rand // randomGenerator for randomly pick endpoint on tie-break
- }
- 
- // WithName sets the picker's name
-@@ -83,6 +87,11 @@ func (p *MaxScorePicker) Pick(ctx context.Context, cycleState *types.CycleState,
- 	log.FromContext(ctx).V(logutil.DEBUG).Info(fmt.Sprintf("Selecting maximum '%d' pods from %d candidates sorted by max score: %+v", p.maxNumOfEndpoints,
- 		len(scoredPods), scoredPods))
- 
-+	// Shuffle in-place - needed for random tie break when scores are equal
-+	p.randomGenerator.Shuffle(len(scoredPods), func(i, j int) {
-+		scoredPods[i], scoredPods[j] = scoredPods[j], scoredPods[i]
-+	})
-+
- 	slices.SortStableFunc(scoredPods, func(i, j *types.ScoredPod) int { // highest score first
- 		if i.Score > j.Score {
- 			return -1
-diff --git a/pkg/epp/scheduling/framework/plugins/picker/picker_test.go b/pkg/epp/scheduling/framework/plugins/picker/picker_test.go
-index 2089ed3..2c3aceb 100644
---- a/pkg/epp/scheduling/framework/plugins/picker/picker_test.go
-+++ b/pkg/epp/scheduling/framework/plugins/picker/picker_test.go
-@@ -21,6 +21,7 @@ import (
- 	"testing"
- 
- 	"github.com/google/go-cmp/cmp"
-+	"github.com/google/go-cmp/cmp/cmpopts"
- 	k8stypes "k8s.io/apimachinery/pkg/types"
- 
- 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend"
-@@ -34,10 +35,11 @@ func TestPickMaxScorePicker(t *testing.T) {
- 	pod3 := &types.PodMetrics{Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod3"}}}
- 
- 	tests := []struct {
--		name   string
--		picker framework.Picker
--		input  []*types.ScoredPod
--		output []types.Pod
-+		name               string
-+		picker             framework.Picker
-+		input              []*types.ScoredPod
-+		output             []types.Pod
-+		tieBreakCandidates int // tie break is random, specify how many candidate with max score
- 	}{
- 		{
- 			name:   "Single max score",
-@@ -63,6 +65,7 @@ func TestPickMaxScorePicker(t *testing.T) {
- 				&types.ScoredPod{Pod: pod1, Score: 50},
- 				&types.ScoredPod{Pod: pod2, Score: 50},
- 			},
-+			tieBreakCandidates: 2,
- 		},
- 		{
- 			name:   "Multiple results sorted by highest score, more pods than needed",
-@@ -104,6 +107,7 @@ func TestPickMaxScorePicker(t *testing.T) {
- 				&types.ScoredPod{Pod: pod3, Score: 30},
- 				&types.ScoredPod{Pod: pod2, Score: 25},
- 			},
-+			tieBreakCandidates: 2,
- 		},
- 	}
- 
-@@ -112,6 +116,19 @@ func TestPickMaxScorePicker(t *testing.T) {
- 			result := test.picker.Pick(context.Background(), types.NewCycleState(), test.input)
- 			got := result.TargetPods
- 
-+			if test.tieBreakCandidates > 0 {
-+				testMaxScoredPods := test.output[:test.tieBreakCandidates]
-+				gotMaxScoredPods := got[:test.tieBreakCandidates]
-+				diff := cmp.Diff(testMaxScoredPods, gotMaxScoredPods, cmpopts.SortSlices(func(a, b types.Pod) bool {
-+					return a.String() < b.String() // predictable order within the pods with equal scores
-+				}))
-+				if diff != "" {
-+					t.Errorf("Unexpected output (-want +got): %v", diff)
-+				}
-+				test.output = test.output[test.tieBreakCandidates:]
-+				got = got[test.tieBreakCandidates:]
-+			}
-+
- 			if diff := cmp.Diff(test.output, got); diff != "" {
- 				t.Errorf("Unexpected output (-want +got): %v", diff)
- 			}
-diff --git a/pkg/epp/scheduling/framework/plugins/picker/random_picker.go b/pkg/epp/scheduling/framework/plugins/picker/random_picker.go
-index bb272f1..eb62c37 100644
---- a/pkg/epp/scheduling/framework/plugins/picker/random_picker.go
-+++ b/pkg/epp/scheduling/framework/plugins/picker/random_picker.go
-@@ -21,6 +21,7 @@ import (
- 	"encoding/json"
- 	"fmt"
- 	"math/rand"
-+	"time"
- 
- 	"sigs.k8s.io/controller-runtime/pkg/log"
- 
-@@ -57,6 +58,7 @@ func NewRandomPicker(maxNumOfEndpoints int) *RandomPicker {
- 	return &RandomPicker{
- 		typedName:         plugins.TypedName{Type: RandomPickerType, Name: RandomPickerType},
- 		maxNumOfEndpoints: maxNumOfEndpoints,
-+		randomGenerator:   rand.New(rand.NewSource(time.Now().UnixNano())),
- 	}
- }
- 
-@@ -64,6 +66,7 @@ func NewRandomPicker(maxNumOfEndpoints int) *RandomPicker {
- type RandomPicker struct {
- 	typedName         plugins.TypedName
- 	maxNumOfEndpoints int
-+	randomGenerator   *rand.Rand // randomGenerator for randomly pick endpoint on tie-break
- }
- 
- // WithName sets the name of the picker.
-@@ -83,7 +86,7 @@ func (p *RandomPicker) Pick(ctx context.Context, _ *types.CycleState, scoredPods
- 		len(scoredPods), scoredPods))
- 
- 	// Shuffle in-place
--	rand.Shuffle(len(scoredPods), func(i, j int) {
-+	p.randomGenerator.Shuffle(len(scoredPods), func(i, j int) {
- 		scoredPods[i], scoredPods[j] = scoredPods[j], scoredPods[i]
- 	})
- 
 diff --git a/pkg/epp/scheduling/plugins/dynamo_kv_scorer/epp-config-dynamo.yaml b/pkg/epp/scheduling/plugins/dynamo_kv_scorer/epp-config-dynamo.yaml
 new file mode 100644
 index 0000000..b689c00
@@ -1587,194 +839,3 @@ index 0000000..1f6a41f
 +	}
 +	return nil
 +}
-diff --git a/site-src/guides/inferencepool-rollout.md b/site-src/guides/inferencepool-rollout.md
-index 89a384a..809fb7f 100644
---- a/site-src/guides/inferencepool-rollout.md
-+++ b/site-src/guides/inferencepool-rollout.md
-@@ -177,7 +177,6 @@ spec:
-       terminationGracePeriodSeconds: 130
-       nodeSelector:
-         cloud.google.com/gke-accelerator: "nvidia-h100-80gb"
--
-       volumes:
-         - name: data
-           emptyDir: {}
-@@ -250,40 +249,133 @@ spec:
-     spec:
-       terminationGracePeriodSeconds: 130
-       containers:
--        - name: epp
--          image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main
--          imagePullPolicy: Always
--          args:
--            - -poolName
--            - "vllm-llama3-8b-instruct-new"
--            - "-poolNamespace"
--            - "default"
--            - -v
--            - "4"
--            - --zap-encoder
--            - "json"
--            - -grpcPort
--            - "9002"
--            - -grpcHealthPort
--            - "9003"
--          ports:
--            - containerPort: 9002
--            - containerPort: 9003
--            - name: metrics
--              containerPort: 9090
--          livenessProbe:
--            grpc:
--              port: 9003
--              service: inference-extension
--            initialDelaySeconds: 5
--            periodSeconds: 10
--          readinessProbe:
--            grpc:
--              port: 9003
--              service: inference-extension
--            initialDelaySeconds: 5
--            periodSeconds: 10
--  EOF
-+      - name: epp
-+        image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main
-+        imagePullPolicy: Always
-+        args:
-+        - -poolName
-+        - "vllm-llama3-8b-instruct-new"
-+        - -poolNamespace
-+        - "default"
-+        - -v
-+        - "4"
-+        - --zap-encoder
-+        - "json"
-+        - -grpcPort
-+        - "9002"
-+        - -grpcHealthPort
-+        - "9003"
-+        - -configFile
-+        - "/config/default-plugins.yaml"
-+        ports:
-+        - containerPort: 9002
-+          name: grpc
-+        - containerPort: 9003
-+          name: grpc-health
-+        - containerPort: 9090
-+          name: metrics
-+        livenessProbe:
-+          grpc:
-+            port: 9003
-+            service: inference-extension
-+          initialDelaySeconds: 5
-+          periodSeconds: 10
-+        readinessProbe:
-+          grpc:
-+            port: 9003
-+            service: inference-extension
-+          initialDelaySeconds: 5
-+          periodSeconds: 10
-+        volumeMounts:
-+        - name: plugins-config-volume
-+          mountPath: /config
-+      volumes:
-+      - name: plugins-config-volume
-+        configMap:
-+          name: plugins-config
-+---
-+apiVersion: v1
-+kind: ConfigMap
-+metadata:
-+  name: plugins-config
-+  namespace: default
-+data:
-+  default-plugins.yaml: |
-+    apiVersion: inference.networking.x-k8s.io/v1alpha1
-+    kind: EndpointPickerConfig
-+    plugins:
-+    - type: low-queue-filter
-+      parameters:
-+        threshold: 128
-+    - type: lora-affinity-filter
-+      parameters:
-+        threshold: 0.999
-+    - type: least-queue-filter
-+    - type: least-kv-cache-filter
-+    - type: decision-tree-filter
-+      name: low-latency-filter
-+      parameters:
-+        current:
-+          pluginRef: low-queue-filter
-+        nextOnSuccess:
-+          decisionTree:
-+            current:
-+              pluginRef: lora-affinity-filter
-+            nextOnSuccessOrFailure:
-+              decisionTree:
-+                current:
-+                  pluginRef: least-queue-filter
-+                nextOnSuccessOrFailure:
-+                  decisionTree:
-+                    current:
-+                      pluginRef: least-kv-cache-filter
-+        nextOnFailure:
-+          decisionTree:
-+            current:
-+              pluginRef: least-queue-filter
-+            nextOnSuccessOrFailure:
-+              decisionTree:
-+                current:
-+                  pluginRef: lora-affinity-filter
-+                nextOnSuccessOrFailure:
-+                  decisionTree:
-+                    current:
-+                      pluginRef: least-kv-cache-filter
-+    - type: random-picker
-+      parameters:
-+        maxNumOfEndpoints: 1
-+    - type: single-profile-handler
-+    schedulingProfiles:
-+    - name: default
-+      plugins:
-+      - pluginRef: low-latency-filter
-+      - pluginRef: random-picker
-+  plugins-v2.yaml: |
-+    apiVersion: inference.networking.x-k8s.io/v1alpha1
-+    kind: EndpointPickerConfig
-+    plugins:
-+    - type: queue-scorer
-+    - type: kv-cache-scorer
-+    - type: prefix-cache-scorer
-+      parameters:
-+        hashBlockSize: 64
-+        maxPrefixBlocksToMatch: 256
-+        lruCapacityPerServer: 31250
-+    - type: max-score-picker
-+      parameters:
-+        maxNumOfEndpoints: 1
-+    - type: single-profile-handler
-+    schedulingProfiles:
-+    - name: default
-+      plugins:
-+      - pluginRef: queue-scorer
-+        weight: 1
-+      - pluginRef: kv-cache-scorer
-+        weight: 1
-+      - pluginRef: prefix-cache-scorer
-+        weight: 1
-+      - pluginRef: max-score-picker
-+EOF
- ```
- 
- ### Direct traffic to the new inference pool
-diff --git a/version/version.go b/version/version.go
-index 1da42f2..1372ba8 100644
---- a/version/version.go
-+++ b/version/version.go
-@@ -18,5 +18,5 @@ package version
- 
- const (
- 	// BundleVersion is the value used for labeling the version of the gateway-api-inference-extension.
--	BundleVersion = "v0.4.0-dev"
-+	BundleVersion = "v0.5.1"
- )