From d42f2fd19a9922b1fd031b8c5799fbf8df6409d4 Mon Sep 17 00:00:00 2001 From: Xiyue Yu Date: Tue, 20 Jan 2026 11:41:30 -0800 Subject: [PATCH 1/9] added latency predictor coverage for inferencepool and added coverage for epp standalone mode --- hack/verify-helm.sh | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/hack/verify-helm.sh b/hack/verify-helm.sh index e9bb8dd09c..69ddef5a4f 100755 --- a/hack/verify-helm.sh +++ b/hack/verify-helm.sh @@ -22,6 +22,7 @@ declare -A test_cases_inference_pool test_cases_inference_pool["basic"]="--set inferencePool.modelServers.matchLabels.app=llm-instance-gateway" test_cases_inference_pool["gke-provider"]="--set provider.name=gke --set inferencePool.modelServers.matchLabels.app=llm-instance-gateway" test_cases_inference_pool["multiple-replicas"]="--set inferencePool.replicas=3 --set inferencePool.modelServers.matchLabels.app=llm-instance-gateway" +test_cases_inference_pool["latency-predictor"]="--set inferenceExtension.latencyPredictor.enabled=true" # Run the install command in case this script runs from a different bash # source (such as in the verify-all script) @@ -46,5 +47,30 @@ for key in "${!test_cases_inference_pool[@]}"; do fi done +declare -A test_cases_epp_standalone +# InferencePool Helm Chart test cases +test_cases_epp_standalone["basic"]="--set inferencePool.modelServers.matchLabels.app=llm-instance-gateway" +test_cases_epp_standalone["gke-provider"]="--set provider.name=gke --set inferencePool.modelServers.matchLabels.app=llm-instance-gateway" +test_cases_epp_standalone["latency-predictor"]="--set inferenceExtension.latencyPredictor.enabled=true" + + +echo "Building dependencies for epp-standalone chart..." +${SCRIPT_ROOT}/bin/helm dependency build ${SCRIPT_ROOT}/config/charts/epp-standalone +if [ $? -ne 0 ]; then + echo "Helm dependency build failed." + exit 1 +fi + +# Running tests cases +echo "Running helm template command for epp-standalone chart..." 
+# Loop through the keys of the associative array +for key in "${!test_cases_epp_standalone[@]}"; do + echo "Running test: $key" + ${SCRIPT_ROOT}/bin/helm template ${SCRIPT_ROOT}/config/charts/epp-standalone ${test_cases_epp_standalone[$key]} --output-dir="${SCRIPT_ROOT}/bin" + if [ $? -ne 0 ]; then + echo "Helm template command failed for test: $key" + exit 1 + fi +done From f45b208a8b94315495e5cf88e3a277a7f4f1afb0 Mon Sep 17 00:00:00 2001 From: Xiyue Yu Date: Tue, 20 Jan 2026 11:58:52 -0800 Subject: [PATCH 2/9] fixed CI/CD --- hack/verify-helm.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/hack/verify-helm.sh b/hack/verify-helm.sh index 69ddef5a4f..ca2f1ad147 100755 --- a/hack/verify-helm.sh +++ b/hack/verify-helm.sh @@ -22,7 +22,7 @@ declare -A test_cases_inference_pool test_cases_inference_pool["basic"]="--set inferencePool.modelServers.matchLabels.app=llm-instance-gateway" test_cases_inference_pool["gke-provider"]="--set provider.name=gke --set inferencePool.modelServers.matchLabels.app=llm-instance-gateway" test_cases_inference_pool["multiple-replicas"]="--set inferencePool.replicas=3 --set inferencePool.modelServers.matchLabels.app=llm-instance-gateway" -test_cases_inference_pool["latency-predictor"]="--set inferenceExtension.latencyPredictor.enabled=true" +test_cases_inference_pool["latency-predictor"]="--set inferenceExtension.latencyPredictor.enabled=true --set inferencePool.modelServers.matchLabels.app=llm-instance-gateway" # Run the install command in case this script runs from a different bash # source (such as in the verify-all script) @@ -50,9 +50,9 @@ done declare -A test_cases_epp_standalone # InferencePool Helm Chart test cases -test_cases_epp_standalone["basic"]="--set inferencePool.modelServers.matchLabels.app=llm-instance-gateway" -test_cases_epp_standalone["gke-provider"]="--set provider.name=gke --set inferencePool.modelServers.matchLabels.app=llm-instance-gateway" 
-test_cases_epp_standalone["latency-predictor"]="--set inferenceExtension.latencyPredictor.enabled=true" +test_cases_epp_standalone["basic"]="--set inferenceExtension.endpointsServer.endpointSelector='app=llm-instance-gateway'" +test_cases_epp_standalone["gke-provider"]="--set provider.name=gke --set inferenceExtension.endpointsServer.endpointSelector='app=llm-instance-gateway'" +test_cases_epp_standalone["latency-predictor"]="--set inferenceExtension.latencyPredictor.enabled=true --set inferenceExtension.endpointsServer.endpointSelector='app=llm-instance-gateway'" echo "Building dependencies for epp-standalone chart..." From 4cf13ac5a99af794242c4074ddca1ae60e554e04 Mon Sep 17 00:00:00 2001 From: Xiyue Yu Date: Tue, 20 Jan 2026 13:02:27 -0800 Subject: [PATCH 3/9] added existence check --- config/charts/inference-extension/templates/_gke.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/charts/inference-extension/templates/_gke.yaml b/config/charts/inference-extension/templates/_gke.yaml index 9a19e7597a..506f66e35a 100644 --- a/config/charts/inference-extension/templates/_gke.yaml +++ b/config/charts/inference-extension/templates/_gke.yaml @@ -1,5 +1,5 @@ {{- define "inference-extension.gke" -}} -{{- if eq (lower .Values.provider.name) "gke" }} +{{- if and .Values.provider (eq (lower .Values.provider.name) "gke") }} {{- if and .Values.inferenceExtension.monitoring.prometheus.enabled .Values.inferenceExtension.monitoring.prometheus.auth.enabled }} {{- $metricsReadSA := printf "%s-metrics-reader-sa" .Release.Name -}} {{- $metricsReadSecretName := printf "%s-metrics-reader-secret" .Release.Name -}} From ac0e7cbaca96b0229f8dc5d7184a33a6622ea191 Mon Sep 17 00:00:00 2001 From: Xiyue Yu Date: Tue, 20 Jan 2026 13:04:07 -0800 Subject: [PATCH 4/9] added existence check --- config/charts/epp-standalone/values.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/config/charts/epp-standalone/values.yaml 
b/config/charts/epp-standalone/values.yaml index 3cb10dd5ec..4bdc52e97a 100644 --- a/config/charts/epp-standalone/values.yaml +++ b/config/charts/epp-standalone/values.yaml @@ -296,3 +296,13 @@ inferenceExtension: latencyPredictor: enabled: false + +# Options: ["gke"] +provider: + name: none + + # GKE-specific configuration. + # This block is only used if name is "gke". + gke: + # Set to true if the cluster is an Autopilot cluster. + autopilot: false \ No newline at end of file From 65712815c03313cc9e0e90bc6dc8e3ef29a7ed04 Mon Sep 17 00:00:00 2001 From: Xiyue Yu Date: Tue, 20 Jan 2026 13:25:37 -0800 Subject: [PATCH 5/9] added latency predictor value --- config/charts/epp-standalone/values.yaml | 83 +++++++++++++++++++++++- 1 file changed, 82 insertions(+), 1 deletion(-) diff --git a/config/charts/epp-standalone/values.yaml b/config/charts/epp-standalone/values.yaml index 4bdc52e97a..5b8cee153f 100644 --- a/config/charts/epp-standalone/values.yaml +++ b/config/charts/epp-standalone/values.yaml @@ -297,6 +297,86 @@ inferenceExtension: latencyPredictor: enabled: false + # Training Server Configuration + trainingServer: + image: + hub: path/to/your/docker/repo # NOTE: Update with your Docker repository path for sidecars + name: latencypredictor-training-server + tag: latest + pullPolicy: Always + port: 8000 + resources: + requests: + cpu: "2000m" + memory: "4Gi" + limits: + cpu: "4000m" + memory: "8Gi" + livenessProbe: + httpGet: + path: /healthz + port: 8000 + initialDelaySeconds: 30 + periodSeconds: 20 + readinessProbe: + httpGet: + path: /readyz + port: 8000 + initialDelaySeconds: 45 + periodSeconds: 10 + volumeSize: "20Gi" + config: + LATENCY_RETRAINING_INTERVAL_SEC: "1" + LATENCY_MIN_SAMPLES_FOR_RETRAIN: "100" + LATENCY_TTFT_MODEL_PATH: "/models/ttft.joblib" + LATENCY_TPOT_MODEL_PATH: "/models/tpot.joblib" + LATENCY_TTFT_SCALER_PATH: "/models/ttft_scaler.joblib" + LATENCY_TPOT_SCALER_PATH: "/models/tpot_scaler.joblib" + LATENCY_MODEL_TYPE: "xgboost" + 
LATENCY_MAX_TRAINING_DATA_SIZE_PER_BUCKET: "5000" + LATENCY_QUANTILE_ALPHA: "0.9" + + # Prediction Server Configuration + predictionServers: + count: 10 + startPort: 8001 + image: + hub: path/to/your/docker/repo # NOTE: Update with your Docker repository path for sidecars + name: latencypredictor-prediction-server + tag: latest + pullPolicy: Always + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "1000m" + memory: "2Gi" + livenessProbe: + httpGet: + path: /healthz + initialDelaySeconds: 15 + periodSeconds: 15 + readinessProbe: + httpGet: + path: /readyz + initialDelaySeconds: 10 + periodSeconds: 5 + failureThreshold: 10 + volumeSize: "10Gi" + config: + LATENCY_MODEL_TYPE: "xgboost" + PREDICT_HOST: "0.0.0.0" + LOCAL_TTFT_MODEL_PATH: "/server_models/ttft.joblib" + LOCAL_TPOT_MODEL_PATH: "/server_models/tpot.joblib" + LOCAL_TTFT_SCALER_PATH: "/server_models/ttft_scaler.joblib" + LOCAL_TPOT_SCALER_PATH: "/server_models/tpot_scaler.joblib" + + # EPP Environment Variables for Latency Predictor + eppEnv: + LATENCY_MAX_SAMPLE_SIZE: "10000" + + # Options: ["gke"] provider: name: none @@ -305,4 +385,5 @@ provider: # This block is only used if name is "gke". gke: # Set to true if the cluster is an Autopilot cluster. 
- autopilot: false \ No newline at end of file + autopilot: false + From b4207d54648fa4c94b80f6b7395e8d860a60b42c Mon Sep 17 00:00:00 2001 From: Xiyue Yu Date: Wed, 21 Jan 2026 10:26:44 -0800 Subject: [PATCH 6/9] added prediction value.yaml --- config/charts/epp-standalone/values.yaml | 80 ------------------ config/charts/inference-extension/values.yaml | 80 ++++++++++++++++++ config/charts/inferencepool/Chart.yaml | 1 + config/charts/inferencepool/values.yaml | 81 +------------------ 4 files changed, 82 insertions(+), 160 deletions(-) create mode 100644 config/charts/inference-extension/values.yaml diff --git a/config/charts/epp-standalone/values.yaml b/config/charts/epp-standalone/values.yaml index 5b8cee153f..7b3af858a2 100644 --- a/config/charts/epp-standalone/values.yaml +++ b/config/charts/epp-standalone/values.yaml @@ -297,86 +297,6 @@ inferenceExtension: latencyPredictor: enabled: false - # Training Server Configuration - trainingServer: - image: - hub: path/to/your/docker/repo # NOTE: Update with your Docker repository path for sidecars - name: latencypredictor-training-server - tag: latest - pullPolicy: Always - port: 8000 - resources: - requests: - cpu: "2000m" - memory: "4Gi" - limits: - cpu: "4000m" - memory: "8Gi" - livenessProbe: - httpGet: - path: /healthz - port: 8000 - initialDelaySeconds: 30 - periodSeconds: 20 - readinessProbe: - httpGet: - path: /readyz - port: 8000 - initialDelaySeconds: 45 - periodSeconds: 10 - volumeSize: "20Gi" - config: - LATENCY_RETRAINING_INTERVAL_SEC: "1" - LATENCY_MIN_SAMPLES_FOR_RETRAIN: "100" - LATENCY_TTFT_MODEL_PATH: "/models/ttft.joblib" - LATENCY_TPOT_MODEL_PATH: "/models/tpot.joblib" - LATENCY_TTFT_SCALER_PATH: "/models/ttft_scaler.joblib" - LATENCY_TPOT_SCALER_PATH: "/models/tpot_scaler.joblib" - LATENCY_MODEL_TYPE: "xgboost" - LATENCY_MAX_TRAINING_DATA_SIZE_PER_BUCKET: "5000" - LATENCY_QUANTILE_ALPHA: "0.9" - - # Prediction Server Configuration - predictionServers: - count: 10 - startPort: 8001 - image: - hub: 
path/to/your/docker/repo # NOTE: Update with your Docker repository path for sidecars - name: latencypredictor-prediction-server - tag: latest - pullPolicy: Always - resources: - requests: - cpu: "500m" - memory: "1Gi" - limits: - cpu: "1000m" - memory: "2Gi" - livenessProbe: - httpGet: - path: /healthz - initialDelaySeconds: 15 - periodSeconds: 15 - readinessProbe: - httpGet: - path: /readyz - initialDelaySeconds: 10 - periodSeconds: 5 - failureThreshold: 10 - volumeSize: "10Gi" - config: - LATENCY_MODEL_TYPE: "xgboost" - PREDICT_HOST: "0.0.0.0" - LOCAL_TTFT_MODEL_PATH: "/server_models/ttft.joblib" - LOCAL_TPOT_MODEL_PATH: "/server_models/tpot.joblib" - LOCAL_TTFT_SCALER_PATH: "/server_models/ttft_scaler.joblib" - LOCAL_TPOT_SCALER_PATH: "/server_models/tpot_scaler.joblib" - - # EPP Environment Variables for Latency Predictor - eppEnv: - LATENCY_MAX_SAMPLE_SIZE: "10000" - - # Options: ["gke"] provider: name: none diff --git a/config/charts/inference-extension/values.yaml b/config/charts/inference-extension/values.yaml new file mode 100644 index 0000000000..1803829ce7 --- /dev/null +++ b/config/charts/inference-extension/values.yaml @@ -0,0 +1,80 @@ +latencyPredictor: + enabled: false + # Training Server Configuration + trainingServer: + image: + hub: path/to/your/docker/repo # NOTE: Update with your Docker repository path for sidecars + name: latencypredictor-training-server + tag: latest + pullPolicy: Always + port: 8000 + resources: + requests: + cpu: "2000m" + memory: "4Gi" + limits: + cpu: "4000m" + memory: "8Gi" + livenessProbe: + httpGet: + path: /healthz + port: 8000 + initialDelaySeconds: 30 + periodSeconds: 20 + readinessProbe: + httpGet: + path: /readyz + port: 8000 + initialDelaySeconds: 45 + periodSeconds: 10 + volumeSize: "20Gi" + config: + LATENCY_RETRAINING_INTERVAL_SEC: "1" + LATENCY_MIN_SAMPLES_FOR_RETRAIN: "100" + LATENCY_TTFT_MODEL_PATH: "/models/ttft.joblib" + LATENCY_TPOT_MODEL_PATH: "/models/tpot.joblib" + LATENCY_TTFT_SCALER_PATH: 
"/models/ttft_scaler.joblib" + LATENCY_TPOT_SCALER_PATH: "/models/tpot_scaler.joblib" + LATENCY_MODEL_TYPE: "xgboost" + LATENCY_MAX_TRAINING_DATA_SIZE_PER_BUCKET: "5000" + LATENCY_QUANTILE_ALPHA: "0.9" + + # Prediction Server Configuration + predictionServers: + count: 10 + startPort: 8001 + image: + hub: path/to/your/docker/repo # NOTE: Update with your Docker repository path for sidecars + name: latencypredictor-prediction-server + tag: latest + pullPolicy: Always + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "1000m" + memory: "2Gi" + livenessProbe: + httpGet: + path: /healthz + initialDelaySeconds: 15 + periodSeconds: 15 + readinessProbe: + httpGet: + path: /readyz + initialDelaySeconds: 10 + periodSeconds: 5 + failureThreshold: 10 + volumeSize: "10Gi" + config: + LATENCY_MODEL_TYPE: "xgboost" + PREDICT_HOST: "0.0.0.0" + LOCAL_TTFT_MODEL_PATH: "/server_models/ttft.joblib" + LOCAL_TPOT_MODEL_PATH: "/server_models/tpot.joblib" + LOCAL_TTFT_SCALER_PATH: "/server_models/ttft_scaler.joblib" + LOCAL_TPOT_SCALER_PATH: "/server_models/tpot_scaler.joblib" + + # EPP Environment Variables for Latency Predictor + eppEnv: + LATENCY_MAX_SAMPLE_SIZE: "10000" diff --git a/config/charts/inferencepool/Chart.yaml b/config/charts/inferencepool/Chart.yaml index f6eadc3e21..815ba0f2df 100644 --- a/config/charts/inferencepool/Chart.yaml +++ b/config/charts/inferencepool/Chart.yaml @@ -11,3 +11,4 @@ dependencies: - name: inference-extension version: 0.0.0 repository: "file://../inference-extension" + alias: inferenceExtension diff --git a/config/charts/inferencepool/values.yaml b/config/charts/inferencepool/values.yaml index 1803e2cbaa..da0488e6cf 100644 --- a/config/charts/inferencepool/values.yaml +++ b/config/charts/inferencepool/values.yaml @@ -69,86 +69,7 @@ inferenceExtension: # Latency Predictor Configuration latencyPredictor: - enabled: false - - # Training Server Configuration - trainingServer: - image: - hub: path/to/your/docker/repo # NOTE: Update 
with your Docker repository path for sidecars - name: latencypredictor-training-server - tag: latest - pullPolicy: Always - port: 8000 - resources: - requests: - cpu: "2000m" - memory: "4Gi" - limits: - cpu: "4000m" - memory: "8Gi" - livenessProbe: - httpGet: - path: /healthz - port: 8000 - initialDelaySeconds: 30 - periodSeconds: 20 - readinessProbe: - httpGet: - path: /readyz - port: 8000 - initialDelaySeconds: 45 - periodSeconds: 10 - volumeSize: "20Gi" - config: - LATENCY_RETRAINING_INTERVAL_SEC: "1" - LATENCY_MIN_SAMPLES_FOR_RETRAIN: "100" - LATENCY_TTFT_MODEL_PATH: "/models/ttft.joblib" - LATENCY_TPOT_MODEL_PATH: "/models/tpot.joblib" - LATENCY_TTFT_SCALER_PATH: "/models/ttft_scaler.joblib" - LATENCY_TPOT_SCALER_PATH: "/models/tpot_scaler.joblib" - LATENCY_MODEL_TYPE: "xgboost" - LATENCY_MAX_TRAINING_DATA_SIZE_PER_BUCKET: "5000" - LATENCY_QUANTILE_ALPHA: "0.9" - - # Prediction Server Configuration - predictionServers: - count: 10 - startPort: 8001 - image: - hub: path/to/your/docker/repo # NOTE: Update with your Docker repository path for sidecars - name: latencypredictor-prediction-server - tag: latest - pullPolicy: Always - resources: - requests: - cpu: "500m" - memory: "1Gi" - limits: - cpu: "1000m" - memory: "2Gi" - livenessProbe: - httpGet: - path: /healthz - initialDelaySeconds: 15 - periodSeconds: 15 - readinessProbe: - httpGet: - path: /readyz - initialDelaySeconds: 10 - periodSeconds: 5 - failureThreshold: 10 - volumeSize: "10Gi" - config: - LATENCY_MODEL_TYPE: "xgboost" - PREDICT_HOST: "0.0.0.0" - LOCAL_TTFT_MODEL_PATH: "/server_models/ttft.joblib" - LOCAL_TPOT_MODEL_PATH: "/server_models/tpot.joblib" - LOCAL_TTFT_SCALER_PATH: "/server_models/ttft_scaler.joblib" - LOCAL_TPOT_SCALER_PATH: "/server_models/tpot_scaler.joblib" - - # EPP Environment Variables for Latency Predictor - eppEnv: - LATENCY_MAX_SAMPLE_SIZE: "10000" + enabled: true inferencePool: targetPorts: From b2ba12adfcfffc2e7dafce719c0302d07fc3d692 Mon Sep 17 00:00:00 2001 From: Xiyue Yu 
Date: Wed, 21 Jan 2026 10:28:23 -0800 Subject: [PATCH 7/9] added comments --- config/charts/epp-standalone/values.yaml | 1 + config/charts/inferencepool/Chart.yaml | 1 + config/charts/inferencepool/values.yaml | 3 ++- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/config/charts/epp-standalone/values.yaml b/config/charts/epp-standalone/values.yaml index 7b3af858a2..9ab4b831bd 100644 --- a/config/charts/epp-standalone/values.yaml +++ b/config/charts/epp-standalone/values.yaml @@ -295,6 +295,7 @@ inferenceExtension: enabled: false latencyPredictor: + # common latencyPredictor setting exists in config/charts/inference-extension/values.yaml enabled: false # Options: ["gke"] diff --git a/config/charts/inferencepool/Chart.yaml b/config/charts/inferencepool/Chart.yaml index 815ba0f2df..55eb48c047 100644 --- a/config/charts/inferencepool/Chart.yaml +++ b/config/charts/inferencepool/Chart.yaml @@ -11,4 +11,5 @@ dependencies: - name: inference-extension version: 0.0.0 repository: "file://../inference-extension" + # This is needed to make use of the common values.yaml in ./config/charts/inference-extension/values.yaml alias: inferenceExtension diff --git a/config/charts/inferencepool/values.yaml b/config/charts/inferencepool/values.yaml index da0488e6cf..92d95ca29a 100644 --- a/config/charts/inferencepool/values.yaml +++ b/config/charts/inferencepool/values.yaml @@ -69,7 +69,8 @@ inferenceExtension: # Latency Predictor Configuration latencyPredictor: - enabled: true + # common latencyPredictor setting exists in config/charts/inference-extension/values.yaml + enabled: false inferencePool: targetPorts: From 974aa3ba4f48342828b29fe357493623e87aa831 Mon Sep 17 00:00:00 2001 From: Xiyue Yu Date: Wed, 21 Jan 2026 10:29:24 -0800 Subject: [PATCH 8/9] added alias in epp-standalone --- config/charts/epp-standalone/Chart.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/config/charts/epp-standalone/Chart.yaml b/config/charts/epp-standalone/Chart.yaml index 
07f84dc1f5..37ccc341e8 100644 --- a/config/charts/epp-standalone/Chart.yaml +++ b/config/charts/epp-standalone/Chart.yaml @@ -12,3 +12,4 @@ dependencies: - name: inference-extension version: 0.0.0 repository: "file://../inference-extension" + alias: inferenceExtension From b7c0e8a65cc919bcf969f7d36ee316bb78586fcf Mon Sep 17 00:00:00 2001 From: Xiyue Yu Date: Wed, 21 Jan 2026 10:30:59 -0800 Subject: [PATCH 9/9] added comment --- config/charts/epp-standalone/Chart.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/config/charts/epp-standalone/Chart.yaml b/config/charts/epp-standalone/Chart.yaml index 37ccc341e8..1d88d71c1c 100644 --- a/config/charts/epp-standalone/Chart.yaml +++ b/config/charts/epp-standalone/Chart.yaml @@ -12,4 +12,5 @@ dependencies: - name: inference-extension version: 0.0.0 repository: "file://../inference-extension" + # This is needed to make use of the common values.yaml in ./config/charts/inference-extension/values.yaml alias: inferenceExtension