diff --git a/config/charts/epp-standalone/Chart.yaml b/config/charts/epp-standalone/Chart.yaml index 07f84dc1f5..1d88d71c1c 100644 --- a/config/charts/epp-standalone/Chart.yaml +++ b/config/charts/epp-standalone/Chart.yaml @@ -12,3 +12,5 @@ dependencies: - name: inference-extension version: 0.0.0 repository: "file://../inference-extension" + # The alias exposes the subchart under the `inferenceExtension` key so that the shared defaults in config/charts/inference-extension/values.yaml are merged there. + alias: inferenceExtension diff --git a/config/charts/epp-standalone/values.yaml b/config/charts/epp-standalone/values.yaml index 3cb10dd5ec..9ab4b831bd 100644 --- a/config/charts/epp-standalone/values.yaml +++ b/config/charts/epp-standalone/values.yaml @@ -295,4 +295,16 @@ inferenceExtension: enabled: false latencyPredictor: + # Shared latencyPredictor defaults are defined in config/charts/inference-extension/values.yaml. enabled: false + +# Options: ["none", "gke"] +provider: + name: none + + # GKE-specific configuration. + # This block is only used if name is "gke". + gke: + # Set to true if the cluster is an Autopilot cluster. 
+ autopilot: false + diff --git a/config/charts/inference-extension/templates/_gke.yaml b/config/charts/inference-extension/templates/_gke.yaml index 9a19e7597a..506f66e35a 100644 --- a/config/charts/inference-extension/templates/_gke.yaml +++ b/config/charts/inference-extension/templates/_gke.yaml @@ -1,5 +1,5 @@ {{- define "inference-extension.gke" -}} -{{- if eq (lower .Values.provider.name) "gke" }} +{{- if and .Values.provider (eq (lower .Values.provider.name) "gke") }} {{- if and .Values.inferenceExtension.monitoring.prometheus.enabled .Values.inferenceExtension.monitoring.prometheus.auth.enabled }} {{- $metricsReadSA := printf "%s-metrics-reader-sa" .Release.Name -}} {{- $metricsReadSecretName := printf "%s-metrics-reader-secret" .Release.Name -}} diff --git a/config/charts/inference-extension/values.yaml b/config/charts/inference-extension/values.yaml new file mode 100644 index 0000000000..1803829ce7 --- /dev/null +++ b/config/charts/inference-extension/values.yaml @@ -0,0 +1,80 @@ +latencyPredictor: + enabled: false + # Training Server Configuration + trainingServer: + image: + hub: path/to/your/docker/repo # NOTE: Update with your Docker repository path for sidecars + name: latencypredictor-training-server + tag: latest + pullPolicy: Always + port: 8000 + resources: + requests: + cpu: "2000m" + memory: "4Gi" + limits: + cpu: "4000m" + memory: "8Gi" + livenessProbe: + httpGet: + path: /healthz + port: 8000 + initialDelaySeconds: 30 + periodSeconds: 20 + readinessProbe: + httpGet: + path: /readyz + port: 8000 + initialDelaySeconds: 45 + periodSeconds: 10 + volumeSize: "20Gi" + config: + LATENCY_RETRAINING_INTERVAL_SEC: "1" + LATENCY_MIN_SAMPLES_FOR_RETRAIN: "100" + LATENCY_TTFT_MODEL_PATH: "/models/ttft.joblib" + LATENCY_TPOT_MODEL_PATH: "/models/tpot.joblib" + LATENCY_TTFT_SCALER_PATH: "/models/ttft_scaler.joblib" + LATENCY_TPOT_SCALER_PATH: "/models/tpot_scaler.joblib" + LATENCY_MODEL_TYPE: "xgboost" + LATENCY_MAX_TRAINING_DATA_SIZE_PER_BUCKET: "5000" + 
+ LATENCY_QUANTILE_ALPHA: "0.9" + + # Prediction Server Configuration + predictionServers: + count: 10 + startPort: 8001 + image: + hub: path/to/your/docker/repo # NOTE: Update with your Docker repository path for sidecars + name: latencypredictor-prediction-server + tag: latest + pullPolicy: Always + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "1000m" + memory: "2Gi" + livenessProbe: + httpGet: + path: /healthz + initialDelaySeconds: 15 + periodSeconds: 15 + readinessProbe: + httpGet: + path: /readyz + initialDelaySeconds: 10 + periodSeconds: 5 + failureThreshold: 10 + volumeSize: "10Gi" + config: + LATENCY_MODEL_TYPE: "xgboost" + PREDICT_HOST: "0.0.0.0" + LOCAL_TTFT_MODEL_PATH: "/server_models/ttft.joblib" + LOCAL_TPOT_MODEL_PATH: "/server_models/tpot.joblib" + LOCAL_TTFT_SCALER_PATH: "/server_models/ttft_scaler.joblib" + LOCAL_TPOT_SCALER_PATH: "/server_models/tpot_scaler.joblib" + + # EPP Environment Variables for Latency Predictor + eppEnv: + LATENCY_MAX_SAMPLE_SIZE: "10000" diff --git a/config/charts/inferencepool/Chart.yaml b/config/charts/inferencepool/Chart.yaml index f6eadc3e21..55eb48c047 100644 --- a/config/charts/inferencepool/Chart.yaml +++ b/config/charts/inferencepool/Chart.yaml @@ -11,3 +11,5 @@ dependencies: - name: inference-extension version: 0.0.0 repository: "file://../inference-extension" + # The alias exposes the subchart under the `inferenceExtension` key so that the shared defaults in config/charts/inference-extension/values.yaml are merged there. + alias: inferenceExtension diff --git a/config/charts/inferencepool/values.yaml b/config/charts/inferencepool/values.yaml index 1803e2cbaa..92d95ca29a 100644 --- a/config/charts/inferencepool/values.yaml +++ b/config/charts/inferencepool/values.yaml @@ -69,87 +69,9 @@ inferenceExtension: # Latency Predictor Configuration latencyPredictor: + # Shared latencyPredictor defaults are defined in config/charts/inference-extension/values.yaml. enabled: false - # Training Server Configuration - trainingServer: - image: - hub: 
path/to/your/docker/repo # NOTE: Update with your Docker repository path for sidecars - name: latencypredictor-training-server - tag: latest - pullPolicy: Always - port: 8000 - resources: - requests: - cpu: "2000m" - memory: "4Gi" - limits: - cpu: "4000m" - memory: "8Gi" - livenessProbe: - httpGet: - path: /healthz - port: 8000 - initialDelaySeconds: 30 - periodSeconds: 20 - readinessProbe: - httpGet: - path: /readyz - port: 8000 - initialDelaySeconds: 45 - periodSeconds: 10 - volumeSize: "20Gi" - config: - LATENCY_RETRAINING_INTERVAL_SEC: "1" - LATENCY_MIN_SAMPLES_FOR_RETRAIN: "100" - LATENCY_TTFT_MODEL_PATH: "/models/ttft.joblib" - LATENCY_TPOT_MODEL_PATH: "/models/tpot.joblib" - LATENCY_TTFT_SCALER_PATH: "/models/ttft_scaler.joblib" - LATENCY_TPOT_SCALER_PATH: "/models/tpot_scaler.joblib" - LATENCY_MODEL_TYPE: "xgboost" - LATENCY_MAX_TRAINING_DATA_SIZE_PER_BUCKET: "5000" - LATENCY_QUANTILE_ALPHA: "0.9" - - # Prediction Server Configuration - predictionServers: - count: 10 - startPort: 8001 - image: - hub: path/to/your/docker/repo # NOTE: Update with your Docker repository path for sidecars - name: latencypredictor-prediction-server - tag: latest - pullPolicy: Always - resources: - requests: - cpu: "500m" - memory: "1Gi" - limits: - cpu: "1000m" - memory: "2Gi" - livenessProbe: - httpGet: - path: /healthz - initialDelaySeconds: 15 - periodSeconds: 15 - readinessProbe: - httpGet: - path: /readyz - initialDelaySeconds: 10 - periodSeconds: 5 - failureThreshold: 10 - volumeSize: "10Gi" - config: - LATENCY_MODEL_TYPE: "xgboost" - PREDICT_HOST: "0.0.0.0" - LOCAL_TTFT_MODEL_PATH: "/server_models/ttft.joblib" - LOCAL_TPOT_MODEL_PATH: "/server_models/tpot.joblib" - LOCAL_TTFT_SCALER_PATH: "/server_models/ttft_scaler.joblib" - LOCAL_TPOT_SCALER_PATH: "/server_models/tpot_scaler.joblib" - - # EPP Environment Variables for Latency Predictor - eppEnv: - LATENCY_MAX_SAMPLE_SIZE: "10000" - inferencePool: targetPorts: - number: 8000 diff --git a/hack/verify-helm.sh 
b/hack/verify-helm.sh index e9bb8dd09c..ca2f1ad147 100755 --- a/hack/verify-helm.sh +++ b/hack/verify-helm.sh @@ -22,6 +22,7 @@ declare -A test_cases_inference_pool test_cases_inference_pool["basic"]="--set inferencePool.modelServers.matchLabels.app=llm-instance-gateway" test_cases_inference_pool["gke-provider"]="--set provider.name=gke --set inferencePool.modelServers.matchLabels.app=llm-instance-gateway" test_cases_inference_pool["multiple-replicas"]="--set inferencePool.replicas=3 --set inferencePool.modelServers.matchLabels.app=llm-instance-gateway" +test_cases_inference_pool["latency-predictor"]="--set inferenceExtension.latencyPredictor.enabled=true --set inferencePool.modelServers.matchLabels.app=llm-instance-gateway" # Run the install command in case this script runs from a different bash # source (such as in the verify-all script) @@ -46,5 +47,30 @@ for key in "${!test_cases_inference_pool[@]}"; do fi done +declare -A test_cases_epp_standalone +# epp-standalone Helm chart test cases. Values must not contain shell quotes: they are expanded unquoted (word-split only, no quote removal) when passed to helm below. +test_cases_epp_standalone["basic"]="--set inferenceExtension.endpointsServer.endpointSelector=app=llm-instance-gateway" +test_cases_epp_standalone["gke-provider"]="--set provider.name=gke --set inferenceExtension.endpointsServer.endpointSelector=app=llm-instance-gateway" +test_cases_epp_standalone["latency-predictor"]="--set inferenceExtension.latencyPredictor.enabled=true --set inferenceExtension.endpointsServer.endpointSelector=app=llm-instance-gateway" + + +echo "Building dependencies for epp-standalone chart..." +${SCRIPT_ROOT}/bin/helm dependency build ${SCRIPT_ROOT}/config/charts/epp-standalone +if [ $? -ne 0 ]; then + echo "Helm dependency build failed." + exit 1 +fi + +# Run the test cases +echo "Running helm template command for epp-standalone chart..." 
+# Loop through the keys of the associative array +for key in "${!test_cases_epp_standalone[@]}"; do + echo "Running test: $key" + ${SCRIPT_ROOT}/bin/helm template ${SCRIPT_ROOT}/config/charts/epp-standalone ${test_cases_epp_standalone[$key]} --output-dir="${SCRIPT_ROOT}/bin" + if [ $? -ne 0 ]; then + echo "Helm template command failed for test: $key" + exit 1 + fi +done