diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/Chart.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/Chart.yaml index 3717fd6c..7b4671c2 100644 --- a/helm_chart/HyperPodHelmChart/charts/inference-operator/Chart.yaml +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/Chart.yaml @@ -15,13 +15,11 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 1.0.0 +version: 1.1.0 -# This is the version number of the application being deployed. This version number should be -# incremented each time you make changes to the application. Versions are not expected to -# follow Semantic Versioning. They should reflect the version the application is using. -# It is recommended to use it with quotes. -appVersion: "2.0" +# This is the version number of the application being deployed. Keep this aligned +# with operator image MAJOR.MINOR version. +appVersion: "2.1" dependencies: - name: aws-mountpoint-s3-csi-driver diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/crd/inference.sagemaker.aws.amazon.com_inferenceendpointconfigs.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/crd/inference.sagemaker.aws.amazon.com_inferenceendpointconfigs.yaml index 7f43c89a..7616f134 100644 --- a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/crd/inference.sagemaker.aws.amazon.com_inferenceendpointconfigs.yaml +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/crd/inference.sagemaker.aws.amazon.com_inferenceendpointconfigs.yaml @@ -696,7 +696,7 @@ spec: l2CacheBackend: description: L2 cache backend type. Required when L2CacheSpec is provided. 
- pattern: (?i)redis + pattern: (?i)redis|tieredstorage type: string l2CacheLocalUrl: description: Provide the L2 cache URL to local storage @@ -721,6 +721,12 @@ spec: - round_robin type: string type: object + maxDeployTimeInSeconds: + default: 3600 + description: Maximum allowed time in seconds for the deployment to + complete before timing out. Defaults to 1 hour (3600 seconds) + format: int32 + type: integer metrics: description: Configuration for metrics collection and exposure properties: @@ -1617,12 +1623,6 @@ spec: - round_robin type: string type: object - maxDeployTimeInSeconds: - default: 3600 - description: Maximum allowed time in seconds for the deployment to - complete before timing out. Defaults to 1 hour (3600 seconds) - format: int32 - type: integer metrics: description: Configuration for metrics collection and exposure properties: diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/crd/inference.sagemaker.aws.amazon.com_jumpstartmodels.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/crd/inference.sagemaker.aws.amazon.com_jumpstartmodels.yaml index 68ea257e..4e1b5443 100644 --- a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/crd/inference.sagemaker.aws.amazon.com_jumpstartmodels.yaml +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/crd/inference.sagemaker.aws.amazon.com_jumpstartmodels.yaml @@ -350,6 +350,349 @@ spec: type: object maxItems: 100 type: array + intelligentRoutingSpec: + description: |- + Configuration for intelligent routing + This feature is currently not supported for existing deployments. + Adding this configuration to an existing deployment will be rejected. + properties: + autoScalingSpec: + properties: + cloudWatchTrigger: + description: CloudWatch metric trigger to use for autoscaling + properties: + activationTargetValue: + default: 0 + description: Activation Value for CloudWatch metric to + scale from 0 to 1. 
Only applicable if minReplicaCount + = 0 + type: number + dimensions: + description: Dimensions for Cloudwatch metrics + items: + properties: + name: + description: CloudWatch Metric dimension name + type: string + value: + description: CloudWatch Metric dimension value + type: string + required: + - name + - value + type: object + type: array + metricCollectionPeriod: + default: 300 + description: Defines the Period for CloudWatch query + format: int32 + type: integer + metricCollectionStartTime: + default: 300 + description: Defines the StartTime for CloudWatch query + format: int32 + type: integer + metricName: + description: Metric name to query for Cloudwatch trigger + type: string + metricStat: + default: Average + description: Statistics metric to be used by Trigger. + Used to define Stat for CloudWatch query. Default is + Average. + type: string + metricType: + default: Average + description: 'The type of metric to be used by HPA. Enum: + AverageValue - Uses average value of metric per pod, + Value - Uses absolute metric value' + enum: + - Value + - Average + type: string + minValue: + default: 0 + description: Minimum metric value used in case of empty + response from CloudWatch. Default is 0. + type: number + name: + description: Name for the CloudWatch trigger + type: string + namespace: + description: AWS CloudWatch namespace for metric + type: string + targetValue: + description: TargetValue for CloudWatch metric + type: number + useCachedMetrics: + default: true + description: Enable caching of metric values during polling + interval. Default is true + type: boolean + type: object + cloudWatchTriggerList: + description: Multiple CloudWatch metric triggers to use for + autoscaling. Takes priority over CloudWatchTrigger if both + are provided. + items: + properties: + activationTargetValue: + default: 0 + description: Activation Value for CloudWatch metric + to scale from 0 to 1. 
Only applicable if minReplicaCount + = 0 + type: number + dimensions: + description: Dimensions for Cloudwatch metrics + items: + properties: + name: + description: CloudWatch Metric dimension name + type: string + value: + description: CloudWatch Metric dimension value + type: string + required: + - name + - value + type: object + type: array + metricCollectionPeriod: + default: 300 + description: Defines the Period for CloudWatch query + format: int32 + type: integer + metricCollectionStartTime: + default: 300 + description: Defines the StartTime for CloudWatch query + format: int32 + type: integer + metricName: + description: Metric name to query for Cloudwatch trigger + type: string + metricStat: + default: Average + description: Statistics metric to be used by Trigger. + Used to define Stat for CloudWatch query. Default + is Average. + type: string + metricType: + default: Average + description: 'The type of metric to be used by HPA. + Enum: AverageValue - Uses average value of metric + per pod, Value - Uses absolute metric value' + enum: + - Value + - Average + type: string + minValue: + default: 0 + description: Minimum metric value used in case of empty + response from CloudWatch. Default is 0. + type: number + name: + description: Name for the CloudWatch trigger + type: string + namespace: + description: AWS CloudWatch namespace for metric + type: string + targetValue: + description: TargetValue for CloudWatch metric + type: number + useCachedMetrics: + default: true + description: Enable caching of metric values during + polling interval. Default is true + type: boolean + type: object + maxItems: 100 + type: array + cooldownPeriod: + default: 300 + description: The period to wait after the last trigger reported + active before scaling the resource back to 0. Default 300 + seconds. 
+ format: int32 + minimum: 0 + type: integer + initialCooldownPeriod: + default: 300 + description: The delay before the cooldownPeriod starts after + the initial creation of the ScaledObject. Default 300 seconds. + format: int32 + minimum: 0 + type: integer + maxReplicaCount: + default: 5 + description: The maximum number of model pods to scale to. + Default 5. + format: int32 + minimum: 0 + type: integer + minReplicaCount: + default: 1 + description: The minimum number of model pods to scale down + to. Default 1. + format: int32 + minimum: 0 + type: integer + pollingInterval: + default: 30 + description: This is the interval to check each trigger on. + Default 30 seconds. + format: int32 + minimum: 0 + type: integer + prometheusTrigger: + description: Prometheus metric trigger to use for autoscaling + properties: + activationTargetValue: + default: 0 + description: Activation Value for Prometheus metric to + scale from 0 to 1. Only applicable if minReplicaCount + = 0 + type: number + customHeaders: + description: Custom headers to include while querying + the prometheus endpoint. + type: string + metricType: + default: Average + description: 'The type of metric to be used by HPA. Enum: + AverageValue - Uses average value of metric per pod, + Value - Uses absolute metric value' + enum: + - Value + - Average + type: string + name: + description: Name for the Prometheus trigger + type: string + namespace: + description: Namespace for namespaced queries + type: string + query: + description: PromQLQuery for the metric. + type: string + serverAddress: + description: Server address for AMP workspace + pattern: ^https:\/\/aps-workspaces\.[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*\.amazonaws\.com\/workspaces\/ws-[a-zA-Z0-9-]+\/[a-zA-Z0-9-]+$|^$ + type: string + targetValue: + description: Target metric value for scaling + type: number + useCachedMetrics: + default: true + description: Enable caching of metric values during polling + interval. 
Default is true + type: boolean + type: object + prometheusTriggerList: + description: Multiple Prometheus metric triggers to use for + autoscaling. Takes priority over PrometheusTrigger if both + are provided. + items: + properties: + activationTargetValue: + default: 0 + description: Activation Value for Prometheus metric + to scale from 0 to 1. Only applicable if minReplicaCount + = 0 + type: number + customHeaders: + description: Custom headers to include while querying + the prometheus endpoint. + type: string + metricType: + default: Average + description: 'The type of metric to be used by HPA. + Enum: AverageValue - Uses average value of metric + per pod, Value - Uses absolute metric value' + enum: + - Value + - Average + type: string + name: + description: Name for the Prometheus trigger + type: string + namespace: + description: Namespace for namespaced queries + type: string + query: + description: PromQLQuery for the metric. + type: string + serverAddress: + description: Server address for AMP workspace + pattern: ^https:\/\/aps-workspaces\.[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*\.amazonaws\.com\/workspaces\/ws-[a-zA-Z0-9-]+\/[a-zA-Z0-9-]+$|^$ + type: string + targetValue: + description: Target metric value for scaling + type: number + useCachedMetrics: + default: true + description: Enable caching of metric values during + polling interval. Default is true + type: boolean + type: object + maxItems: 100 + type: array + scaleDownStabilizationTime: + default: 300 + description: The time window to stabilize for HPA before scaling + down. Default 300 seconds. + format: int32 + minimum: 0 + type: integer + scaleUpStabilizationTime: + default: 0 + description: The time window to stabilize for HPA before scaling + up. Default 0 seconds. 
+ format: int32 + minimum: 0 + type: integer + type: object + enabled: + default: false + description: Once set, the enabled field cannot be modified + type: boolean + routingStrategy: + default: prefixaware + enum: + - prefixaware + - kvaware + - session + - roundrobin + type: string + type: object + kvCacheSpec: + description: |- + Configuration for KV Cache specification + By default L1CacheOffloading will be enabled + properties: + cacheConfigFile: + description: KVCache configuration file path. If specified, override + other configurations provided via spec + type: string + enableL1Cache: + default: true + description: Enable CPU offloading + type: boolean + enableL2Cache: + default: false + type: boolean + l2CacheSpec: + description: Configuration for providing L2 Cache offloading + properties: + l2CacheBackend: + description: L2 cache backend type. Required when L2CacheSpec + is provided. + pattern: (?i)redis|tieredstorage + type: string + l2CacheLocalUrl: + description: Provide the L2 cache URL to local storage + type: string + type: object + type: object loadBalancer: description: Configuration for Application Load Balancer properties: @@ -477,6 +820,10 @@ spec: type: object server: properties: + acceleratorPartitionType: + description: MIG profile to use for GPU partitioning + pattern: ^mig-.*$ + type: string executionRole: description: The Amazon Resource Name (ARN) of an IAM role that will be used to deploy and manage the inference server @@ -489,6 +836,15 @@ spec: Must be one of the supported types. pattern: ^ml\..* type: string + validations: + description: Validations configuration for the server + properties: + acceleratorPartitionValidation: + default: true + description: Enable MIG validation for GPU partitioning. Default + is true. 
+ type: boolean + type: object required: - instanceType type: object diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/manager/manager.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/manager/manager.yaml index 9fe34cdb..24075cef 100644 --- a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/manager/manager.yaml +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/manager/manager.yaml @@ -48,6 +48,94 @@ spec: # versions < 1.19 or on vendors versions which do NOT support this field by default (i.e. Openshift < 4.11 ). # seccompProfile: # type: RuntimeDefault + initContainers: + - command: + - bash + - -lc + - | + set -euo pipefail + KUBECTL="$(command -v kubectl || true)" + if [ -z "${KUBECTL}" ]; then + for p in /opt/bitnami/kubectl/bin/kubectl /usr/local/bin/kubectl /usr/bin/kubectl /bin/kubectl; do + if [ -x "$p" ]; then KUBECTL="$p"; break; fi + done + fi + if [ -z "${KUBECTL}" ]; then + echo "kubectl not found in PATH or common locations" > /dev/termination-log + exit 2 + fi + + CHECKS="${CHECKS:-drivers crds}" + + log() { echo "$1" > /dev/termination-log; } + + require_csidriver() { + local provisioner="$1" + local friendly="$2" + + # Fast path: output is discarded here; succeed if either resource name can read the CSIDriver + if "${KUBECTL}" get csidriver "$provisioner" >/dev/null 2>&1 || \ + "${KUBECTL}" get csidrivers.storage.k8s.io "$provisioner" >/dev/null 2>&1; then + return 0 + fi + + + # Final attempt to capture the real error + err_msg="$("${KUBECTL}" get csidriver "$provisioner" 2>&1 || true)" + [ -z "$err_msg" ] && err_msg="$("${KUBECTL}" get csidrivers.storage.k8s.io "$provisioner" 2>&1 || true)" + + if echo "$err_msg" | grep -qiE 'forbidden|permission|unauthorized|cannot.*get'; then + log "$friendly check failed: RBAC insufficient to read CSIDriver $provisioner. kubectl said: ${err_msg}" + exit 2 + fi + + log "$friendly not installed (missing CSIDriver $provisioner). 
kubectl said: ${err_msg}" + exit 1 + } + + require_crd() { + local crd="$1" + # Same idea: attempt and parse error text + if "${KUBECTL}" get crd "$crd" >/dev/null 2>&1; then + return 0 + fi + err="$("${KUBECTL}" get crd "$crd" 2>&1 || true)" + if echo "$err" | grep -qiE 'forbidden|permission|unauthorized|cannot.*get'; then + log "CRD check failed: RBAC insufficient to read $crd. kubectl said: ${err}" + exit 2 + fi + log "Missing required CRD: $crd. kubectl said: ${err}" + exit 1 + } + + # Dispatch selected checks + for c in $CHECKS; do + case "$c" in + drivers) + require_csidriver "s3.csi.aws.com" "S3 CSI driver" + require_csidriver "fsx.csi.aws.com" "FSx CSI driver" + ;; + crds) + require_crd "certificaterequests.cert-manager.io" "cert-manager CRD" + require_crd "certificates.cert-manager.io" "cert-manager CRD" + ;; + *) + log "Unknown check: $c" + exit 1 + ;; + esac + done + + log "Checks passed: $CHECKS" + exit 0 + env: + - name: CHECKS + value: "drivers crds" + image: "public.ecr.aws/bitnami/kubectl:1.30" + imagePullPolicy: Always + name: check-csi-drivers + resources: { } + terminationMessagePath: /dev/termination-log containers: - command: - /hyperpod-inference-manager @@ -93,7 +181,7 @@ spec: resources: limits: cpu: 500m - memory: 128Mi + memory: 256Mi requests: cpu: 10m memory: 64Mi @@ -125,4 +213,4 @@ spec: volumes: - name: webhook-certs secret: - secretName: webhook-server-cert \ No newline at end of file + secretName: webhook-server-cert diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/values.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/values.yaml index 868b7765..878fb183 100644 --- a/helm_chart/HyperPodHelmChart/charts/inference-operator/values.yaml +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/values.yaml @@ -21,7 +21,7 @@ image: ap-southeast-4: 311141544681.dkr.ecr.ap-southeast-4.amazonaws.com ap-southeast-3: 158128612970.dkr.ecr.ap-southeast-3.amazonaws.com eu-south-2: 
025050981094.dkr.ecr.eu-south-2.amazonaws.com - tag: v2.0 + tag: v2.1 pullPolicy: Always repository: hyperpodClusterArn: