diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/Chart.lock b/helm_chart/HyperPodHelmChart/charts/inference-operator/Chart.lock index b943caab..8c71acfd 100644 --- a/helm_chart/HyperPodHelmChart/charts/inference-operator/Chart.lock +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/Chart.lock @@ -13,9 +13,9 @@ dependencies: version: 1.13.2 - name: cert-manager repository: https://charts.jetstack.io - version: v1.18.0 + version: v1.18.2 - name: keda repository: https://kedacore.github.io/charts version: 2.17.1 -digest: sha256:5f877809dfd7c4d13b13f3de92e0824c28f80ed3abcf7c54f11764d9aeabbeba -generated: "2025-06-19T22:21:36.075156362Z" +digest: sha256:f54ece80a00cb4da98440551765d9c660a0704d6b59f4f9030a5a9e86eab4eea +generated: "2025-10-27T17:20:29.746399171Z" diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/Chart.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/Chart.yaml index 48c2b979..3717fd6c 100644 --- a/helm_chart/HyperPodHelmChart/charts/inference-operator/Chart.yaml +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/Chart.yaml @@ -15,13 +15,13 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 0.1.0 +version: 1.0.0 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to # follow Semantic Versioning. They should reflect the version the application is using. # It is recommended to use it with quotes. -appVersion: "1.16.0" +appVersion: "2.0" dependencies: - name: aws-mountpoint-s3-csi-driver @@ -45,7 +45,7 @@ dependencies: condition: alb.enabled - name: cert-manager alias: cert-manager - version: v1.18.0 + version: v1.18.2 repository: "https://charts.jetstack.io" condition: cert-manager.enabled - name: keda diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/crd/inference.sagemaker.aws.amazon.com_inferenceendpointconfigs.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/crd/inference.sagemaker.aws.amazon.com_inferenceendpointconfigs.yaml index de3f762e..7f43c89a 100644 --- a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/crd/inference.sagemaker.aws.amazon.com_inferenceendpointconfigs.yaml +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/crd/inference.sagemaker.aws.amazon.com_inferenceendpointconfigs.yaml @@ -1,11 +1,21 @@ ---- apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: + cert-manager.io/inject-ca-from: '{{ .Values.shortPrefix }}-system/serving-cert' controller-gen.kubebuilder.io/version: v0.16.4 name: inferenceendpointconfigs.inference.sagemaker.aws.amazon.com spec: + conversion: + strategy: Webhook + webhook: + clientConfig: + service: + name: '{{ .Values.namePrefix }}-conversion-webhook' + namespace: '{{ .Values.shortPrefix }}-system' + path: /convert + conversionReviewVersions: + - v1 group: inference.sagemaker.aws.amazon.com names: kind: InferenceEndpointConfig @@ -14,6 +24,1365 @@ spec: singular: inferenceendpointconfig scope: Namespaced versions: + - name: v1 + schema: + openAPIV3Schema: + description: InferenceEndpointConfig is the Schema for the inferenceendpointconfigs + API. 
+ properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: InferenceEndpointConfigSpec defines the desired state of + InferenceEndpointConfig. + properties: + InitialReplicaCount: + description: |- + Number of desired pods. This is a pointer to distinguish between explicit + zero and not specified. Defaults to 1. + format: int32 + type: integer + autoScalingSpec: + properties: + cloudWatchTrigger: + description: CloudWatch metric trigger to use for autoscaling + properties: + activationTargetValue: + default: 0 + description: Activation Value for CloudWatch metric to scale + from 0 to 1. Only applicable if minReplicaCount = 0 + type: number + dimensions: + description: Dimensions for Cloudwatch metrics + items: + properties: + name: + description: CloudWatch Metric dimension name + type: string + value: + description: CloudWatch Metric dimension value + type: string + required: + - name + - value + type: object + type: array + metricCollectionPeriod: + default: 300 + description: Defines the Period for CloudWatch query + format: int32 + type: integer + metricCollectionStartTime: + default: 300 + description: Defines the StartTime for CloudWatch query + format: int32 + type: integer + metricName: + description: Metric name to query for Cloudwatch trigger + type: string + metricStat: + default: Average + description: Statistics metric to be used by Trigger. Used + to define Stat for CloudWatch query. Default is Average. + type: string + metricType: + default: Average + description: 'The type of metric to be used by HPA. Enum: + AverageValue - Uses average value of metric per pod, Value + - Uses absolute metric value' + enum: + - Value + - Average + type: string + minValue: + default: 0 + description: Minimum metric value used in case of empty response + from CloudWatch. Default is 0. + type: number + name: + description: Name for the CloudWatch trigger + type: string + namespace: + description: AWS CloudWatch namespace for metric + type: string + targetValue: + description: TargetValue for CloudWatch metric + type: number + useCachedMetrics: + default: true + description: Enable caching of metric values during polling + interval. Default is true + type: boolean + type: object + cloudWatchTriggerList: + description: Multiple CloudWatch metric triggers to use for autoscaling. + Takes priority over CloudWatchTrigger if both are provided. + items: + properties: + activationTargetValue: + default: 0 + description: Activation Value for CloudWatch metric to scale + from 0 to 1. 
Only applicable if minReplicaCount = 0 + type: number + dimensions: + description: Dimensions for Cloudwatch metrics + items: + properties: + name: + description: CloudWatch Metric dimension name + type: string + value: + description: CloudWatch Metric dimension value + type: string + required: + - name + - value + type: object + type: array + metricCollectionPeriod: + default: 300 + description: Defines the Period for CloudWatch query + format: int32 + type: integer + metricCollectionStartTime: + default: 300 + description: Defines the StartTime for CloudWatch query + format: int32 + type: integer + metricName: + description: Metric name to query for Cloudwatch trigger + type: string + metricStat: + default: Average + description: Statistics metric to be used by Trigger. Used + to define Stat for CloudWatch query. Default is Average. + type: string + metricType: + default: Average + description: 'The type of metric to be used by HPA. Enum: + AverageValue - Uses average value of metric per pod, Value + - Uses absolute metric value' + enum: + - Value + - Average + type: string + minValue: + default: 0 + description: Minimum metric value used in case of empty + response from CloudWatch. Default is 0. + type: number + name: + description: Name for the CloudWatch trigger + type: string + namespace: + description: AWS CloudWatch namespace for metric + type: string + targetValue: + description: TargetValue for CloudWatch metric + type: number + useCachedMetrics: + default: true + description: Enable caching of metric values during polling + interval. Default is true + type: boolean + type: object + maxItems: 100 + type: array + cooldownPeriod: + default: 300 + description: The period to wait after the last trigger reported + active before scaling the resource back to 0. Default 300 seconds. + format: int32 + minimum: 0 + type: integer + initialCooldownPeriod: + default: 300 + description: The delay before the cooldownPeriod starts after + the initial creation of the ScaledObject. Default 300 seconds. + format: int32 + minimum: 0 + type: integer + maxReplicaCount: + default: 5 + description: The maximum number of model pods to scale to. Default + 5. + format: int32 + minimum: 0 + type: integer + minReplicaCount: + default: 1 + description: The minimum number of model pods to scale down to. + Default 1. + format: int32 + minimum: 0 + type: integer + pollingInterval: + default: 30 + description: This is the interval to check each trigger on. Default + 30 seconds. + format: int32 + minimum: 0 + type: integer + prometheusTrigger: + description: Prometheus metric trigger to use for autoscaling + properties: + activationTargetValue: + default: 0 + description: Activation Value for Prometheus metric to scale + from 0 to 1. Only applicable if minReplicaCount = 0 + type: number + customHeaders: + description: Custom headers to include while querying the + prometheus endpoint. + type: string + metricType: + default: Average + description: 'The type of metric to be used by HPA. Enum: + AverageValue - Uses average value of metric per pod, Value + - Uses absolute metric value' + enum: + - Value + - Average + type: string + name: + description: Name for the Prometheus trigger + type: string + namespace: + description: Namespace for namespaced queries + type: string + query: + description: PromQLQuery for the metric. 
+ type: string + serverAddress: + description: Server address for AMP workspace + pattern: ^https:\/\/aps-workspaces\.[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*\.amazonaws\.com\/workspaces\/ws-[a-zA-Z0-9-]+\/[a-zA-Z0-9-]+$|^$ + type: string + targetValue: + description: Target metric value for scaling + type: number + useCachedMetrics: + default: true + description: Enable caching of metric values during polling + interval. Default is true + type: boolean + type: object + prometheusTriggerList: + description: Multiple Prometheus metric triggers to use for autoscaling. + Takes priority over PrometheusTrigger if both are provided. + items: + properties: + activationTargetValue: + default: 0 + description: Activation Value for Prometheus metric to scale + from 0 to 1. Only applicable if minReplicaCount = 0 + type: number + customHeaders: + description: Custom headers to include while querying the + prometheus endpoint. + type: string + metricType: + default: Average + description: 'The type of metric to be used by HPA. Enum: + AverageValue - Uses average value of metric per pod, Value + - Uses absolute metric value' + enum: + - Value + - Average + type: string + name: + description: Name for the Prometheus trigger + type: string + namespace: + description: Namespace for namespaced queries + type: string + query: + description: PromQLQuery for the metric. + type: string + serverAddress: + description: Server address for AMP workspace + pattern: ^https:\/\/aps-workspaces\.[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*\.amazonaws\.com\/workspaces\/ws-[a-zA-Z0-9-]+\/[a-zA-Z0-9-]+$|^$ + type: string + targetValue: + description: Target metric value for scaling + type: number + useCachedMetrics: + default: true + description: Enable caching of metric values during polling + interval. Default is true + type: boolean + type: object + maxItems: 100 + type: array + scaleDownStabilizationTime: + default: 300 + description: The time window to stabilize for HPA before scaling + down. Default 300 seconds. + format: int32 + minimum: 0 + type: integer + scaleUpStabilizationTime: + default: 0 + description: The time window to stabilize for HPA before scaling + up. Default 0 seconds. + format: int32 + minimum: 0 + type: integer + type: object + endpointName: + description: |- + Name used for Sagemaker Endpoint + Name of sagemaker endpoint. Defaults to empty string which represents that Sagemaker endpoint will not be created. + maxLength: 63 + pattern: ^$|^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}$ + type: string + instanceType: + description: Instance Type to deploy the model on + pattern: ^ml\..* + type: string + intelligentRoutingSpec: + description: |- + Configuration for intelligent routing + This feature is currently not supported for existing deployments. + Adding this configuration to an existing deployment will be rejected. + properties: + autoScalingSpec: + properties: + cloudWatchTrigger: + description: CloudWatch metric trigger to use for autoscaling + properties: + activationTargetValue: + default: 0 + description: Activation Value for CloudWatch metric to + scale from 0 to 1. 
Only applicable if minReplicaCount + = 0 + type: number + dimensions: + description: Dimensions for Cloudwatch metrics + items: + properties: + name: + description: CloudWatch Metric dimension name + type: string + value: + description: CloudWatch Metric dimension value + type: string + required: + - name + - value + type: object + type: array + metricCollectionPeriod: + default: 300 + description: Defines the Period for CloudWatch query + format: int32 + type: integer + metricCollectionStartTime: + default: 300 + description: Defines the StartTime for CloudWatch query + format: int32 + type: integer + metricName: + description: Metric name to query for Cloudwatch trigger + type: string + metricStat: + default: Average + description: Statistics metric to be used by Trigger. + Used to define Stat for CloudWatch query. Default is + Average. + type: string + metricType: + default: Average + description: 'The type of metric to be used by HPA. Enum: + AverageValue - Uses average value of metric per pod, + Value - Uses absolute metric value' + enum: + - Value + - Average + type: string + minValue: + default: 0 + description: Minimum metric value used in case of empty + response from CloudWatch. Default is 0. + type: number + name: + description: Name for the CloudWatch trigger + type: string + namespace: + description: AWS CloudWatch namespace for metric + type: string + targetValue: + description: TargetValue for CloudWatch metric + type: number + useCachedMetrics: + default: true + description: Enable caching of metric values during polling + interval. Default is true + type: boolean + type: object + cloudWatchTriggerList: + description: Multiple CloudWatch metric triggers to use for + autoscaling. Takes priority over CloudWatchTrigger if both + are provided. + items: + properties: + activationTargetValue: + default: 0 + description: Activation Value for CloudWatch metric + to scale from 0 to 1. Only applicable if minReplicaCount + = 0 + type: number + dimensions: + description: Dimensions for Cloudwatch metrics + items: + properties: + name: + description: CloudWatch Metric dimension name + type: string + value: + description: CloudWatch Metric dimension value + type: string + required: + - name + - value + type: object + type: array + metricCollectionPeriod: + default: 300 + description: Defines the Period for CloudWatch query + format: int32 + type: integer + metricCollectionStartTime: + default: 300 + description: Defines the StartTime for CloudWatch query + format: int32 + type: integer + metricName: + description: Metric name to query for Cloudwatch trigger + type: string + metricStat: + default: Average + description: Statistics metric to be used by Trigger. + Used to define Stat for CloudWatch query. Default + is Average. + type: string + metricType: + default: Average + description: 'The type of metric to be used by HPA. + Enum: AverageValue - Uses average value of metric + per pod, Value - Uses absolute metric value' + enum: + - Value + - Average + type: string + minValue: + default: 0 + description: Minimum metric value used in case of empty + response from CloudWatch. Default is 0. + type: number + name: + description: Name for the CloudWatch trigger + type: string + namespace: + description: AWS CloudWatch namespace for metric + type: string + targetValue: + description: TargetValue for CloudWatch metric + type: number + useCachedMetrics: + default: true + description: Enable caching of metric values during + polling interval. 
Default is true + type: boolean + type: object + maxItems: 100 + type: array + cooldownPeriod: + default: 300 + description: The period to wait after the last trigger reported + active before scaling the resource back to 0. Default 300 + seconds. + format: int32 + minimum: 0 + type: integer + initialCooldownPeriod: + default: 300 + description: The delay before the cooldownPeriod starts after + the initial creation of the ScaledObject. Default 300 seconds. + format: int32 + minimum: 0 + type: integer + maxReplicaCount: + default: 5 + description: The maximum number of model pods to scale to. + Default 5. + format: int32 + minimum: 0 + type: integer + minReplicaCount: + default: 1 + description: The minimum number of model pods to scale down + to. Default 1. + format: int32 + minimum: 0 + type: integer + pollingInterval: + default: 30 + description: This is the interval to check each trigger on. + Default 30 seconds. + format: int32 + minimum: 0 + type: integer + prometheusTrigger: + description: Prometheus metric trigger to use for autoscaling + properties: + activationTargetValue: + default: 0 + description: Activation Value for Prometheus metric to + scale from 0 to 1. Only applicable if minReplicaCount + = 0 + type: number + customHeaders: + description: Custom headers to include while querying + the prometheus endpoint. + type: string + metricType: + default: Average + description: 'The type of metric to be used by HPA. Enum: + AverageValue - Uses average value of metric per pod, + Value - Uses absolute metric value' + enum: + - Value + - Average + type: string + name: + description: Name for the Prometheus trigger + type: string + namespace: + description: Namespace for namespaced queries + type: string + query: + description: PromQLQuery for the metric. + type: string + serverAddress: + description: Server address for AMP workspace + pattern: ^https:\/\/aps-workspaces\.[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*\.amazonaws\.com\/workspaces\/ws-[a-zA-Z0-9-]+\/[a-zA-Z0-9-]+$|^$ + type: string + targetValue: + description: Target metric value for scaling + type: number + useCachedMetrics: + default: true + description: Enable caching of metric values during polling + interval. Default is true + type: boolean + type: object + prometheusTriggerList: + description: Multiple Prometheus metric triggers to use for + autoscaling. Takes priority over PrometheusTrigger if both + are provided. + items: + properties: + activationTargetValue: + default: 0 + description: Activation Value for Prometheus metric + to scale from 0 to 1. Only applicable if minReplicaCount + = 0 + type: number + customHeaders: + description: Custom headers to include while querying + the prometheus endpoint. + type: string + metricType: + default: Average + description: 'The type of metric to be used by HPA. + Enum: AverageValue - Uses average value of metric + per pod, Value - Uses absolute metric value' + enum: + - Value + - Average + type: string + name: + description: Name for the Prometheus trigger + type: string + namespace: + description: Namespace for namespaced queries + type: string + query: + description: PromQLQuery for the metric. 
+ type: string + serverAddress: + description: Server address for AMP workspace + pattern: ^https:\/\/aps-workspaces\.[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*\.amazonaws\.com\/workspaces\/ws-[a-zA-Z0-9-]+\/[a-zA-Z0-9-]+$|^$ + type: string + targetValue: + description: Target metric value for scaling + type: number + useCachedMetrics: + default: true + description: Enable caching of metric values during + polling interval. Default is true + type: boolean + type: object + maxItems: 100 + type: array + scaleDownStabilizationTime: + default: 300 + description: The time window to stabilize for HPA before scaling + down. Default 300 seconds. + format: int32 + minimum: 0 + type: integer + scaleUpStabilizationTime: + default: 0 + description: The time window to stabilize for HPA before scaling + up. Default 0 seconds. + format: int32 + minimum: 0 + type: integer + type: object + enabled: + default: false + description: Once set, the enabled field cannot be modified + type: boolean + routingStrategy: + default: prefixaware + enum: + - prefixaware + - kvaware + - session + - roundrobin + type: string + type: object + invocationEndpoint: + default: invocations + description: |- + The invocation endpoint of the model server. http://:/ would be pre-populated based on the other fields. + Please fill in the path after http://:/ specific to your model server. + type: string + kvCacheSpec: + description: |- + Configuration for KV Cache specification + By default L1CacheOffloading will be enabled + properties: + cacheConfigFile: + description: KVCache configuration file path. If specified, override + other configurations provided via spec + type: string + enableL1Cache: + default: true + description: Enable CPU offloading + type: boolean + enableL2Cache: + default: false + type: boolean + l2CacheSpec: + description: Configuration for providing L2 Cache offloading + properties: + l2CacheBackend: + description: L2 cache backend type. Required when L2CacheSpec + is provided. + pattern: (?i)redis + type: string + l2CacheLocalUrl: + description: Provide the L2 cache URL to local storage + type: string + type: object + type: object + loadBalancer: + description: Configuration for Application Load Balancer + properties: + healthCheckPath: + default: /ping + description: Health check path for the ALB target group. Defaults + to /ping if not specified. + pattern: ^/.* + type: string + routingAlgorithm: + default: least_outstanding_requests + description: Routing algorithm for the ALB target group (least_oustanding_requests + or round_robin) + enum: + - least_outstanding_requests + - round_robin + type: string + type: object + metrics: + description: Configuration for metrics collection and exposure + properties: + enabled: + default: true + description: Enable metrics collection for this model deployment + type: boolean + metricsScrapeIntervalSeconds: + default: 15 + description: Scrape interval in seconds for metrics collection + from sidecar and model container. + format: int32 + maximum: 300 + minimum: 5 + type: integer + modelMetrics: + description: Configuration for model container metrics scraping + properties: + path: + default: /metrics + description: Path where the model exposes metrics + pattern: ^/[a-zA-Z0-9\-_.\/]*$ + type: string + port: + default: 8080 + description: Port where the model exposes metrics. If not + specified, a default port will be used. 
+ format: int32 + maximum: 65535 + minimum: 1024 + type: integer + type: object + type: object + modelName: + description: Name of model that will be created on Sagemaker + maxLength: 63 + pattern: ^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62} + type: string + modelSourceConfig: + properties: + fsxStorage: + properties: + dnsName: + description: FSX File System DNS Name + type: string + fileSystemId: + description: FSX File System ID + type: string + mountName: + description: FSX File System Mount Name + type: string + required: + - fileSystemId + type: object + modelLocation: + description: Specific location where the model data exists + type: string + modelSourceType: + enum: + - fsx + - s3 + type: string + prefetchEnabled: + default: false + description: In case the model seems to fit within the instance's + memory (VRAM), this option can be used to pre-fetch the model + to RAM and then the inference server will load to the GPU/CPU + device thereafter. + type: boolean + s3Storage: + properties: + bucketName: + description: S3 bucket location + type: string + region: + description: S3 bucket region + type: string + required: + - bucketName + - region + type: object + required: + - modelSourceType + type: object + modelVersion: + description: Version of the model used in creating sagemaker endpoint + type: string + replicas: + default: 1 + description: The desired number of inference server replicas. Default + 1. + format: int32 + type: integer + tags: + description: Mentions the tags to be added to the Sagemaker Endpoint + items: + properties: + name: + type: string + value: + type: string + required: + - name + - value + type: object + type: array + tlsConfig: + description: Configurations for TLS + properties: + tlsCertificateOutputS3Uri: + pattern: ^s3://([^/]+)/?(.*)$ + type: string + type: object + worker: + description: Details of the worker + properties: + args: + description: Defines the Arguments to the entrypoint. + items: + type: string + type: array + command: + description: Defines the Command which is Entrypoint array. Not + executed within a shell. + items: + type: string + type: array + environmentVariables: + description: |- + List of environment variables to set in the container. + Cannot be updated. + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. Must be a + C_IDENTIFIER. + type: string + value: + description: |- + Variable references $(VAR_NAME) are expanded + using the previously defined environment variables in the container and + any service environment variables. If a variable cannot be resolved, + the reference in the input string will be unchanged. Double $$ are reduced + to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. + "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". + Escaped references will never be expanded, regardless of whether the variable + exists or not. + Defaults to "". + type: string + valueFrom: + description: Source for the environment variable's value. + Cannot be used if value is not empty. + properties: + configMapKeyRef: + description: Selects a key of a ConfigMap. + properties: + key: + description: The key to select. + type: string + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. 
+ More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the ConfigMap or its + key must be defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + fieldRef: + description: |- + Selects a field of the pod: supports metadata.name, metadata.namespace, `metadata.labels['']`, `metadata.annotations['']`, + spec.nodeName, spec.serviceAccountName, status.hostIP, status.podIP, status.podIPs. + properties: + apiVersion: + description: Version of the schema the FieldPath + is written in terms of, defaults to "v1". + type: string + fieldPath: + description: Path of the field to select in the + specified API version. + type: string + required: + - fieldPath + type: object + x-kubernetes-map-type: atomic + resourceFieldRef: + description: |- + Selects a resource of the container: only resources limits and requests + (limits.cpu, limits.memory, limits.ephemeral-storage, requests.cpu, requests.memory and requests.ephemeral-storage) are currently supported. + properties: + containerName: + description: 'Container name: required for volumes, + optional for env vars' + type: string + divisor: + anyOf: + - type: integer + - type: string + description: Specifies the output format of the + exposed resources, defaults to "1" + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + resource: + description: 'Required: resource to select' + type: string + required: + - resource + type: object + x-kubernetes-map-type: atomic + secretKeyRef: + description: Selects a key of a secret in the pod's + namespace + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the Secret or its key + must be defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + type: object + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + image: + description: The name of the inference server image to be used + type: string + modelInvocationPort: + description: Defines the port at which the model server will listen + to the invocation requests. + properties: + containerPort: + description: Port on which the model server will be listening + format: int32 + maximum: 65535 + minimum: 1 + type: integer + name: + default: http + description: |- + This is name for the port within the deployed container where the model will listen. + This will be referred to by the Load Balancer Service. + This must be an IANA_SVC_NAME (for eg. http) and unique within the pod. 
+ pattern: ^http$|^grpc$ + type: string + required: + - containerPort + type: object + modelVolumeMount: + description: Defines the volume where model will be loaded + properties: + mountPath: + default: /opt/ml/model + description: This is the path within the container where the + model data will be available for the inference server to + load it to GPU,CPU or other device + type: string + name: + description: Name of the model volume mount + type: string + required: + - name + type: object + resources: + description: Defines the Resources in terms of CPU, GPU, Memory + needed for the model to be deployed + properties: + claims: + description: |- + Claims lists the names of resources, defined in spec.resourceClaims, + that are used by this container. + + This is an alpha field and requires enabling the + DynamicResourceAllocation feature gate. + + This field is immutable. It can only be set for containers. + items: + description: ResourceClaim references one entry in PodSpec.ResourceClaims. + properties: + name: + description: |- + Name must match the name of one entry in pod.spec.resourceClaims of + the Pod where this field is used. It makes that resource available + inside a container. + type: string + request: + description: |- + Request is the name chosen for a request in the referenced claim. + If empty, everything from the claim is made available, otherwise + only the result of this request. + type: string + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + workingDir: + description: Defines the working directory of container. + type: string + required: + - image + - modelInvocationPort + - modelVolumeMount + - resources + type: object + required: + - instanceType + - modelName + - modelSourceConfig + - worker + type: object + status: + description: ModelDeploymentStatus defines the observed state of ModelDeployment + properties: + conditions: + description: Detailed conditions representing the state of the deployment + items: + description: Condition contains details for one aspect of the current + state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. 
+ format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + deploymentStatus: + description: Details of the native kubernetes deployment that hosts + the model + properties: + deploymentObjectOverallState: + description: Overall State of the Deployment Object + type: string + lastUpdated: + description: Last Update Time + format: date-time + type: string + message: + description: Message populated in the root CRD while updating + the status of underlying Deployment + type: string + name: + description: Name of the Deployment Object + type: string + reason: + description: Reason populated in the root CRD while updating the + status of underlying Deployment + type: string + status: + description: Status of the Deployment Object + properties: + availableReplicas: + description: Total number of available pods (ready for at + least minReadySeconds) targeted by this deployment. + format: int32 + type: integer + collisionCount: + description: |- + Count of hash collisions for the Deployment. The Deployment controller uses this + field as a collision avoidance mechanism when it needs to create the name for the + newest ReplicaSet. + format: int32 + type: integer + conditions: + description: Represents the latest available observations + of a deployment's current state. + items: + description: DeploymentCondition describes the state of + a deployment at a certain point. + properties: + lastTransitionTime: + description: Last time the condition transitioned from + one status to another. + format: date-time + type: string + lastUpdateTime: + description: The last time this condition was updated. + format: date-time + type: string + message: + description: A human readable message indicating details + about the transition. + type: string + reason: + description: The reason for the condition's last transition. + type: string + status: + description: Status of the condition, one of True, False, + Unknown. + type: string + type: + description: Type of deployment condition. 
+ type: string + required: + - status + - type + type: object + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map + observedGeneration: + description: The generation observed by the deployment controller. + format: int64 + type: integer + readyReplicas: + description: readyReplicas is the number of pods targeted + by this Deployment with a Ready Condition. + format: int32 + type: integer + replicas: + description: Total number of non-terminated pods targeted + by this deployment (their labels match the selector). + format: int32 + type: integer + unavailableReplicas: + description: |- + Total number of unavailable pods targeted by this deployment. This is the total number of + pods that are still required for the deployment to have 100% available capacity. They may + either be pods that are running but not yet available or pods that still have not been created. + format: int32 + type: integer + updatedReplicas: + description: Total number of non-terminated pods targeted + by this deployment that have the desired template spec. + format: int32 + type: integer + type: object + required: + - lastUpdated + - name + type: object + endpoints: + description: EndpointStatus contains the status of SageMaker endpoints + properties: + sagemaker: + description: Status of the SageMaker endpoint + properties: + configArn: + description: The Amazon Resource Name (ARN) of the endpoint + configuration. + pattern: (arn:aws[a-z\-]*:sagemaker:[a-z0-9\-]*:[0-9]{12}:endpoint-config/.*|^$) + type: string + endpointArn: + description: The Amazon Resource Name (ARN) of the SageMaker + endpoint + pattern: (arn:aws[a-z\-]*:sagemaker:[a-z0-9\-]*:[0-9]{12}:endpoint/.*|^$) + type: string + modelArn: + description: The ARN of the model created in SageMaker. + pattern: (arn:aws[a-z\-]*:sagemaker:[a-z0-9\-]*:[0-9]{12}:model/.*|^$) + type: string + state: + description: The current state of the SageMaker endpoint + type: string + required: + - state + type: object + type: object + metricsStatus: + description: Status of metrics collection + properties: + enabled: + description: Whether metrics collection is enabled + type: boolean + errorMessage: + description: Error message if metrics collection is in error state + type: string + metricsScrapeIntervalSeconds: + description: Scrape interval in seconds for metrics collection + from sidecar and model container. + format: int32 + type: integer + modelMetrics: + description: Status of model container metrics collection + properties: + path: + description: The path where metrics are available + type: string + port: + description: The port on which metrics are exposed + format: int32 + type: integer + type: object + state: + description: Current state of metrics collection + type: string + required: + - enabled + type: object + observedGeneration: + description: Latest generation reconciled by controller + format: int64 + type: integer + replicas: + description: The observed number of inference server replicas. + format: int32 + type: integer + selector: + description: LabelSelector for the deployment. 
+ type: string + state: + description: Current phase of the model deployment + enum: + - DeploymentPending + - DeploymentInProgress + - DeploymentFailed + - DeploymentComplete + - DeletionPending + - DeletionInProgress + - DeletionFailed + - DeletionComplete + type: string + tlsCertificate: + description: CertificateStatus represents the status of TLS certificates + properties: + certificateARN: + description: The Amazon Resource Name (ARN) of the ACM certificate + pattern: arn:aws[a-z\-]*:acm:[a-z0-9\-]*:[0-9]{12}:certificate/.* + type: string + certificateDomainNames: + description: The certificate domain names that is attached to + the certificate + items: + type: string + type: array + certificateName: + description: The certificate name of cert manager + type: string + importedCertificates: + description: Used for tracking the imported certificates to ACM + items: + type: string + type: array + issuerName: + description: The issuer name of cert manager + type: string + lastCertExpiryTime: + description: The last certificate expiry time + format: date-time + type: string + tlsCertificateOutputS3Bucket: + description: S3 bucket that stores the certificate that needs + to be trusted + type: string + tlsCertificateS3Keys: + description: The output tls certificate S3 key that points to + the .pem file + items: + type: string + type: array + type: object + type: object + type: object + served: true + storage: true + subresources: + scale: + labelSelectorPath: .status.selector + specReplicasPath: .spec.replicas + statusReplicasPath: .status.replicas + status: {} - name: v1alpha1 schema: openAPIV3Schema: @@ -214,6 +1583,7 @@ spec: type: object endpointName: description: |- + Name used for Sagemaker Endpoint Name of a SageMaker endpoint to be created for this InferenceEndpointConfig. The default value of empty string, when used, will skip endpoint creation. maxLength: 63 @@ -229,6 +1599,30 @@ spec: The invocation endpoint of the model server. http://:/ would be pre-populated based on the other fields. Please fill in the path after http://:/ specific to your model server. type: string + loadBalancer: + description: Configuration for Application Load Balancer + properties: + healthCheckPath: + default: /ping + description: Health check path for the ALB target group. Defaults + to /ping if not specified. + pattern: ^/.* + type: string + routingAlgorithm: + default: least_outstanding_requests + description: Routing algorithm for the ALB target group (least_oustanding_requests + or round_robin) + enum: + - least_outstanding_requests + - round_robin + type: string + type: object + maxDeployTimeInSeconds: + default: 3600 + description: Maximum allowed time in seconds for the deployment to + complete before timing out. 
Defaults to 1 hour (3600 seconds) + format: int32 + type: integer metrics: description: Configuration for metrics collection and exposure properties: @@ -284,7 +1678,7 @@ spec: - fileSystemId type: object modelLocation: - description: Sepcific location where the model data exists + description: Specific location where the model data exists type: string modelSourceType: enum: @@ -870,7 +2264,7 @@ spec: type: object type: object served: true - storage: true + storage: false subresources: scale: labelSelectorPath: .status.selector diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/crd/inference.sagemaker.aws.amazon.com_jumpstartmodels.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/crd/inference.sagemaker.aws.amazon.com_jumpstartmodels.yaml index 342de2bb..68ea257e 100644 --- a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/crd/inference.sagemaker.aws.amazon.com_jumpstartmodels.yaml +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/crd/inference.sagemaker.aws.amazon.com_jumpstartmodels.yaml @@ -1,11 +1,21 @@ ---- apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: + cert-manager.io/inject-ca-from: '{{ .Values.shortPrefix }}-system/serving-cert' controller-gen.kubebuilder.io/version: v0.16.4 name: jumpstartmodels.inference.sagemaker.aws.amazon.com spec: + conversion: + strategy: Webhook + webhook: + clientConfig: + service: + name: '{{ .Values.namePrefix }}-conversion-webhook' + namespace: '{{ .Values.shortPrefix }}-system' + path: /convert + conversionReviewVersions: + - v1 group: inference.sagemaker.aws.amazon.com names: kind: JumpStartModel @@ -14,6 +24,780 @@ spec: singular: jumpstartmodel scope: Namespaced versions: + - name: v1 + schema: + openAPIV3Schema: + description: JumpStartModel is the Schema for the jumpstartmodels API. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: JumpStartModelSpec defines the desired state of JumpStartModel. + properties: + autoScalingSpec: + properties: + cloudWatchTrigger: + description: CloudWatch metric trigger to use for autoscaling + properties: + activationTargetValue: + default: 0 + description: Activation Value for CloudWatch metric to scale + from 0 to 1. 
Only applicable if minReplicaCount = 0 + type: number + dimensions: + description: Dimensions for Cloudwatch metrics + items: + properties: + name: + description: CloudWatch Metric dimension name + type: string + value: + description: CloudWatch Metric dimension value + type: string + required: + - name + - value + type: object + type: array + metricCollectionPeriod: + default: 300 + description: Defines the Period for CloudWatch query + format: int32 + type: integer + metricCollectionStartTime: + default: 300 + description: Defines the StartTime for CloudWatch query + format: int32 + type: integer + metricName: + description: Metric name to query for Cloudwatch trigger + type: string + metricStat: + default: Average + description: Statistics metric to be used by Trigger. Used + to define Stat for CloudWatch query. Default is Average. + type: string + metricType: + default: Average + description: 'The type of metric to be used by HPA. Enum: + AverageValue - Uses average value of metric per pod, Value + - Uses absolute metric value' + enum: + - Value + - Average + type: string + minValue: + default: 0 + description: Minimum metric value used in case of empty response + from CloudWatch. Default is 0. + type: number + name: + description: Name for the CloudWatch trigger + type: string + namespace: + description: AWS CloudWatch namespace for metric + type: string + targetValue: + description: TargetValue for CloudWatch metric + type: number + useCachedMetrics: + default: true + description: Enable caching of metric values during polling + interval. Default is true + type: boolean + type: object + cloudWatchTriggerList: + description: Multiple CloudWatch metric triggers to use for autoscaling. + Takes priority over CloudWatchTrigger if both are provided. + items: + properties: + activationTargetValue: + default: 0 + description: Activation Value for CloudWatch metric to scale + from 0 to 1. Only applicable if minReplicaCount = 0 + type: number + dimensions: + description: Dimensions for Cloudwatch metrics + items: + properties: + name: + description: CloudWatch Metric dimension name + type: string + value: + description: CloudWatch Metric dimension value + type: string + required: + - name + - value + type: object + type: array + metricCollectionPeriod: + default: 300 + description: Defines the Period for CloudWatch query + format: int32 + type: integer + metricCollectionStartTime: + default: 300 + description: Defines the StartTime for CloudWatch query + format: int32 + type: integer + metricName: + description: Metric name to query for Cloudwatch trigger + type: string + metricStat: + default: Average + description: Statistics metric to be used by Trigger. Used + to define Stat for CloudWatch query. Default is Average. + type: string + metricType: + default: Average + description: 'The type of metric to be used by HPA. Enum: + AverageValue - Uses average value of metric per pod, Value + - Uses absolute metric value' + enum: + - Value + - Average + type: string + minValue: + default: 0 + description: Minimum metric value used in case of empty + response from CloudWatch. Default is 0. + type: number + name: + description: Name for the CloudWatch trigger + type: string + namespace: + description: AWS CloudWatch namespace for metric + type: string + targetValue: + description: TargetValue for CloudWatch metric + type: number + useCachedMetrics: + default: true + description: Enable caching of metric values during polling + interval. 
Default is true + type: boolean + type: object + maxItems: 100 + type: array + cooldownPeriod: + default: 300 + description: The period to wait after the last trigger reported + active before scaling the resource back to 0. Default 300 seconds. + format: int32 + minimum: 0 + type: integer + initialCooldownPeriod: + default: 300 + description: The delay before the cooldownPeriod starts after + the initial creation of the ScaledObject. Default 300 seconds. + format: int32 + minimum: 0 + type: integer + maxReplicaCount: + default: 5 + description: The maximum number of model pods to scale to. Default + 5. + format: int32 + minimum: 0 + type: integer + minReplicaCount: + default: 1 + description: The minimum number of model pods to scale down to. + Default 1. + format: int32 + minimum: 0 + type: integer + pollingInterval: + default: 30 + description: This is the interval to check each trigger on. Default + 30 seconds. + format: int32 + minimum: 0 + type: integer + prometheusTrigger: + description: Prometheus metric trigger to use for autoscaling + properties: + activationTargetValue: + default: 0 + description: Activation Value for Prometheus metric to scale + from 0 to 1. Only applicable if minReplicaCount = 0 + type: number + customHeaders: + description: Custom headers to include while querying the + prometheus endpoint. + type: string + metricType: + default: Average + description: 'The type of metric to be used by HPA. Enum: + AverageValue - Uses average value of metric per pod, Value + - Uses absolute metric value' + enum: + - Value + - Average + type: string + name: + description: Name for the Prometheus trigger + type: string + namespace: + description: Namespace for namespaced queries + type: string + query: + description: PromQLQuery for the metric. + type: string + serverAddress: + description: Server address for AMP workspace + pattern: ^https:\/\/aps-workspaces\.[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*\.amazonaws\.com\/workspaces\/ws-[a-zA-Z0-9-]+\/[a-zA-Z0-9-]+$|^$ + type: string + targetValue: + description: Target metric value for scaling + type: number + useCachedMetrics: + default: true + description: Enable caching of metric values during polling + interval. Default is true + type: boolean + type: object + prometheusTriggerList: + description: Multiple Prometheus metric triggers to use for autoscaling. + Takes priority over PrometheusTrigger if both are provided. + items: + properties: + activationTargetValue: + default: 0 + description: Activation Value for Prometheus metric to scale + from 0 to 1. Only applicable if minReplicaCount = 0 + type: number + customHeaders: + description: Custom headers to include while querying the + prometheus endpoint. + type: string + metricType: + default: Average + description: 'The type of metric to be used by HPA. Enum: + AverageValue - Uses average value of metric per pod, Value + - Uses absolute metric value' + enum: + - Value + - Average + type: string + name: + description: Name for the Prometheus trigger + type: string + namespace: + description: Namespace for namespaced queries + type: string + query: + description: PromQLQuery for the metric. 
+ type: string + serverAddress: + description: Server address for AMP workspace + pattern: ^https:\/\/aps-workspaces\.[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*\.amazonaws\.com\/workspaces\/ws-[a-zA-Z0-9-]+\/[a-zA-Z0-9-]+$|^$ + type: string + targetValue: + description: Target metric value for scaling + type: number + useCachedMetrics: + default: true + description: Enable caching of metric values during polling + interval. Default is true + type: boolean + type: object + maxItems: 100 + type: array + scaleDownStabilizationTime: + default: 300 + description: The time window to stabilize for HPA before scaling + down. Default 300 seconds. + format: int32 + minimum: 0 + type: integer + scaleUpStabilizationTime: + default: 0 + description: The time window to stabilize for HPA before scaling + up. Default 0 seconds. + format: int32 + minimum: 0 + type: integer + type: object + environmentVariables: + description: Additional environment variables to be passed to the + inference server. Limited to 100 key-value pairs. + items: + properties: + name: + type: string + value: + type: string + required: + - name + - value + type: object + maxItems: 100 + type: array + loadBalancer: + description: Configuration for Application Load Balancer + properties: + healthCheckPath: + default: /ping + description: Health check path for the ALB target group. Defaults + to /ping if not specified. + pattern: ^/.* + type: string + routingAlgorithm: + default: least_outstanding_requests + description: Routing algorithm for the ALB target group (least_oustanding_requests + or round_robin) + enum: + - least_outstanding_requests + - round_robin + type: string + type: object + maxDeployTimeInSeconds: + default: 3600 + description: Maximum allowed time in seconds for the deployment to + complete before timing out. Defaults to 1 hour (3600 seconds) + format: int32 + type: integer + metrics: + description: Configuration for metrics collection and exposure + properties: + enabled: + default: true + description: Enable metrics collection for this model deployment + type: boolean + metricsScrapeIntervalSeconds: + default: 15 + description: Scrape interval in seconds for metrics collection + from sidecar and model container. + format: int32 + maximum: 300 + minimum: 5 + type: integer + modelMetrics: + description: Configuration for model container metrics scraping + properties: + path: + default: /metrics + description: Path where the model exposes metrics + pattern: ^/[a-zA-Z0-9\-_.\/]*$ + type: string + port: + default: 8080 + description: Port where the model exposes metrics. If not + specified, a default port will be used. + format: int32 + maximum: 65535 + minimum: 1024 + type: integer + type: object + type: object + model: + properties: + acceptEula: + default: false + description: For models that require a Model Access Config, specify + True or False to indicate whether model terms of use have been + accepted. + type: boolean + additionalConfigs: + items: + properties: + name: + type: string + value: + type: string + required: + - name + - value + type: object + maxItems: 10 + type: array + gatedModelDownloadRole: + description: The Amazon Resource Name (ARN) of an IAM role that + will be used to download gated model + maxLength: 2048 + minLength: 20 + pattern: ^arn:aws[a-z\-]*:iam::\d{12}:role/?[a-zA-Z_0-9+=,.@\-_/]+$ + type: string + modelHubName: + default: SageMakerPublicHub + description: The name of the model hub content. Can be an ARN + or a simple name. 
+ maxLength: 63 + pattern: ^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}$ + type: string + modelId: + description: The unique identifier of the model within the specified + hub (hubContentArn). + maxLength: 63 + pattern: ^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}$ + type: string + modelVersion: + description: The version of the model to deploy, in semantic versioning + format (e.g., 1.0.0). + maxLength: 14 + minLength: 5 + pattern: ^\d{1,4}.\d{1,4}.\d{1,4}$ + type: string + required: + - acceptEula + - modelId + type: object + replicas: + default: 1 + description: The desired number of inference server replicas. Default + 1. + format: int32 + type: integer + sageMakerEndpoint: + properties: + name: + default: "" + description: Name of sagemaker endpoint. Defaults to empty string + which represents that Sagemaker endpoint will not be created. + maxLength: 63 + pattern: ^$|^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}$ + type: string + type: object + server: + properties: + executionRole: + description: The Amazon Resource Name (ARN) of an IAM role that + will be used to deploy and manage the inference server + maxLength: 2048 + minLength: 20 + pattern: ^arn:aws[a-z\-]*:iam::\d{12}:role/?[a-zA-Z_0-9+=,.@\-_/]+$ + type: string + instanceType: + description: The EC2 instance type to use for the inference server. + Must be one of the supported types. + pattern: ^ml\..* + type: string + required: + - instanceType + type: object + tlsConfig: + properties: + tlsCertificateOutputS3Uri: + pattern: ^s3://([^/]+)/?(.*)$ + type: string + type: object + required: + - model + - server + type: object + status: + description: ModelDeploymentStatus defines the observed state of ModelDeployment + properties: + conditions: + description: Detailed conditions representing the state of the deployment + items: + description: Condition contains details for one aspect of the current + state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. 
+ maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + deploymentStatus: + description: Details of the native kubernetes deployment that hosts + the model + properties: + deploymentObjectOverallState: + description: Overall State of the Deployment Object + type: string + lastUpdated: + description: Last Update Time + format: date-time + type: string + message: + description: Message populated in the root CRD while updating + the status of underlying Deployment + type: string + name: + description: Name of the Deployment Object + type: string + reason: + description: Reason populated in the root CRD while updating the + status of underlying Deployment + type: string + status: + description: Status of the Deployment Object + properties: + availableReplicas: + description: Total number of available pods (ready for at + least minReadySeconds) targeted by this deployment. + format: int32 + type: integer + collisionCount: + description: |- + Count of hash collisions for the Deployment. The Deployment controller uses this + field as a collision avoidance mechanism when it needs to create the name for the + newest ReplicaSet. + format: int32 + type: integer + conditions: + description: Represents the latest available observations + of a deployment's current state. + items: + description: DeploymentCondition describes the state of + a deployment at a certain point. + properties: + lastTransitionTime: + description: Last time the condition transitioned from + one status to another. + format: date-time + type: string + lastUpdateTime: + description: The last time this condition was updated. + format: date-time + type: string + message: + description: A human readable message indicating details + about the transition. + type: string + reason: + description: The reason for the condition's last transition. + type: string + status: + description: Status of the condition, one of True, False, + Unknown. + type: string + type: + description: Type of deployment condition. + type: string + required: + - status + - type + type: object + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map + observedGeneration: + description: The generation observed by the deployment controller. + format: int64 + type: integer + readyReplicas: + description: readyReplicas is the number of pods targeted + by this Deployment with a Ready Condition. + format: int32 + type: integer + replicas: + description: Total number of non-terminated pods targeted + by this deployment (their labels match the selector). + format: int32 + type: integer + unavailableReplicas: + description: |- + Total number of unavailable pods targeted by this deployment. This is the total number of + pods that are still required for the deployment to have 100% available capacity. They may + either be pods that are running but not yet available or pods that still have not been created. + format: int32 + type: integer + updatedReplicas: + description: Total number of non-terminated pods targeted + by this deployment that have the desired template spec. 
+ format: int32 + type: integer + type: object + required: + - lastUpdated + - name + type: object + endpoints: + description: EndpointStatus contains the status of SageMaker endpoints + properties: + sagemaker: + description: Status of the SageMaker endpoint + properties: + configArn: + description: The Amazon Resource Name (ARN) of the endpoint + configuration. + pattern: (arn:aws[a-z\-]*:sagemaker:[a-z0-9\-]*:[0-9]{12}:endpoint-config/.*|^$) + type: string + endpointArn: + description: The Amazon Resource Name (ARN) of the SageMaker + endpoint + pattern: (arn:aws[a-z\-]*:sagemaker:[a-z0-9\-]*:[0-9]{12}:endpoint/.*|^$) + type: string + modelArn: + description: The ARN of the model created in SageMaker. + pattern: (arn:aws[a-z\-]*:sagemaker:[a-z0-9\-]*:[0-9]{12}:model/.*|^$) + type: string + state: + description: The current state of the SageMaker endpoint + type: string + required: + - state + type: object + type: object + metricsStatus: + description: Status of metrics collection + properties: + enabled: + description: Whether metrics collection is enabled + type: boolean + errorMessage: + description: Error message if metrics collection is in error state + type: string + metricsScrapeIntervalSeconds: + description: Scrape interval in seconds for metrics collection + from sidecar and model container. + format: int32 + type: integer + modelMetrics: + description: Status of model container metrics collection + properties: + path: + description: The path where metrics are available + type: string + port: + description: The port on which metrics are exposed + format: int32 + type: integer + type: object + state: + description: Current state of metrics collection + type: string + required: + - enabled + type: object + observedGeneration: + description: Latest generation reconciled by controller + format: int64 + type: integer + replicas: + description: The observed number of inference server replicas. + format: int32 + type: integer + selector: + description: LabelSelector for the deployment. 
+ type: string + state: + description: Current phase of the model deployment + enum: + - DeploymentPending + - DeploymentInProgress + - DeploymentFailed + - DeploymentComplete + - DeletionPending + - DeletionInProgress + - DeletionFailed + - DeletionComplete + type: string + tlsCertificate: + description: CertificateStatus represents the status of TLS certificates + properties: + certificateARN: + description: The Amazon Resource Name (ARN) of the ACM certificate + pattern: arn:aws[a-z\-]*:acm:[a-z0-9\-]*:[0-9]{12}:certificate/.* + type: string + certificateDomainNames: + description: The certificate domain names that is attached to + the certificate + items: + type: string + type: array + certificateName: + description: The certificate name of cert manager + type: string + importedCertificates: + description: Used for tracking the imported certificates to ACM + items: + type: string + type: array + issuerName: + description: The issuer name of cert manager + type: string + lastCertExpiryTime: + description: The last certificate expiry time + format: date-time + type: string + tlsCertificateOutputS3Bucket: + description: S3 bucket that stores the certificate that needs + to be trusted + type: string + tlsCertificateS3Keys: + description: The output tls certificate S3 key that points to + the .pem file + items: + type: string + type: array + type: object + type: object + type: object + served: true + storage: true + subresources: + scale: + labelSelectorPath: .status.selector + specReplicasPath: .spec.replicas + statusReplicasPath: .status.replicas + status: {} - name: v1alpha1 schema: openAPIV3Schema: @@ -219,6 +1003,24 @@ spec: type: object maxItems: 100 type: array + loadBalancer: + description: Configuration for Application Load Balancer + properties: + healthCheckPath: + default: /ping + description: Health check path for the ALB target group. Defaults + to /ping if not specified. + pattern: ^/.* + type: string + routingAlgorithm: + default: least_outstanding_requests + description: Routing algorithm for the ALB target group (least_oustanding_requests + or round_robin) + enum: + - least_outstanding_requests + - round_robin + type: string + type: object maxDeployTimeInSeconds: default: 3600 description: Maximum allowed time in seconds for the deployment to @@ -320,8 +1122,9 @@ spec: properties: name: default: "" - description: Name of a SageMaker endpoint to be created for this JumpStartModel. - The default value of empty string, when used, will skip endpoint creation. + description: |- + Name of a SageMaker endpoint to be created for this JumpStartModel. 
+ The default value of empty string, when used, will skip endpoint creatio maxLength: 63 pattern: ^$|^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}$ type: string @@ -642,7 +1445,7 @@ spec: type: object type: object served: true - storage: true + storage: false subresources: scale: labelSelectorPath: .status.selector diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/crd/inference.sagemaker.aws.amazon.com_sagemakerendpointregistrations.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/crd/inference.sagemaker.aws.amazon.com_sagemakerendpointregistrations.yaml index 0ff9aca8..80f1c56a 100644 --- a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/crd/inference.sagemaker.aws.amazon.com_sagemakerendpointregistrations.yaml +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/crd/inference.sagemaker.aws.amazon.com_sagemakerendpointregistrations.yaml @@ -1,11 +1,21 @@ ---- apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: + cert-manager.io/inject-ca-from: '{{ .Values.shortPrefix }}-system/serving-cert' controller-gen.kubebuilder.io/version: v0.16.4 name: sagemakerendpointregistrations.inference.sagemaker.aws.amazon.com spec: + conversion: + strategy: Webhook + webhook: + clientConfig: + service: + name: '{{ .Values.namePrefix }}-conversion-webhook' + namespace: '{{ .Values.shortPrefix }}-system' + path: /convert + conversionReviewVersions: + - v1 group: inference.sagemaker.aws.amazon.com names: kind: SageMakerEndpointRegistration @@ -14,7 +24,7 @@ spec: singular: sagemakerendpointregistration scope: Namespaced versions: - - name: v1alpha1 + - name: v1 schema: openAPIV3Schema: description: SageMakerEndpointRegistration is the Schema for the sagemakerendpointregistrations @@ -88,6 +98,10 @@ spec: description: InstanceType is the ML compute instance type used for EndpointConfig creation type: string + invocationEndpoint: + default: invocations + description: The invocation endpoint path used by the model server + type: string loadBalancerHostName: description: Needed to embed the LB Host Name type: string @@ -248,3 +262,241 @@ spec: storage: true subresources: status: {} + - name: v1alpha1 + schema: + openAPIV3Schema: + description: SageMakerEndpointRegistration is the Schema for the sagemakerendpointregistrations + API + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: SageMakerEndpointRegistrationSpec defines the desired state + of SageMakerEndpointRegistration + properties: + eksClusterDetails: + properties: + arn: + description: Stores cluster ARN + type: string + clusterSecurityGroupId: + description: Stores ClusterSecurityGroup of the EKS Cluster + type: string + name: + description: Stores cluster name + type: string + securityGroupIds: + description: Stores AdditionalSecurityGroupIds of the EKS Cluster + items: + type: string + type: array + subnetIds: + description: Stores SubnetIDs of the EKS Cluster + items: + type: string + type: array + vpcId: + description: Stores VPC Id of the EKS Cluster + type: string + required: + - arn + - clusterSecurityGroupId + - name + - securityGroupIds + - subnetIds + - vpcId + type: object + executionRole: + description: The Amazon Resource Name (ARN) of an IAM role that will + be used to create model, endpoint config, and the endpoint + maxLength: 2048 + minLength: 20 + pattern: ^arn:aws[a-z\-]*:iam::\d{12}:role/?[a-zA-Z_0-9+=,.@\-_/]+$ + type: string + imageUri: + description: The ImageUri where inference code is stored + maxLength: 255 + type: string + instanceType: + description: InstanceType is the ML compute instance type used for + EndpointConfig creation + type: string + invocationEndpoint: + default: invocations + description: The invocation endpoint path used by the model server + type: string + loadBalancerHostName: + description: Needed to embed the LB Host Name + type: string + name: + description: Name used for AWS resource creation + maxLength: 63 + pattern: ^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62} + type: string + restApiId: + description: REST API Gateway identifier that proxies requests to + the HyperPod endpoint (via NLB/ALB) + type: string + tlsConfig: + properties: + tlsCertificateOutputS3Bucket: + description: S3 bucket that stores the certificate that needs + to be trusted + type: string + tlsCertificateS3Keys: + description: The output tls certificate S3 key that points to + the .pem file + items: + type: string + type: array + tlsServerNameOverride: + description: The server name override for tls certificate selection + type: string + required: + - tlsCertificateOutputS3Bucket + - tlsCertificateS3Keys + type: object + required: + - eksClusterDetails + - executionRole + - imageUri + - instanceType + - loadBalancerHostName + - name + - restApiId + - tlsConfig + type: object + status: + description: SageMakerEndpointRegistrationStatus defines the observed + state of SageMakerEndpointRegistration + properties: + conditions: + description: Detailed conditions representing the state of the deployment + items: + description: Condition contains details for one aspect of the current + state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. 
+ For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + endpoint: + description: Endpoint Metadata + properties: + arn: + description: The Amazon Resource Name (ARN) of the SageMaker endpoint + pattern: (arn:aws[a-z\-]*:sagemaker:[a-z0-9\-]*:[0-9]{12}:endpoint/.*|^$) + type: string + configArn: + description: The Amazon Resource Name (ARN) of the endpoint configuration. + pattern: (arn:aws[a-z\-]*:sagemaker:[a-z0-9\-]*:[0-9]{12}:endpoint-config/.*|^$) + type: string + lastModifiedTime: + description: The last modified time of SageMaker endpoint. + format: date-time + type: string + modelArn: + description: The ARN of the model created in SageMaker. 
+ pattern: (arn:aws[a-z\-]*:sagemaker:[a-z0-9\-]*:[0-9]{12}:model/.*|^$) + type: string + required: + - arn + - configArn + - modelArn + type: object + loadBalancer: + description: LoadBalancer Metadata + properties: + hostName: + description: Hostname of LoadBalancer + type: string + required: + - hostName + type: object + observedGeneration: + description: Latest generation reconciled by controller + format: int64 + type: integer + state: + description: Current phase of the Endpoint creation Step + enum: + - CreationInProgress + - CreationFailed + - CreationCompleted + - DeletionInProgress + - DeletionFailed + - DeletionCompleted + - UpdateInProgress + - UpdateFailed + - UpdateCompleted + type: string + required: + - state + type: object + type: object + served: true + storage: false + subresources: + status: {} diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/manager/manager.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/manager/manager.yaml index daf62016..9fe34cdb 100644 --- a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/manager/manager.yaml +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/manager/manager.yaml @@ -55,9 +55,14 @@ spec: - --metrics-bind-address=:8443 - --leader-elect - --health-probe-bind-address=:8081 + - --webhook-cert-path=/tmp/k8s-webhook-server/serving-certs image: "{{ .Values.image.repository }}/hyperpod-inference-operator:{{ .Values.image.tag }}" imagePullPolicy: {{ .Values.image.pullPolicy }} name: manager + ports: + - containerPort: 9443 + name: webhook-server + protocol: TCP securityContext: allowPrivilegeEscalation: false capabilities: @@ -73,8 +78,10 @@ spec: httpGet: path: /healthz port: 8081 + initialDelaySeconds: 180 failureThreshold: 120 periodSeconds: 60 + timeoutSeconds: 5 readinessProbe: httpGet: path: /readyz @@ -90,6 +97,10 @@ spec: requests: cpu: 10m memory: 64Mi + volumeMounts: + - mountPath: /tmp/k8s-webhook-server/serving-certs + name: webhook-certs + readOnly: true env: - name: HYPERPOD_CLUSTER_ARN value: {{ .Values.hyperpodClusterArn }} @@ -103,5 +114,15 @@ spec: value: {{ .Values.eksClusterName }} - name: TLS_CERTIFICATE_OUTPUT_S3URI value: {{ .Values.tlsCertificateS3Bucket }} + - name: ENABLE_WEBHOOKS + value: "{{ .Values.enableWebhooks }}" + - name: CHART_VERSION + value: {{ .Chart.Version | quote }} + - name: APP_VERSION + value: {{ .Chart.AppVersion | quote }} serviceAccountName: {{ .Values.namePrefix }}-controller-manager terminationGracePeriodSeconds: 10 + volumes: + - name: webhook-certs + secret: + secretName: webhook-server-cert \ No newline at end of file diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/network-policy/allow-webhook-traffic.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/network-policy/allow-webhook-traffic.yaml new file mode 100644 index 00000000..d0119130 --- /dev/null +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/network-policy/allow-webhook-traffic.yaml @@ -0,0 +1,26 @@ +# This NetworkPolicy allows ingress traffic to your webhook server running +# as part of the controller-manager from specific namespaces and pods. 
CR(s) which uses webhooks +# will only work when applied in namespaces labeled with 'webhook: enabled' +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + labels: + app.kubernetes.io/name: {{ .Values.namePrefix }} + name: allow-webhook-traffic + namespace: {{ .Values.shortPrefix }}-system +spec: + podSelector: + matchLabels: + control-plane: {{ .Values.namePrefix }}-controller-manager + app.kubernetes.io/name: {{ .Values.namePrefix }} + policyTypes: + - Ingress + ingress: + # This allows ingress traffic from any namespace with the label webhook: enabled + - from: + - namespaceSelector: + matchLabels: + webhook: enabled # Only from namespaces with this label + ports: + - port: 443 + protocol: TCP diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/rbac/inferenceendpointconfig_editor_role.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/rbac/inferenceendpointconfig_editor_role.yaml index 3e4c59f1..453d3503 100644 --- a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/rbac/inferenceendpointconfig_editor_role.yaml +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/rbac/inferenceendpointconfig_editor_role.yaml @@ -4,7 +4,6 @@ kind: ClusterRole metadata: labels: app.kubernetes.io/name: {{ .Values.namePrefix }} - app.kubernetes.io/managed-by: kustomize name: {{ .Values.namePrefix }}-inferenceendpointconfig-editor-role namespace: {{ .Values.shortPrefix }}-system rules: diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/rbac/jumpstartmodel_editor_role.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/rbac/jumpstartmodel_editor_role.yaml index 4d0736cc..7485ea5a 100644 --- a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/rbac/jumpstartmodel_editor_role.yaml +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/rbac/jumpstartmodel_editor_role.yaml @@ -4,7 +4,6 @@ kind: ClusterRole metadata: labels: app.kubernetes.io/name: {{ .Values.namePrefix }} - app.kubernetes.io/managed-by: kustomize name: {{ .Values.namePrefix }}-jumpstartmodels-editor-role namespace: {{ .Values.shortPrefix }}-system rules: diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/rbac/pod_reader_cluster_role.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/rbac/pod_reader_cluster_role.yaml new file mode 100644 index 00000000..97d3d0b0 --- /dev/null +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/rbac/pod_reader_cluster_role.yaml @@ -0,0 +1,11 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: pod-reader +rules: +- apiGroups: [""] + resources: ["pods"] + verbs: ["get", "watch", "list"] +- apiGroups: [""] + resources: ["services"] + verbs: ["get", "watch", "list"] diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/rbac/pod_reader_cluster_role_binding.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/rbac/pod_reader_cluster_role_binding.yaml new file mode 100644 index 00000000..0d862c6a --- /dev/null +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/rbac/pod_reader_cluster_role_binding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: pod-reader-binding +subjects: +- kind: ServiceAccount + name: pod-reader + namespace: hyperpod-inference-system +roleRef: + kind: ClusterRole + name: pod-reader + apiGroup: rbac.authorization.k8s.io diff --git 
a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/rbac/pod_reader_service_account.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/rbac/pod_reader_service_account.yaml new file mode 100644 index 00000000..c5717f4a --- /dev/null +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/rbac/pod_reader_service_account.yaml @@ -0,0 +1,5 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: pod-reader + namespace: hyperpod-inference-system diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/rbac/sagemakerendpointregistration_editor_role.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/rbac/sagemakerendpointregistration_editor_role.yaml index 7009a510..298ebcc2 100644 --- a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/rbac/sagemakerendpointregistration_editor_role.yaml +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/rbac/sagemakerendpointregistration_editor_role.yaml @@ -3,7 +3,7 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: labels: - app.kubernetes.io/name: AWSCrescendoInferenceOperator + app.kubernetes.io/name: {{ .Values.namePrefix }} name: {{ .Values.namePrefix }}-sagemakerendpointregistration-editor-role namespace: {{ .Values.shortPrefix }}-system rules: diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/rbac/sagemakerendpointregistration_viewer_role.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/rbac/sagemakerendpointregistration_viewer_role.yaml index 795ae656..77472ab3 100644 --- a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/rbac/sagemakerendpointregistration_viewer_role.yaml +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/rbac/sagemakerendpointregistration_viewer_role.yaml @@ -3,7 +3,7 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: labels: - app.kubernetes.io/name: AWSCrescendoInferenceOperator + app.kubernetes.io/name: {{ .Values.namePrefix }} name: {{ .Values.namePrefix }}-sagemakerendpointregistration-viewer-role namespace: {{ .Values.shortPrefix }}-system rules: diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/huggingface-llm-phi-3-mini-4k-instruct.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/huggingface-llm-phi-3-mini-4k-instruct.yaml deleted file mode 100644 index 6ad65b64..00000000 --- a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/huggingface-llm-phi-3-mini-4k-instruct.yaml +++ /dev/null @@ -1,22 +0,0 @@ -apiVersion: inference.sagemaker.aws.amazon.com/v1alpha1 -kind: ModelDeployment -metadata: - labels: - app.kubernetes.io/name: huggingface-llm-phi-3-mini-4k-instruct-app - name: modeldeployment-sample -spec: - sageMakerEndpoint: - name: sample-sagemaker-endpoint-phi-3 - model: - modelHubName: SageMakerPublicHub - modelId: huggingface-llm-phi-3-mini-4k-instruct - modelVersion: 1.2.2 - server: - instanceType: ml.g5.8xlarge - maxAutoScaleReplicas: 2 - minAutoScaleReplicas: 1 - namespace: inference-namespace - environmentVariables: - - name: SAMPLE_ENV_VAR - value: "sample_value" - maxDeployTimeInSeconds: 1800 diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/huggingface-text2text-flan-t5-xl.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/huggingface-text2text-flan-t5-xl.yaml deleted file mode 100644 index 11736305..00000000 --- 
a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/huggingface-text2text-flan-t5-xl.yaml +++ /dev/null @@ -1,23 +0,0 @@ -apiVersion: inference.sagemaker.aws.amazon.com/v1alpha1 -kind: ModelDeployment -metadata: - labels: - app.kubernetes.io/name: huggingface-text2text-flan-t5-xl-app - name: modeldeployment-sample -spec: - sageMakerEndpoint: - name: sample-sagemaker-endpoint-t2t-flan - model: - acceptEula: true - modelHubName: SageMakerPublicHub - modelId: huggingface-text2text-flan-t5-xl - modelVersion: 1.0.0 - server: - instanceType: ml.g5.8xlarge - maxAutoScaleReplicas: 2 - minAutoScaleReplicas: 1 - namespace: inference-namespace - environmentVariables: - - name: SAMPLE_ENV_VAR - value: "sample_value" - maxDeployTimeInSeconds: 1800 diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1alpha1_deepseek_model15b.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1_deepseek_model15b.yaml similarity index 90% rename from helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1alpha1_deepseek_model15b.yaml rename to helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1_deepseek_model15b.yaml index 9214dc5a..5a065ed7 100644 --- a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1alpha1_deepseek_model15b.yaml +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1_deepseek_model15b.yaml @@ -1,4 +1,4 @@ -apiVersion: inference.sagemaker.aws.amazon.com/v1alpha1 +apiVersion: inference.sagemaker.aws.amazon.com/v1 kind: JumpStartModel metadata: labels: diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1_llama31_8b_intelligent_routing.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1_llama31_8b_intelligent_routing.yaml new file mode 100644 index 00000000..eb423d22 --- /dev/null +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1_llama31_8b_intelligent_routing.yaml @@ -0,0 +1,45 @@ +apiVersion: inference.sagemaker.aws.amazon.com/v1 +kind: InferenceEndpointConfig +metadata: + name: llama-8b-intel-routing + namespace: ns-team-a +spec: + endpointName: llama-8b-sme + modelName: Llama-3.1-8B-Instruct + instanceType: ml.g5.8xlarge + invocationEndpoint: v1/chat/completions + intelligentRoutingSpec: + enabled: true + routingStrategy: prefixaware + modelSourceConfig: + modelSourceType: s3 + s3Storage: + bucketName: + region: us-west-2 + modelLocation: llama31_8b + prefetchEnabled: false + kvCacheSpec: + enableL1Cache: true + tlsConfig: # optional field, default value from operator deployment used if tlsConfig is omitted + tlsCertificateOutputS3Uri: # e.g.: s3://tls-certs-bucket/certs + worker: + resources: + limits: + nvidia.com/gpu: "1" + requests: + cpu: "6" + memory: 30Gi + nvidia.com/gpu: "1" + image: lmcache/vllm-openai:v0.3.7 + args: + - "/opt/ml/model" + - "--max-model-len" + - "4096" + modelInvocationPort: + containerPort: 8000 + name: http + modelVolumeMount: + name: model-weights + mountPath: /opt/ml/model + loadBalancer: + healthCheckPath: /health \ No newline at end of file diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1_llama31_8b_kv_cache_l1_l2.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1_llama31_8b_kv_cache_l1_l2.yaml new file mode 100644 index 00000000..0d7e7c06 --- /dev/null +++ 
b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1_llama31_8b_kv_cache_l1_l2.yaml @@ -0,0 +1,44 @@ +apiVersion: inference.sagemaker.aws.amazon.com/v1 +kind: InferenceEndpointConfig +metadata: + name: llama-8b-kv-cache-l1-l2 + namespace: ns-team-a +spec: + endpointName: llama-8b-sme + modelName: Llama-3.1-8B-Instruct + instanceType: ml.g5.8xlarge + invocationEndpoint: v1/chat/completions + modelSourceConfig: + modelSourceType: s3 + s3Storage: + bucketName: + region: us-west-2 + modelLocation: llama31_8b + prefetchEnabled: false + kvCacheSpec: + enableL1Cache: true + enableL2Cache: true + l2CacheSpec: + l2CacheBackend: redis + l2CacheLocalUrl: # e.g.: redis://redis.ns-team-a.svc.cluster.local:6379 + tlsConfig: # optional field, default value from operator deployment used if tlsConfig is omitted + tlsCertificateOutputS3Uri: # e.g.: s3://tls-certs-bucket/certs + worker: + resources: + limits: + nvidia.com/gpu: "1" + requests: + cpu: "6" + memory: 30Gi + nvidia.com/gpu: "1" + image: lmcache/vllm-openai:v0.3.7 + args: + - "/opt/ml/model" + - "--max-model-len" + - "4096" + modelInvocationPort: + containerPort: 8000 + name: http + modelVolumeMount: + name: model-weights + mountPath: /opt/ml/model \ No newline at end of file diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1alpha1_modeldeployment.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1_modeldeployment.yaml similarity index 91% rename from helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1alpha1_modeldeployment.yaml rename to helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1_modeldeployment.yaml index 85b10844..89376139 100644 --- a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1alpha1_modeldeployment.yaml +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1_modeldeployment.yaml @@ -1,4 +1,4 @@ -apiVersion: inference.sagemaker.aws.amazon.com/v1alpha1 +apiVersion: inference.sagemaker.aws.amazon.com/v1 kind: ModelDeployment metadata: labels: diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1alpha1_s3_deepseek15b.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1_s3_deepseek15b.yaml similarity index 96% rename from helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1alpha1_s3_deepseek15b.yaml rename to helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1_s3_deepseek15b.yaml index 5857cabd..e6e9c16c 100644 --- a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1alpha1_s3_deepseek15b.yaml +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1_s3_deepseek15b.yaml @@ -1,4 +1,4 @@ -apiVersion: inference.sagemaker.aws.amazon.com/v1alpha1 +apiVersion: inference.sagemaker.aws.amazon.com/v1 kind: InferenceEndpointConfig metadata: name: deepseeks3 diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1alpha1_s3_modeldeployment.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1_s3_modeldeployment.yaml similarity index 96% rename from helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1alpha1_s3_modeldeployment.yaml rename to helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1_s3_modeldeployment.yaml index 87f81ecb..deb7cbb4 100644 --- 
a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1alpha1_s3_modeldeployment.yaml +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1_s3_modeldeployment.yaml @@ -1,4 +1,4 @@ -apiVersion: inference.sagemaker.aws.amazon.com/v1alpha1 +apiVersion: inference.sagemaker.aws.amazon.com/v1 kind: InferenceEndpointConfig metadata: name: testing-custom-deployment-inf diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1alpha1_deepseek_model7b.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1alpha1_deepseek_model7b.yaml deleted file mode 100644 index 7420f1f1..00000000 --- a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1alpha1_deepseek_model7b.yaml +++ /dev/null @@ -1,22 +0,0 @@ -apiVersion: inference.sagemaker.aws.amazon.com/v1alpha1 -kind: ModelDeployment -metadata: - labels: - app.kubernetes.io/name: deepseek-llm-r1-distill-qwen-7b-app - name: deepseek-sample -spec: - sageMakerEndpoint: - name: deepsek7bsme - model: - modelHubName: SageMakerPublicHub - modelId: deepseek-llm-r1-distill-qwen-7b - modelVersion: 2.0.4 - server: - instanceType: ml.g5.48xlarge - maxAutoScaleReplicas: 2 - minAutoScaleReplicas: 1 - namespace: inference-namespace - environmentVariables: - - name: SAMPLE_ENV_VAR - value: "sample_value" - maxDeployTimeInSeconds: 1800 diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1alpha1_mistral_model.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1alpha1_mistral_model.yaml deleted file mode 100644 index c9208fcb..00000000 --- a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1alpha1_mistral_model.yaml +++ /dev/null @@ -1,24 +0,0 @@ -apiVersion: inference.sagemaker.aws.amazon.com/v1alpha1 -kind: ModelDeployment -metadata: - labels: - app.kubernetes.io/name: huggingface-llm-mistral-7b-instruct-app - kueue.x-k8s.io/priority-class: real-time-inference - name: sample-mistral - namespace: ns-team-a -spec: - sageMakerEndpoint: - name: sample-sagemaker-endpoint - model: - acceptEula: true - modelHubName: SageMakerPublicHub - modelId: huggingface-llm-mistral-7b-instruct - server: - instanceType: ml.g5.8xlarge - maxAutoScaleReplicas: 2 - minAutoScaleReplicas: 1 - namespace: inference-namespace - environmentVariables: - - name: SAMPLE_ENV_VAR - value: "sample_value" - maxDeployTimeInSeconds: 1800 \ No newline at end of file diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/webhook/certificate-webhook.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/webhook/certificate-webhook.yaml new file mode 100644 index 00000000..52696fef --- /dev/null +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/webhook/certificate-webhook.yaml @@ -0,0 +1,17 @@ +# The following manifests contain a self-signed issuer CR and a certificate CR. 
+# More documentation can be found at https://docs.cert-manager.io +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + labels: + app.kubernetes.io/name: {{ .Values.namePrefix }} + name: serving-cert + namespace: {{ .Values.shortPrefix }}-system +spec: + dnsNames: + - {{ .Values.namePrefix }}-conversion-webhook.{{ .Values.shortPrefix }}-system.svc + - {{ .Values.namePrefix }}-conversion-webhook.{{ .Values.shortPrefix }}-system.svc.cluster.local + issuerRef: + kind: Issuer + name: selfsigned-issuer + secretName: webhook-server-cert diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/webhook/issuer.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/webhook/issuer.yaml new file mode 100644 index 00000000..c5d47eb9 --- /dev/null +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/webhook/issuer.yaml @@ -0,0 +1,12 @@ +# The following manifest contains a self-signed issuer CR. +# More information can be found at https://docs.cert-manager.io +# WARNING: Targets CertManager v1.0. Check https://cert-manager.io/docs/installation/upgrading/ for breaking changes. +apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + labels: + app.kubernetes.io/name: {{ .Values.namePrefix }} + name: selfsigned-issuer + namespace: {{ .Values.shortPrefix }}-system +spec: + selfSigned: {} diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/webhook/service.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/webhook/service.yaml new file mode 100644 index 00000000..05f2ba95 --- /dev/null +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/webhook/service.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 # dummy change +kind: Service +metadata: + labels: + app.kubernetes.io/name: {{ .Values.namePrefix }} + name: {{ .Values.namePrefix }}-conversion-webhook + namespace: {{ .Values.shortPrefix }}-system +spec: + ports: + - port: 443 + protocol: TCP + targetPort: 9443 + selector: + control-plane: {{ .Values.namePrefix }}-controller-manager \ No newline at end of file diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/values.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/values.yaml index d9a1a374..868b7765 100644 --- a/helm_chart/HyperPodHelmChart/charts/inference-operator/values.yaml +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/values.yaml @@ -21,7 +21,7 @@ image: ap-southeast-4: 311141544681.dkr.ecr.ap-southeast-4.amazonaws.com ap-southeast-3: 158128612970.dkr.ecr.ap-southeast-3.amazonaws.com eu-south-2: 025050981094.dkr.ecr.eu-south-2.amazonaws.com - tag: v1.0.0 + tag: v2.0 pullPolicy: Always repository: hyperpodClusterArn: @@ -29,6 +29,7 @@ executionRoleArn: jumpstartGatedModelDownloadRoleArn: "" stage: "prod" tlsCertificateS3Bucket: +enableWebhooks: true s3: enabled: true @@ -62,6 +63,7 @@ fsx: alb: enabled: true + enableServiceMutatorWebhook: false clusterName: "" region: "" vpcId: "" @@ -99,3 +101,6 @@ components: metricsService: enabled: true path: "config/metrics" + webhook: + enabled: true + path: "config/webhook" diff --git a/hyperpod-custom-inference-template/hyperpod_custom_inference_template/__init__.py b/hyperpod-custom-inference-template/hyperpod_custom_inference_template/__init__.py index 68054b98..65490521 100644 --- a/hyperpod-custom-inference-template/hyperpod_custom_inference_template/__init__.py +++ b/hyperpod-custom-inference-template/hyperpod_custom_inference_template/__init__.py @@ -9,4 +9,4 @@ # or in the "license" file accompanying this 
file. This file is # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific -# language governing permissions and limitations under the License. \ No newline at end of file +# language governing permissions and limitations under the License. diff --git a/hyperpod-custom-inference-template/hyperpod_custom_inference_template/registry.py b/hyperpod-custom-inference-template/hyperpod_custom_inference_template/registry.py index 1da3df96..5fbb3832 100644 --- a/hyperpod-custom-inference-template/hyperpod_custom_inference_template/registry.py +++ b/hyperpod-custom-inference-template/hyperpod_custom_inference_template/registry.py @@ -10,13 +10,18 @@ # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific # language governing permissions and limitations under the License. -from hyperpod_custom_inference_template.v1_0 import model as v1 -from hyperpod_custom_inference_template.v1_0.template import TEMPLATE_CONTENT as v1_template +from hyperpod_custom_inference_template.v1_0 import model as v1_0 +from hyperpod_custom_inference_template.v1_1 import model as v1_1 +from hyperpod_custom_inference_template.v1_0.template import ( + TEMPLATE_CONTENT as v1_0_template, +) +from hyperpod_custom_inference_template.v1_1.template import ( + TEMPLATE_CONTENT as v1_1_template, +) SCHEMA_REGISTRY = { - "1.0": v1.FlatHPEndpoint, + "1.0": v1_0.FlatHPEndpoint, + "1.1": v1_1.FlatHPEndpoint, } -TEMPLATE_REGISTRY = { - "1.0": v1_template -} +TEMPLATE_REGISTRY = {"1.0": v1_0_template, "1.1": v1_1_template} diff --git a/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_0/__init__.py b/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_0/__init__.py index 68054b98..65490521 100644 --- a/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_0/__init__.py +++ b/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_0/__init__.py @@ -9,4 +9,4 @@ # or in the "license" file accompanying this file. This file is # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific -# language governing permissions and limitations under the License. \ No newline at end of file +# language governing permissions and limitations under the License. 
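Note: the following is an illustrative usage sketch and is not part of this diff. It shows how the version registries updated above might be consumed by calling code; the flat field names and sample values are assumptions drawn from the FlatHPEndpoint schema defined in this package, not a documented entry point.

# Minimal sketch: pick a schema version, validate a flat config, and convert it
# into the nested HPEndpoint domain object. Registry and class names come from
# this package; the concrete values below are illustrative only.
from hyperpod_custom_inference_template.registry import SCHEMA_REGISTRY, TEMPLATE_REGISTRY

def build_endpoint(version: str, flat_config: dict):
    """Validate a flat config against the requested schema version and convert it."""
    schema_cls = SCHEMA_REGISTRY[version]   # e.g. "1.0" or "1.1"
    flat = schema_cls(**flat_config)        # pydantic validation (extra="forbid")
    return flat.to_domain()                 # nested HPEndpoint object

# Example flat config for an S3-backed model; s3_bucket_name and s3_region are
# required by the model validator whenever model_source_type is "s3", and either
# metadata_name or endpoint_name must be set.
example = {
    "endpoint_name": "my-endpoint",
    "instance_type": "ml.g5.8xlarge",
    "model_name": "my-model",
    "model_source_type": "s3",
    "s3_bucket_name": "my-model-bucket",
    "s3_region": "us-west-2",
    "image_uri": "lmcache/vllm-openai:v0.3.7",
    "container_port": 8000,
    "model_volume_mount_name": "model-weights",
}

hp_endpoint = build_endpoint("1.1", example)
template_text = TEMPLATE_REGISTRY["1.1"]  # template string registered for the same version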
diff --git a/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_0/model.py b/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_0/model.py index 2e0e544e..1ec8b5c3 100644 --- a/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_0/model.py +++ b/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_0/model.py @@ -27,7 +27,7 @@ Worker, Dimensions, AutoScalingSpec, - CloudWatchTrigger + CloudWatchTrigger, ) from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint from sagemaker.hyperpod.common.config.metadata import Metadata @@ -37,12 +37,10 @@ class FlatHPEndpoint(BaseModel): model_config = ConfigDict(extra="forbid") namespace: Optional[str] = Field( - default=None, - description="Kubernetes namespace", - min_length=1 + default=None, description="Kubernetes namespace", min_length=1 ) - metadata_name: Optional[str] = Field( + metadata_name: Optional[str] = Field( None, alias="metadata_name", description="Name of the custom endpoint object", @@ -75,14 +73,15 @@ class FlatHPEndpoint(BaseModel): # metrics.* metrics_enabled: Optional[bool] = Field( - False, alias="metrics_enabled", + False, + alias="metrics_enabled", description="Enable metrics collection", ) # model_name and version model_name: str = Field( - ..., - alias="model_name", + ..., + alias="model_name", description="Name of model to create on SageMaker", min_length=1, max_length=63, @@ -100,15 +99,18 @@ class FlatHPEndpoint(BaseModel): # model_source_config.* model_source_type: Literal["fsx", "s3"] = Field( - ..., alias="model_source_type", + ..., + alias="model_source_type", description="Source type: fsx or s3", ) model_location: Optional[str] = Field( - None, alias="model_location", + None, + alias="model_location", description="Specific model data location", ) prefetch_enabled: Optional[bool] = Field( - False, alias="prefetch_enabled", + False, + alias="prefetch_enabled", description="Whether to pre-fetch model data", ) @@ -122,11 +124,12 @@ class FlatHPEndpoint(BaseModel): # worker.* image_uri: str = Field( - ..., alias="image_uri", + ..., + alias="image_uri", description="Inference server image name", ) container_port: int = Field( - ..., + ..., alias="container_port", description="Port on which the model server listens", ge=1, @@ -138,7 +141,8 @@ class FlatHPEndpoint(BaseModel): description="Path inside container for model volume", ) model_volume_mount_name: str = Field( - ..., alias="model_volume_mount_name", + ..., + alias="model_volume_mount_name", description="Name of the model volume mount", ) @@ -149,7 +153,7 @@ class FlatHPEndpoint(BaseModel): description="FSX File System DNS Name", ) fsx_file_system_id: Optional[str] = Field( - None, + None, alias="fsx_file_system_id", description="FSX File System ID", ) @@ -161,23 +165,23 @@ class FlatHPEndpoint(BaseModel): # S3Storage s3_bucket_name: Optional[str] = Field( - None, + None, alias="s3_bucket_name", description="S3 bucket location", ) s3_region: Optional[str] = Field( - None, + None, alias="s3_region", description="S3 bucket region", ) # Resources - resources_limits: Optional[Dict[str, Union[int,str]]] = Field( + resources_limits: Optional[Dict[str, Union[int, str]]] = Field( None, alias="resources_limits", description="Resource limits for the worker", ) - resources_requests: Optional[Dict[str, Union[int,str]]] = Field( + resources_requests: Optional[Dict[str, Union[int, str]]] = Field( None, alias="resources_requests", description="Resource requests for the worker", @@ -187,28 
+191,25 @@ class FlatHPEndpoint(BaseModel): dimensions: Optional[Dict[str, str]] = Field( None, alias="dimensions", - description="CloudWatch Metric dimensions as key–value pairs" + description="CloudWatch Metric dimensions as key–value pairs", ) # CloudWatch Trigger metric_collection_period: Optional[int] = Field( - 300, - description="Defines the Period for CloudWatch query" + 300, description="Defines the Period for CloudWatch query" ) metric_collection_start_time: Optional[int] = Field( - 300, - description="Defines the StartTime for CloudWatch query" + 300, description="Defines the StartTime for CloudWatch query" ) metric_name: Optional[str] = Field( - None, - description="Metric name to query for CloudWatch trigger" + None, description="Metric name to query for CloudWatch trigger" ) metric_stat: Optional[str] = Field( "Average", description=( "Statistics metric to be used by Trigger. " "Defines the Stat for the CloudWatch query. Default is Average." - ) + ), ) metric_type: Optional[Literal["Value", "Average"]] = Field( "Average", @@ -216,33 +217,30 @@ class FlatHPEndpoint(BaseModel): "The type of metric to be used by HPA. " "`Average` – Uses average value per pod; " "`Value` – Uses absolute metric value." - ) + ), ) min_value: Optional[float] = Field( 0, description=( "Minimum metric value used in case of empty response " "from CloudWatch. Default is 0." - ) + ), ) cloud_watch_trigger_name: Optional[str] = Field( - None, - description="Name for the CloudWatch trigger" + None, description="Name for the CloudWatch trigger" ) cloud_watch_trigger_namespace: Optional[str] = Field( - None, - description="AWS CloudWatch namespace for the metric" + None, description="AWS CloudWatch namespace for the metric" ) target_value: Optional[float] = Field( - None, - description="Target value for the CloudWatch metric" + None, description="Target value for the CloudWatch metric" ) use_cached_metrics: Optional[bool] = Field( True, description=( "Enable caching of metric values during polling interval. " "Default is true." - ) + ), ) invocation_endpoint: Optional[str] = Field( @@ -250,21 +248,25 @@ class FlatHPEndpoint(BaseModel): description=( "The invocation endpoint of the model server. http://:/ would be pre-populated based on the other fields. 
" "Please fill in the path after http://:/ specific to your model server.", - ) + ), ) - @model_validator(mode='after') + @model_validator(mode="after") def validate_model_source_config(self): """Validate that required fields are provided based on model_source_type""" if self.model_source_type == "s3": if not self.s3_bucket_name or not self.s3_region: - raise ValueError("s3_bucket_name and s3_region are required when model_source_type is 's3'") + raise ValueError( + "s3_bucket_name and s3_region are required when model_source_type is 's3'" + ) elif self.model_source_type == "fsx": if not self.fsx_file_system_id: - raise ValueError("fsx_file_system_id is required when model_source_type is 'fsx'") + raise ValueError( + "fsx_file_system_id is required when model_source_type is 'fsx'" + ) return self - @model_validator(mode='after') + @model_validator(mode="after") def validate_name(self): if not self.metadata_name and not self.endpoint_name: raise ValueError("Either metadata_name or endpoint_name must be provided") @@ -273,21 +275,20 @@ def validate_name(self): def to_domain(self) -> HPEndpoint: if self.endpoint_name and not self.metadata_name: self.metadata_name = self.endpoint_name - + metadata = Metadata(name=self.metadata_name, namespace=self.namespace) env_vars = None if self.env: env_vars = [ - EnvironmentVariables(name=k, value=v) - for k, v in self.env.items() + EnvironmentVariables(name=k, value=v) for k, v in self.env.items() ] dim_vars: list[Dimensions] = [] if self.dimensions: for name, value in self.dimensions.items(): dim_vars.append(Dimensions(name=name, value=value)) - + cloud_watch_trigger = CloudWatchTrigger( dimensions=dim_vars, metric_collection_period=self.metric_collection_period, @@ -300,12 +301,10 @@ def to_domain(self) -> HPEndpoint: namespace=self.cloud_watch_trigger_namespace, target_value=self.target_value, use_cached_metrics=self.use_cached_metrics, - ) - - auto_scaling_spec = AutoScalingSpec( - cloud_watch_trigger = cloud_watch_trigger ) + auto_scaling_spec = AutoScalingSpec(cloud_watch_trigger=cloud_watch_trigger) + # nested metrics metrics = Metrics( enabled=self.metrics_enabled, @@ -336,7 +335,9 @@ def to_domain(self) -> HPEndpoint: fsx_storage=fsx, ) - tls = TlsConfig(tls_certificate_output_s3_uri=self.tls_certificate_output_s3_uri) + tls = TlsConfig( + tls_certificate_output_s3_uri=self.tls_certificate_output_s3_uri + ) invocation_port = ModelInvocationPort( container_port=self.container_port, @@ -368,4 +369,4 @@ def to_domain(self) -> HPEndpoint: worker=worker, invocation_endpoint=self.invocation_endpoint, auto_scaling_spec=auto_scaling_spec - ) \ No newline at end of file + ) diff --git a/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_0/template.py b/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_0/template.py index 63b06fb0..4c981770 100644 --- a/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_0/template.py +++ b/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_0/template.py @@ -85,4 +85,4 @@ invocationEndpoint: {{ invocation_endpoint }} -""" \ No newline at end of file +""" diff --git a/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_1/__init__.py b/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_1/__init__.py new file mode 100644 index 00000000..65490521 --- /dev/null +++ b/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_1/__init__.py @@ -0,0 +1,12 @@ +# Copyright Amazon.com, Inc. 
or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. diff --git a/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_1/model.py b/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_1/model.py new file mode 100644 index 00000000..bc586c42 --- /dev/null +++ b/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_1/model.py @@ -0,0 +1,442 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from pydantic import BaseModel, Field, model_validator, ConfigDict +from typing import Optional, List, Dict, Union, Literal + +from sagemaker.hyperpod.inference.config.hp_endpoint_config import ( + Metrics, + FsxStorage, + S3Storage, + ModelSourceConfig, + TlsConfig, + EnvironmentVariables, + ModelInvocationPort, + ModelVolumeMount, + Resources, + Worker, + Dimensions, + AutoScalingSpec, + CloudWatchTrigger, + IntelligentRoutingSpec, + KvCacheSpec, + L2CacheSpec, +) +from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint +from sagemaker.hyperpod.common.config.metadata import Metadata + + +class FlatHPEndpoint(BaseModel): + model_config = ConfigDict(extra="forbid") + + namespace: Optional[str] = Field( + default=None, description="Kubernetes namespace", min_length=1 + ) + + metadata_name: Optional[str] = Field( + None, + alias="metadata_name", + description="Name of the custom endpoint object", + max_length=63, + pattern=r"^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}$", + ) + + # endpoint_name + endpoint_name: Optional[str] = Field( + None, + alias="endpoint_name", + description="Name of SageMaker endpoint; empty string means no creation", + max_length=63, + pattern=r"^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}$", + ) + + # Environment variables map + env: Optional[Dict[str, str]] = Field( + None, + alias="env", + description="Map of environment variable names to their values", + ) + + instance_type: str = Field( + ..., + alias="instance_type", + description="EC2 instance type for the inference server", + pattern=r"^ml\..*", + ) + + # metrics.* + metrics_enabled: Optional[bool] = Field( + False, + alias="metrics_enabled", + description="Enable metrics collection", + ) + + # model_name and version + model_name: str = Field( + ..., + alias="model_name", + description="Name of model to create on SageMaker", + min_length=1, + max_length=63, + pattern=r"^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}$", + ) + + model_version: Optional[str] = Field( + None, + alias="model_version", + description="Version of the model for the endpoint", + min_length=5, + max_length=14, + 
pattern=r"^\d{1,4}\.\d{1,4}\.\d{1,4}$", + ) + + # model_source_config.* + model_source_type: Literal["fsx", "s3"] = Field( + ..., + alias="model_source_type", + description="Source type: fsx or s3", + ) + model_location: Optional[str] = Field( + None, + alias="model_location", + description="Specific model data location", + ) + prefetch_enabled: Optional[bool] = Field( + False, + alias="prefetch_enabled", + description="Whether to pre-fetch model data", + ) + + # tls_config + tls_certificate_output_s3_uri: Optional[str] = Field( + None, + alias="tls_certificate_output_s3_uri", + description="S3 URI for TLS certificate output", + pattern=r"^s3://([^/]+)/?(.*)$", + ) + + # worker.* + image_uri: str = Field( + ..., + alias="image_uri", + description="Inference server image name", + ) + container_port: int = Field( + ..., + alias="container_port", + description="Port on which the model server listens", + ge=1, + le=65535, + ) + model_volume_mount_path: Optional[str] = Field( + "/opt/ml/model", + alias="model_volume_mount_path", + description="Path inside container for model volume", + ) + model_volume_mount_name: str = Field( + ..., + alias="model_volume_mount_name", + description="Name of the model volume mount", + ) + + # FSXStorage + fsx_dns_name: Optional[str] = Field( + None, + alias="fsx_dns_name", + description="FSX File System DNS Name", + ) + fsx_file_system_id: Optional[str] = Field( + None, + alias="fsx_file_system_id", + description="FSX File System ID", + ) + fsx_mount_name: Optional[str] = Field( + None, + alias="fsx_mount_name", + description="FSX File System Mount Name", + ) + + # S3Storage + s3_bucket_name: Optional[str] = Field( + None, + alias="s3_bucket_name", + description="S3 bucket location", + ) + s3_region: Optional[str] = Field( + None, + alias="s3_region", + description="S3 bucket region", + ) + + # Resources + resources_limits: Optional[Dict[str, Union[int, str]]] = Field( + None, + alias="resources_limits", + description="Resource limits for the worker", + ) + resources_requests: Optional[Dict[str, Union[int, str]]] = Field( + None, + alias="resources_requests", + description="Resource requests for the worker", + ) + + # Dimensions + dimensions: Optional[Dict[str, str]] = Field( + None, + alias="dimensions", + description="CloudWatch Metric dimensions as key–value pairs", + ) + + # CloudWatch Trigger + metric_collection_period: Optional[int] = Field( + 300, description="Defines the Period for CloudWatch query" + ) + metric_collection_start_time: Optional[int] = Field( + 300, description="Defines the StartTime for CloudWatch query" + ) + metric_name: Optional[str] = Field( + None, description="Metric name to query for CloudWatch trigger" + ) + metric_stat: Optional[str] = Field( + "Average", + description=( + "Statistics metric to be used by Trigger. " + "Defines the Stat for the CloudWatch query. Default is Average." + ), + ) + metric_type: Optional[Literal["Value", "Average"]] = Field( + "Average", + description=( + "The type of metric to be used by HPA. " + "`Average` – Uses average value per pod; " + "`Value` – Uses absolute metric value." + ), + ) + min_value: Optional[float] = Field( + 0, + description=( + "Minimum metric value used in case of empty response " + "from CloudWatch. Default is 0." 
+        ),
+    )
+    cloud_watch_trigger_name: Optional[str] = Field(
+        None, description="Name for the CloudWatch trigger"
+    )
+    cloud_watch_trigger_namespace: Optional[str] = Field(
+        None, description="AWS CloudWatch namespace for the metric"
+    )
+    target_value: Optional[float] = Field(
+        None, description="Target value for the CloudWatch metric"
+    )
+    use_cached_metrics: Optional[bool] = Field(
+        True,
+        description=(
+            "Enable caching of metric values during polling interval. "
+            "Default is true."
+        ),
+    )
+
+    invocation_endpoint: Optional[str] = Field(
+        default="invocations",
+        description=(
+            "The invocation endpoint of the model server. http://:/ would be pre-populated based on the other fields. "
+            "Please fill in the path after http://:/ specific to your model server."
+        ),
+    )
+
+    # Intelligent Routing flattened fields
+    intelligent_routing_enabled: Optional[bool] = Field(
+        None,
+        alias="intelligent_routing_enabled",
+        description="Enable intelligent routing",
+    )
+    routing_strategy: Optional[
+        Literal["prefixaware", "kvaware", "session", "roundrobin"]
+    ] = Field(
+        None,
+        alias="routing_strategy",
+        description="Routing strategy for intelligent routing",
+    )
+
+    # KV Cache flattened fields
+    enable_l1_cache: Optional[bool] = Field(
+        None,
+        alias="enable_l1_cache",
+        description="Enable L1 cache (CPU offloading)",
+    )
+    enable_l2_cache: Optional[bool] = Field(
+        None,
+        alias="enable_l2_cache",
+        description="Enable L2 cache",
+    )
+    l2_cache_backend: Optional[str] = Field(
+        None,
+        alias="l2_cache_backend",
+        description="L2 cache backend type",
+    )
+    l2_cache_local_url: Optional[str] = Field(
+        None,
+        alias="l2_cache_local_url",
+        description="L2 cache URL to local storage",
+    )
+    cache_config_file: Optional[str] = Field(
+        None,
+        alias="cache_config_file",
+        description="KV cache configuration file path",
+    )
+
+    @model_validator(mode="after")
+    def validate_model_source_config(self):
+        """Validate that required fields are provided based on model_source_type"""
+        if self.model_source_type == "s3":
+            if not self.s3_bucket_name or not self.s3_region:
+                raise ValueError(
+                    "s3_bucket_name and s3_region are required when model_source_type is 's3'"
+                )
+        elif self.model_source_type == "fsx":
+            if not self.fsx_file_system_id:
+                raise ValueError(
+                    "fsx_file_system_id is required when model_source_type is 'fsx'"
+                )
+        return self
+
+    @model_validator(mode="after")
+    def validate_name(self):
+        if not self.metadata_name and not self.endpoint_name:
+            raise ValueError("Either metadata_name or endpoint_name must be provided")
+        return self
+
+    def to_domain(self) -> HPEndpoint:
+        if self.endpoint_name and not self.metadata_name:
+            self.metadata_name = self.endpoint_name
+
+        metadata = Metadata(name=self.metadata_name, namespace=self.namespace)
+
+        env_vars = None
+        if self.env:
+            env_vars = [
+                EnvironmentVariables(name=k, value=v) for k, v in self.env.items()
+            ]
+
+        dim_vars: list[Dimensions] = []
+        if self.dimensions:
+            for name, value in self.dimensions.items():
+                dim_vars.append(Dimensions(name=name, value=value))
+
+        cloud_watch_trigger = CloudWatchTrigger(
+            dimensions=dim_vars,
+            metric_collection_period=self.metric_collection_period,
+            metric_collection_start_time=self.metric_collection_start_time,
+            metric_name=self.metric_name,
+            metric_stat=self.metric_stat,
+            metric_type=self.metric_type,
+            min_value=self.min_value,
+            name=self.cloud_watch_trigger_name,
+            namespace=self.cloud_watch_trigger_namespace,
+            target_value=self.target_value,
use_cached_metrics=self.use_cached_metrics, + ) + + auto_scaling_spec = AutoScalingSpec(cloud_watch_trigger=cloud_watch_trigger) + + # nested metrics + metrics = Metrics( + enabled=self.metrics_enabled, + ) + + # Validate storage choice and build nested storage config + if self.model_source_type == "s3": + s3 = S3Storage( + bucket_name=self.s3_bucket_name, + region=self.s3_region, + ) + fsx = None + elif self.model_source_type == "fsx": + fsx = FsxStorage( + dns_name=self.fsx_dns_name, + file_system_id=self.fsx_file_system_id, + mount_name=self.fsx_mount_name, + ) + s3 = None + else: + raise ValueError(f"Unsupported model_source_type: {self.model_source_type}") + + source = ModelSourceConfig( + model_location=self.model_location, + model_source_type=self.model_source_type, + prefetch_enabled=self.prefetch_enabled, + s3_storage=s3, + fsx_storage=fsx, + ) + + tls = TlsConfig( + tls_certificate_output_s3_uri=self.tls_certificate_output_s3_uri + ) + + invocation_port = ModelInvocationPort( + container_port=self.container_port, + ) + volume_mount = ModelVolumeMount( + mount_path=self.model_volume_mount_path, + name=self.model_volume_mount_name, + ) + resources = Resources( + limits=self.resources_limits, + requests=self.resources_requests, + ) + worker = Worker( + environment_variables=env_vars, + image=self.image_uri, + model_invocation_port=invocation_port, + model_volume_mount=volume_mount, + resources=resources, + ) + # Build intelligent routing spec from flattened fields + intelligent_routing_spec = None + if self.intelligent_routing_enabled is not None: + intelligent_routing_spec = IntelligentRoutingSpec( + enabled=self.intelligent_routing_enabled, + routing_strategy=self.routing_strategy, + ) + + # Build KV cache spec from flattened fields + kv_cache_spec = None + if any([self.enable_l1_cache, self.enable_l2_cache, self.cache_config_file]): + l2_cache_spec = None + if self.l2_cache_backend or self.l2_cache_local_url: + l2_cache_spec = L2CacheSpec( + l2_cache_backend=self.l2_cache_backend, + l2_cache_local_url=self.l2_cache_local_url, + ) + + kv_cache_spec = KvCacheSpec( + enable_l1_cache=self.enable_l1_cache, + enable_l2_cache=self.enable_l2_cache, + l2_cache_spec=l2_cache_spec, + cache_config_file=self.cache_config_file, + ) + + return HPEndpoint( + metadata=metadata, + endpoint_name=self.endpoint_name, + instance_type=self.instance_type, + metrics=metrics, + model_name=self.model_name, + model_source_config=source, + model_version=self.model_version, + tls_config=tls, + worker=worker, + invocation_endpoint=self.invocation_endpoint, + auto_scaling_spec=auto_scaling_spec, + intelligent_routing_spec=intelligent_routing_spec, + kv_cache_spec=kv_cache_spec, + ) diff --git a/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_1/schema.json b/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_1/schema.json new file mode 100644 index 00000000..89af6406 --- /dev/null +++ b/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_1/schema.json @@ -0,0 +1,568 @@ +{ + "additionalProperties": false, + "properties": { + "namespace": { + "anyOf": [ + { + "minLength": 1, + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Kubernetes namespace", + "title": "Namespace" + }, + "metadata_name": { + "anyOf": [ + { + "maxLength": 63, + "pattern": "^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}$", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Name of the custom endpoint object", + 
"title": "Metadata Name" + }, + "endpoint_name": { + "anyOf": [ + { + "maxLength": 63, + "pattern": "^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}$", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Name of SageMaker endpoint; empty string means no creation", + "title": "Endpoint Name" + }, + "env": { + "anyOf": [ + { + "additionalProperties": { + "type": "string" + }, + "type": "object" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Map of environment variable names to their values", + "title": "Env" + }, + "instance_type": { + "description": "EC2 instance type for the inference server", + "pattern": "^ml\\..*", + "title": "Instance Type", + "type": "string" + }, + "metrics_enabled": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "default": false, + "description": "Enable metrics collection", + "title": "Metrics Enabled" + }, + "model_name": { + "description": "Name of model to create on SageMaker", + "maxLength": 63, + "minLength": 1, + "pattern": "^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}$", + "title": "Model Name", + "type": "string" + }, + "model_version": { + "anyOf": [ + { + "maxLength": 14, + "minLength": 5, + "pattern": "^\\d{1,4}\\.\\d{1,4}\\.\\d{1,4}$", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Version of the model for the endpoint", + "title": "Model Version" + }, + "model_source_type": { + "description": "Source type: fsx or s3", + "enum": [ + "fsx", + "s3" + ], + "title": "Model Source Type", + "type": "string" + }, + "model_location": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Specific model data location", + "title": "Model Location" + }, + "prefetch_enabled": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "default": false, + "description": "Whether to pre-fetch model data", + "title": "Prefetch Enabled" + }, + "tls_certificate_output_s3_uri": { + "anyOf": [ + { + "pattern": "^s3://([^/]+)/?(.*)$", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "S3 URI for TLS certificate output", + "title": "Tls Certificate Output S3 Uri" + }, + "image_uri": { + "description": "Inference server image name", + "title": "Image Uri", + "type": "string" + }, + "container_port": { + "description": "Port on which the model server listens", + "maximum": 65535, + "minimum": 1, + "title": "Container Port", + "type": "integer" + }, + "model_volume_mount_path": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": "/opt/ml/model", + "description": "Path inside container for model volume", + "title": "Model Volume Mount Path" + }, + "model_volume_mount_name": { + "description": "Name of the model volume mount", + "title": "Model Volume Mount Name", + "type": "string" + }, + "fsx_dns_name": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "FSX File System DNS Name", + "title": "Fsx Dns Name" + }, + "fsx_file_system_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "FSX File System ID", + "title": "Fsx File System Id" + }, + "fsx_mount_name": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "FSX File System Mount Name", + "title": "Fsx Mount Name" + }, + "s3_bucket_name": { + "anyOf": [ + { + "type": "string" + }, + { + "type": 
"null" + } + ], + "default": null, + "description": "S3 bucket location", + "title": "S3 Bucket Name" + }, + "s3_region": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "S3 bucket region", + "title": "S3 Region" + }, + "resources_limits": { + "anyOf": [ + { + "additionalProperties": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "string" + } + ] + }, + "type": "object" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Resource limits for the worker", + "title": "Resources Limits" + }, + "resources_requests": { + "anyOf": [ + { + "additionalProperties": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "string" + } + ] + }, + "type": "object" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Resource requests for the worker", + "title": "Resources Requests" + }, + "dimensions": { + "anyOf": [ + { + "additionalProperties": { + "type": "string" + }, + "type": "object" + }, + { + "type": "null" + } + ], + "default": null, + "description": "CloudWatch Metric dimensions as key\u2013value pairs", + "title": "Dimensions" + }, + "metric_collection_period": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": 300, + "description": "Defines the Period for CloudWatch query", + "title": "Metric Collection Period" + }, + "metric_collection_start_time": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": 300, + "description": "Defines the StartTime for CloudWatch query", + "title": "Metric Collection Start Time" + }, + "metric_name": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Metric name to query for CloudWatch trigger", + "title": "Metric Name" + }, + "metric_stat": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": "Average", + "description": "Statistics metric to be used by Trigger. Defines the Stat for the CloudWatch query. Default is Average.", + "title": "Metric Stat" + }, + "metric_type": { + "anyOf": [ + { + "enum": [ + "Value", + "Average" + ], + "type": "string" + }, + { + "type": "null" + } + ], + "default": "Average", + "description": "The type of metric to be used by HPA. `Average` \u2013 Uses average value per pod; `Value` \u2013 Uses absolute metric value.", + "title": "Metric Type" + }, + "min_value": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "default": 0, + "description": "Minimum metric value used in case of empty response from CloudWatch. Default is 0.", + "title": "Min Value" + }, + "cloud_watch_trigger_name": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Name for the CloudWatch trigger", + "title": "Cloud Watch Trigger Name" + }, + "cloud_watch_trigger_namespace": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "AWS CloudWatch namespace for the metric", + "title": "Cloud Watch Trigger Namespace" + }, + "target_value": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Target value for the CloudWatch metric", + "title": "Target Value" + }, + "use_cached_metrics": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "default": true, + "description": "Enable caching of metric values during polling interval. 
Default is true.", + "title": "Use Cached Metrics" + }, + "invocation_endpoint": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": "invocations", + "description": "The invocation endpoint of the model server. http://:/ would be pre-populated based on the other fields. Please fill in the path after http://:/ specific to your model server.", + "title": "Invocation Endpoint" + }, + "intelligent_routing_enabled": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Enable intelligent routing", + "title": "Intelligent Routing Enabled" + }, + "routing_strategy": { + "anyOf": [ + { + "enum": [ + "prefixaware", + "kvaware", + "session", + "roundrobin" + ], + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Routing strategy for intelligent routing", + "title": "Routing Strategy" + }, + "enable_l1_cache": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Enable L1 cache (CPU offloading)", + "title": "Enable L1 Cache" + }, + "enable_l2_cache": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Enable L2 cache", + "title": "Enable L2 Cache" + }, + "l2_cache_backend": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "L2 cache backend type", + "title": "L2 Cache Backend" + }, + "l2_cache_local_url": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "L2 cache URL to local storage", + "title": "L2 Cache Local Url" + }, + "cache_config_file": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "KV cache configuration file path", + "title": "Cache Config File" + } + }, + "required": [ + "instance_type", + "model_name", + "model_source_type", + "image_uri", + "container_port", + "model_volume_mount_name" + ], + "title": "FlatHPEndpoint", + "type": "object" +} diff --git a/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_1/template.py b/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_1/template.py new file mode 100644 index 00000000..ef4be13d --- /dev/null +++ b/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_1/template.py @@ -0,0 +1,100 @@ +TEMPLATE_CONTENT = """ +apiVersion: hyperpod.sagemaker.aws/v1 +kind: InferenceEndpointConfig +metadata: + name: {{ metadata_name or endpoint_name }} + namespace: {{ namespace }} +spec: + endpointName: {{ endpoint_name }} + instanceType: {{ instance_type }} + modelName: {{ model_name }} + modelVersion: {{ model_version or "" }} + + metrics: + enabled: {{ metrics_enabled or False }} + + modelSourceConfig: + modelSourceType: {{ model_source_type }} + modelLocation: {{ model_location or "" }} + prefetchEnabled: {{ prefetch_enabled or False }} +{%- if model_source_type == "s3" %} + s3Storage: + bucketName: {{ s3_bucket_name }} + region: {{ s3_region }} +{%- elif model_source_type == "fsx" %} + fsxStorage: + dnsName: {{ fsx_dns_name }} + fileSystemId: {{ fsx_file_system_id }} + mountName: {{ fsx_mount_name or "" }} +{%- endif %} + + tlsConfig: + tlsCertificateOutputS3Uri: {{ tls_certificate_output_s3_uri or "" }} + + worker: + environmentVariables: + {%- if env %} + {%- for key, val in env.items() %} + - name: {{ key }} + value: "{{ val }}" + {%- endfor %} + {%- else %} + [] + {%- 
endif %} + image: {{ image_uri }} + modelInvocationPort: + containerPort: {{ container_port }} + modelVolumeMount: + name: {{ model_volume_mount_name }} + mountPath: {{ model_volume_mount_path }} + resources: +{%- if resources_limits %} + limits: +{%- for key, val in resources_limits.items() %} + {{ key }}: {{ val }} +{%- endfor %} +{%- else %} + {} +{%- endif %} +{%- if resources_requests %} + requests: +{%- for key, val in resources_requests.items() %} + {{ key }}: {{ val }} +{%- endfor %} +{%- endif %} + + autoScalingSpec: + cloudWatchTrigger: +{%- if dimensions %} + dimensions: +{%- for dim_key, dim_val in dimensions.items() %} + - name: {{ dim_key }} + value: {{ dim_val }} +{%- endfor %} +{%- endif %} + metricCollectionPeriod: {{ metric_collection_period }} + metricCollectionStartTime: {{ metric_collection_start_time }} + metricName: {{ metric_name or "" }} + metricStat: {{ metric_stat }} + metricType: {{ metric_type }} + minValue: {{ min_value }} + name: {{ cloud_watch_trigger_name or "" }} + namespace: {{ cloud_watch_trigger_namespace or "" }} + targetValue: {{ target_value or "" }} + useCachedMetrics: {{ use_cached_metrics or False }} + + invocationEndpoint: "{{ invocation_endpoint }}" + +{% if intelligent_routing_enabled is not none %} intelligentRoutingSpec: + enabled: {{ intelligent_routing_enabled }} +{% if routing_strategy is not none %} routingStrategy: "{{ routing_strategy }}"{% endif %}{% endif %} +{% if enable_l1_cache is not none or enable_l2_cache is not none or cache_config_file is not none %} kvCacheSpec: +{% if enable_l1_cache is not none %} enableL1Cache: {{ enable_l1_cache }}{% endif %} +{% if enable_l2_cache is not none %} enableL2Cache: {{ enable_l2_cache }}{% endif %} +{% if l2_cache_backend is not none or l2_cache_local_url is not none %} l2CacheSpec: +{% if l2_cache_backend is not none %} l2CacheBackend: "{{ l2_cache_backend }}"{% endif %} +{% if l2_cache_local_url is not none %} l2CacheLocalUrl: "{{ l2_cache_local_url }}"{% endif %} +{% endif %} +{% if cache_config_file is not none %} cacheConfigFile: "{{ cache_config_file }}"{% endif %} +{% endif %} +""" \ No newline at end of file diff --git a/src/sagemaker/hyperpod/inference/config/constants.py b/src/sagemaker/hyperpod/inference/config/constants.py index 0f166b1e..60e4542b 100644 --- a/src/sagemaker/hyperpod/inference/config/constants.py +++ b/src/sagemaker/hyperpod/inference/config/constants.py @@ -2,9 +2,9 @@ DEFAULT_MAX_DEPLOY_TIME_IN_SECONDS = 3600 DEFAULT_MODEL_METRIC_PATH = "/metrics" DEFAULT_METRICS_SCRAPE_INTERBAL_SECONDS = 15 -INFERENCE_FULL_API_VERSION = "inference.sagemaker.aws.amazon.com/v1alpha1" +INFERENCE_FULL_API_VERSION = "inference.sagemaker.aws.amazon.com/v1" INFERENCE_GROUP = "inference.sagemaker.aws.amazon.com" -INFERENCE_API_VERSION = "v1alpha1" +INFERENCE_API_VERSION = "v1" JUMPSTART_MODEL_KIND = "JumpStartModel" JUMPSTART_MODEL_PLURAL = "jumpstartmodels" INFERENCE_ENDPOINT_CONFIG_KIND = "InferenceEndpointConfig" diff --git a/src/sagemaker/hyperpod/inference/config/hp_endpoint_config.py b/src/sagemaker/hyperpod/inference/config/hp_endpoint_config.py index 8baf23de..33471286 100644 --- a/src/sagemaker/hyperpod/inference/config/hp_endpoint_config.py +++ b/src/sagemaker/hyperpod/inference/config/hp_endpoint_config.py @@ -70,6 +70,65 @@ class CloudWatchTrigger(BaseModel): ) +class CloudWatchTriggerList(BaseModel): + model_config = ConfigDict(extra="forbid") + + activationTargetValue: Optional[float] = Field( + default=0, + alias="activation_target_value", + description="Activation Value 
for CloudWatch metric to scale from 0 to 1. Only applicable if minReplicaCount = 0", + ) + dimensions: Optional[List[Dimensions]] = Field( + default=None, description="Dimensions for Cloudwatch metrics" + ) + metricCollectionPeriod: Optional[int] = Field( + default=300, + alias="metric_collection_period", + description="Defines the Period for CloudWatch query", + ) + metricCollectionStartTime: Optional[int] = Field( + default=300, + alias="metric_collection_start_time", + description="Defines the StartTime for CloudWatch query", + ) + metricName: Optional[str] = Field( + default=None, + alias="metric_name", + description="Metric name to query for Cloudwatch trigger", + ) + metricStat: Optional[str] = Field( + default="Average", + alias="metric_stat", + description="Statistics metric to be used by Trigger. Used to define Stat for CloudWatch query. Default is Average.", + ) + metricType: Optional[Literal["Value", "Average"]] = Field( + default="Average", + alias="metric_type", + description="The type of metric to be used by HPA. Enum: AverageValue - Uses average value of metric per pod, Value - Uses absolute metric value", + ) + minValue: Optional[float] = Field( + default=0, + alias="min_value", + description="Minimum metric value used in case of empty response from CloudWatch. Default is 0.", + ) + name: Optional[str] = Field( + default=None, description="Name for the CloudWatch trigger" + ) + namespace: Optional[str] = Field( + default=None, description="AWS CloudWatch namespace for metric" + ) + targetValue: Optional[float] = Field( + default=None, + alias="target_value", + description="TargetValue for CloudWatch metric", + ) + useCachedMetrics: Optional[bool] = Field( + default=True, + alias="use_cached_metrics", + description="Enable caching of metric values during polling interval. Default is true", + ) + + class PrometheusTrigger(BaseModel): """Prometheus metric trigger to use for autoscaling""" @@ -116,6 +175,50 @@ class PrometheusTrigger(BaseModel): ) +class PrometheusTriggerList(BaseModel): + model_config = ConfigDict(extra="forbid") + + activationTargetValue: Optional[float] = Field( + default=0, + alias="activation_target_value", + description="Activation Value for Prometheus metric to scale from 0 to 1. Only applicable if minReplicaCount = 0", + ) + customHeaders: Optional[str] = Field( + default=None, + alias="custom_headers", + description="Custom headers to include while querying the prometheus endpoint.", + ) + metricType: Optional[Literal["Value", "Average"]] = Field( + default="Average", + alias="metric_type", + description="The type of metric to be used by HPA. Enum: AverageValue - Uses average value of metric per pod, Value - Uses absolute metric value", + ) + name: Optional[str] = Field( + default=None, description="Name for the Prometheus trigger" + ) + namespace: Optional[str] = Field( + default=None, description="Namespace for namespaced queries" + ) + query: Optional[str] = Field( + default=None, description="PromQLQuery for the metric." + ) + serverAddress: Optional[str] = Field( + default=None, + alias="server_address", + description="Server address for AMP workspace", + ) + targetValue: Optional[float] = Field( + default=None, + alias="target_value", + description="Target metric value for scaling", + ) + useCachedMetrics: Optional[bool] = Field( + default=True, + alias="use_cached_metrics", + description="Enable caching of metric values during polling interval. 
Default is true", + ) + + class AutoScalingSpec(BaseModel): model_config = ConfigDict(extra="forbid") @@ -124,6 +227,11 @@ class AutoScalingSpec(BaseModel): alias="cloud_watch_trigger", description="CloudWatch metric trigger to use for autoscaling", ) + cloudWatchTriggerList: Optional[List[CloudWatchTriggerList]] = Field( + default=None, + alias="cloud_watch_trigger_list", + description="Multiple CloudWatch metric triggers to use for autoscaling. Takes priority over CloudWatchTrigger if both are provided.", + ) cooldownPeriod: Optional[int] = Field( default=300, alias="cooldown_period", @@ -154,6 +262,11 @@ class AutoScalingSpec(BaseModel): alias="prometheus_trigger", description="Prometheus metric trigger to use for autoscaling", ) + prometheusTriggerList: Optional[List[PrometheusTriggerList]] = Field( + default=None, + alias="prometheus_trigger_list", + description="Multiple Prometheus metric triggers to use for autoscaling. Takes priority over PrometheusTrigger if both are provided.", + ) scaleDownStabilizationTime: Optional[int] = Field( default=300, alias="scale_down_stabilization_time", @@ -166,6 +279,79 @@ class AutoScalingSpec(BaseModel): ) +class IntelligentRoutingSpec(BaseModel): + """Configuration for intelligent routing This feature is currently not supported for existing deployments. Adding this configuration to an existing deployment will be rejected.""" + + model_config = ConfigDict(extra="forbid") + + autoScalingSpec: Optional[AutoScalingSpec] = Field( + default=None, alias="auto_scaling_spec" + ) + enabled: Optional[bool] = Field( + default=False, description="Once set, the enabled field cannot be modified" + ) + routingStrategy: Optional[ + Literal["prefixaware", "kvaware", "session", "roundrobin"] + ] = Field(default="prefixaware", alias="routing_strategy") + + +class L2CacheSpec(BaseModel): + """Configuration for providing L2 Cache offloading""" + + model_config = ConfigDict(extra="forbid") + + l2CacheBackend: Optional[str] = Field( + default=None, + alias="l2_cache_backend", + description="L2 cache backend type. Required when L2CacheSpec is provided.", + ) + l2CacheLocalUrl: Optional[str] = Field( + default=None, + alias="l2_cache_local_url", + description="Provide the L2 cache URL to local storage", + ) + + +class KvCacheSpec(BaseModel): + """Configuration for KV Cache specification By default L1CacheOffloading will be enabled""" + + model_config = ConfigDict(extra="forbid") + + cacheConfigFile: Optional[str] = Field( + default=None, + alias="cache_config_file", + description="KVCache configuration file path. If specified, override other configurations provided via spec", + ) + enableL1Cache: Optional[bool] = Field( + default=True, alias="enable_l1_cache", description="Enable CPU offloading" + ) + enableL2Cache: Optional[bool] = Field(default=False, alias="enable_l2_cache") + l2CacheSpec: Optional[L2CacheSpec] = Field( + default=None, + alias="l2_cache_spec", + description="Configuration for providing L2 Cache offloading", + ) + + +class LoadBalancer(BaseModel): + """Configuration for Application Load Balancer""" + + model_config = ConfigDict(extra="forbid") + + healthCheckPath: Optional[str] = Field( + default="/ping", + alias="health_check_path", + description="Health check path for the ALB target group. 
Defaults to /ping if not specified.", + ) + routingAlgorithm: Optional[Literal["least_outstanding_requests", "round_robin"]] = ( + Field( + default="least_outstanding_requests", + alias="routing_algorithm", + description="Routing algorithm for the ALB target group (least_oustanding_requests or round_robin)", + ) + ) + + class ModelMetrics(BaseModel): """Configuration for model container metrics scraping""" @@ -433,6 +619,13 @@ class Worker(BaseModel): model_config = ConfigDict(extra="forbid") + args: Optional[List[str]] = Field( + default=None, description="Defines the Arguments to the entrypoint." + ) + command: Optional[List[str]] = Field( + default=None, + description="Defines the Command which is Entrypoint array. Not executed within a shell.", + ) environmentVariables: Optional[List[EnvironmentVariables]] = Field( default=None, alias="environment_variables", @@ -450,6 +643,11 @@ class Worker(BaseModel): resources: Resources = Field( description="Defines the Resources in terms of CPU, GPU, Memory needed for the model to be deployed" ) + workingDir: Optional[str] = Field( + default=None, + alias="working_dir", + description="Defines the working directory of container.", + ) class _HPEndpoint(BaseModel): @@ -468,16 +666,31 @@ class _HPEndpoint(BaseModel): endpointName: Optional[str] = Field( default=None, alias="endpoint_name", - description="Name of a SageMaker endpoint to be created for this InferenceEndpointConfig. The default value of empty string, when used, will skip endpoint creation.", + description="Name used for Sagemaker Endpoint Name of sagemaker endpoint. Defaults to empty string which represents that Sagemaker endpoint will not be created.", ) instanceType: str = Field( alias="instance_type", description="Instance Type to deploy the model on" ) + intelligentRoutingSpec: Optional[IntelligentRoutingSpec] = Field( + default=None, + alias="intelligent_routing_spec", + description="Configuration for intelligent routing This feature is currently not supported for existing deployments. Adding this configuration to an existing deployment will be rejected.", + ) invocationEndpoint: Optional[str] = Field( default="invocations", alias="invocation_endpoint", description="The invocation endpoint of the model server. http://:/ would be pre-populated based on the other fields. 
Please fill in the path after http://:/ specific to your model server.", ) + kvCacheSpec: Optional[KvCacheSpec] = Field( + default=None, + alias="kv_cache_spec", + description="Configuration for KV Cache specification By default L1CacheOffloading will be enabled", + ) + loadBalancer: Optional[LoadBalancer] = Field( + default=None, + alias="load_balancer", + description="Configuration for Application Load Balancer", + ) metrics: Optional[Metrics] = Field( default=None, description="Configuration for metrics collection and exposure" ) @@ -641,7 +854,7 @@ class Endpoints(BaseModel): ) -class ModelMetrics(BaseModel): +class ModelMetricsStatus(BaseModel): """Status of model container metrics collection""" model_config = ConfigDict(extra="forbid") @@ -670,7 +883,7 @@ class MetricsStatus(BaseModel): alias="metrics_scrape_interval_seconds", description="Scrape interval in seconds for metrics collection from sidecar and model container.", ) - modelMetrics: Optional[ModelMetrics] = Field( + modelMetrics: Optional[ModelMetricsStatus] = Field( default=None, alias="model_metrics", description="Status of model container metrics collection", @@ -773,4 +986,4 @@ class InferenceEndpointConfigStatus(BaseModel): default=None, alias="tls_certificate", description="CertificateStatus represents the status of TLS certificates", - ) + ) \ No newline at end of file diff --git a/test/unit_tests/cli/test_inference.py b/test/unit_tests/cli/test_inference.py index 4f21b405..c9e3e695 100644 --- a/test/unit_tests/cli/test_inference.py +++ b/test/unit_tests/cli/test_inference.py @@ -9,31 +9,45 @@ # Import the non-create commands that don't need special handling from sagemaker.hyperpod.cli.commands.inference import ( - js_create, custom_create, custom_invoke, - js_list, custom_list, - js_describe, custom_describe, - js_delete, custom_delete, - js_list_pods, custom_list_pods, - js_get_logs, custom_get_logs, - js_get_operator_logs, custom_get_operator_logs + js_create, + custom_create, + custom_invoke, + js_list, + custom_list, + js_describe, + custom_describe, + js_delete, + custom_delete, + js_list_pods, + custom_list_pods, + js_get_logs, + custom_get_logs, + js_get_operator_logs, + custom_get_operator_logs, ) + # --------- JumpStart Commands --------- -@patch('sys.argv', ['pytest', '--version', '1.0']) +@patch("sys.argv", ["pytest", "--version", "1.0"]) def test_js_create_with_required_args(): """ Test js_create with all required options via CLI runner, mocking schema and endpoint. 
""" # Reload the inference module with mocked sys.argv - if 'sagemaker.hyperpod.cli.commands.inference' in sys.modules: - importlib.reload(sys.modules['sagemaker.hyperpod.cli.commands.inference']) + if "sagemaker.hyperpod.cli.commands.inference" in sys.modules: + importlib.reload(sys.modules["sagemaker.hyperpod.cli.commands.inference"]) from sagemaker.hyperpod.cli.commands.inference import js_create - with patch('sagemaker.hyperpod.cli.inference_utils.load_schema_for_version') as mock_load_schema, \ - patch('sagemaker.hyperpod.cli.commands.inference.HPJumpStartEndpoint') as mock_endpoint_class, \ - patch('sagemaker.hyperpod.common.cli_decorators._is_valid_jumpstart_model_id') as mock_model_validation, \ - patch('sagemaker.hyperpod.common.cli_decorators._namespace_exists') as mock_namespace_exists: + with patch( + "sagemaker.hyperpod.cli.inference_utils.load_schema_for_version" + ) as mock_load_schema, patch( + "sagemaker.hyperpod.cli.commands.inference.HPJumpStartEndpoint" + ) as mock_endpoint_class, patch( + "sagemaker.hyperpod.common.cli_decorators._is_valid_jumpstart_model_id" + ) as mock_model_validation, patch( + "sagemaker.hyperpod.common.cli_decorators._namespace_exists" + ) as mock_namespace_exists: # Mock enhanced error handling mock_model_validation.return_value = True # Allow test model-id @@ -43,9 +57,9 @@ def test_js_create_with_required_args(): mock_load_schema.return_value = { "properties": { "model_id": {"type": "string"}, - "instance_type": {"type": "string"} + "instance_type": {"type": "string"}, }, - "required": ["model_id", "instance_type"] + "required": ["model_id", "instance_type"], } # Prepare mock model-to-domain mapping mock_model_class = Mock() @@ -57,16 +71,24 @@ def test_js_create_with_required_args(): mock_endpoint_class.model_construct.return_value = domain_obj jreg.SCHEMA_REGISTRY.clear() - jreg.SCHEMA_REGISTRY['1.0'] = mock_model_class + jreg.SCHEMA_REGISTRY["1.0"] = mock_model_class runner = CliRunner() - result = runner.invoke(js_create, [ - '--namespace', 'test-ns', - '--version', '1.0', - '--model-id', 'test-model-id', - '--instance-type', 'ml.t2.micro', - '--endpoint-name', 'test-endpoint' - ]) + result = runner.invoke( + js_create, + [ + "--namespace", + "test-ns", + "--version", + "1.0", + "--model-id", + "test-model-id", + "--instance-type", + "ml.t2.micro", + "--endpoint-name", + "test-endpoint", + ], + ) assert result.exit_code == 0, result.output domain_obj.create.assert_called_once_with(debug=False) @@ -76,37 +98,37 @@ def test_js_create_missing_required_args(): runner = CliRunner() result = runner.invoke(js_create, []) assert result.exit_code != 0 - assert 'Missing option' in result.output + assert "Missing option" in result.output -@patch('sagemaker.hyperpod.common.cli_decorators._namespace_exists') -@patch('sagemaker.hyperpod.cli.commands.inference.HPJumpStartEndpoint') +@patch("sagemaker.hyperpod.common.cli_decorators._namespace_exists") +@patch("sagemaker.hyperpod.cli.commands.inference.HPJumpStartEndpoint") def test_js_list(mock_hp, mock_namespace_exists): mock_namespace_exists.return_value = True inst = Mock() inst.list.return_value = [Mock(metadata=Mock(model_dump=lambda: {"name": "e"}))] mock_hp.model_construct.return_value = inst runner = CliRunner() - result = runner.invoke(js_list, ['--namespace', 'ns']) + result = runner.invoke(js_list, ["--namespace", "ns"]) assert result.exit_code == 0 - inst.list.assert_called_once_with('ns') + inst.list.assert_called_once_with("ns") 
-@patch('sagemaker.hyperpod.common.cli_decorators._namespace_exists') -@patch('sagemaker.hyperpod.cli.commands.inference.HPJumpStartEndpoint') +@patch("sagemaker.hyperpod.common.cli_decorators._namespace_exists") +@patch("sagemaker.hyperpod.cli.commands.inference.HPJumpStartEndpoint") def test_js_describe(mock_hp, mock_namespace_exists): mock_namespace_exists.return_value = True inst = Mock() inst.get.return_value = Mock(model_dump=lambda: {"name": "e"}) mock_hp.model_construct.return_value = inst runner = CliRunner() - result = runner.invoke(js_describe, ['--name', 'n', '--namespace', 'ns']) + result = runner.invoke(js_describe, ["--name", "n", "--namespace", "ns"]) assert result.exit_code == 0 - inst.get.assert_called_once_with('n', 'ns') + inst.get.assert_called_once_with("n", "ns") -@patch('sagemaker.hyperpod.common.cli_decorators._namespace_exists') -@patch('sagemaker.hyperpod.cli.commands.inference.HPJumpStartEndpoint') +@patch("sagemaker.hyperpod.common.cli_decorators._namespace_exists") +@patch("sagemaker.hyperpod.cli.commands.inference.HPJumpStartEndpoint") def test_js_delete(mock_hp, mock_namespace_exists): mock_namespace_exists.return_value = True inst = Mock() @@ -115,38 +137,42 @@ def test_js_delete(mock_hp, mock_namespace_exists): inst.get.return_value = ep mock_hp.model_construct.return_value = inst runner = CliRunner() - result = runner.invoke(js_delete, ['--name', 'n', '--namespace', 'ns']) + result = runner.invoke(js_delete, ["--name", "n", "--namespace", "ns"]) assert result.exit_code == 0 ep.delete.assert_called_once() -@patch('sagemaker.hyperpod.cli.commands.inference.HPJumpStartEndpoint') +@patch("sagemaker.hyperpod.cli.commands.inference.HPJumpStartEndpoint") def test_js_get_operator_logs(mock_hp): inst = Mock(get_operator_logs=Mock(return_value="ol")) mock_hp.model_construct.return_value = inst runner = CliRunner() - result = runner.invoke(js_get_operator_logs, ['--since-hours', '2']) + result = runner.invoke(js_get_operator_logs, ["--since-hours", "2"]) assert result.exit_code == 0 - assert 'ol' in result.output + assert "ol" in result.output # --------- Custom Commands --------- -@patch('sys.argv', ['pytest', '--version', '1.0']) + +@patch("sys.argv", ["pytest", "--version", "1.0"]) def test_custom_create_with_required_args(): """ Test custom_create with all required options via CLI runner, mocking schema and endpoint. 
""" # Reload the inference module with mocked sys.argv - if 'sagemaker.hyperpod.cli.commands.inference' in sys.modules: - importlib.reload(sys.modules['sagemaker.hyperpod.cli.commands.inference']) + if "sagemaker.hyperpod.cli.commands.inference" in sys.modules: + importlib.reload(sys.modules["sagemaker.hyperpod.cli.commands.inference"]) from sagemaker.hyperpod.cli.commands.inference import custom_create - with patch('sagemaker.hyperpod.cli.inference_utils.load_schema_for_version') as mock_load_schema, \ - patch('sagemaker.hyperpod.cli.commands.inference.HPEndpoint') as mock_endpoint_class: + with patch( + "sagemaker.hyperpod.cli.inference_utils.load_schema_for_version" + ) as mock_load_schema, patch( + "sagemaker.hyperpod.cli.commands.inference.HPEndpoint" + ) as mock_endpoint_class: - # Mock schema loading to include storage flags + # Mock schema loading to include storage flags (v1.0 - no intelligent routing/KV cache) mock_load_schema.return_value = { "properties": { "instance_type": {"type": "string"}, @@ -156,13 +182,18 @@ def test_custom_create_with_required_args(): "s3_region": {"type": "string"}, "image_uri": {"type": "string"}, "container_port": {"type": "integer"}, - "model_volume_mount_name": {"type": "string"} + "model_volume_mount_name": {"type": "string"}, }, "required": [ - "instance_type", "model_name", "model_source_type", - "s3_bucket_name", "s3_region", - "image_uri", "container_port", "model_volume_mount_name" - ] + "instance_type", + "model_name", + "model_source_type", + "s3_bucket_name", + "s3_region", + "image_uri", + "container_port", + "model_volume_mount_name", + ], } # Prepare mock model class mock_model_class = Mock() @@ -175,21 +206,35 @@ def test_custom_create_with_required_args(): # Patch the registry mapping creg.SCHEMA_REGISTRY.clear() - creg.SCHEMA_REGISTRY['1.0'] = mock_model_class + creg.SCHEMA_REGISTRY["1.0"] = mock_model_class runner = CliRunner() - result = runner.invoke(custom_create, [ - '--namespace', 'test-ns', - '--version', '1.0', - '--instance-type', 'ml.t2.micro', - '--model-name', 'test-model', - '--model-source-type', 's3', - '--s3-bucket-name', 'test-bucket', - '--s3-region', 'us-west-2', - '--image-uri', 'test-image:latest', - '--container-port', '8080', - '--model-volume-mount-name', 'model-volume', - '--endpoint-name', 'test-endpoint' - ]) + result = runner.invoke( + custom_create, + [ + "--namespace", + "test-ns", + "--version", + "1.0", + "--instance-type", + "ml.t2.micro", + "--model-name", + "test-model", + "--model-source-type", + "s3", + "--s3-bucket-name", + "test-bucket", + "--s3-region", + "us-west-2", + "--image-uri", + "test-image:latest", + "--container-port", + "8080", + "--model-volume-mount-name", + "model-volume", + "--endpoint-name", + "test-endpoint", + ], + ) assert result.exit_code == 0, result.output domain_obj.create.assert_called_once_with(debug=False) @@ -199,11 +244,11 @@ def test_custom_create_missing_required_args(): runner = CliRunner() result = runner.invoke(custom_create, []) assert result.exit_code != 0 - assert 'Missing option' in result.output + assert "Missing option" in result.output -@patch('sagemaker.hyperpod.cli.commands.inference.Endpoint.get') -@patch('sagemaker.hyperpod.cli.commands.inference.boto3') +@patch("sagemaker.hyperpod.cli.commands.inference.Endpoint.get") +@patch("sagemaker.hyperpod.cli.commands.inference.boto3") def test_custom_invoke_success(mock_boto3, mock_endpoint_get): mock_endpoint = Mock() mock_endpoint.endpoint_status = "InService" @@ -211,54 +256,53 @@ def 
test_custom_invoke_success(mock_boto3, mock_endpoint_get): mock_body = Mock() mock_body.read.return_value.decode.return_value = '{"ok": true}' - mock_boto3.client.return_value.invoke_endpoint.return_value = {'Body': mock_body} + mock_boto3.client.return_value.invoke_endpoint.return_value = {"Body": mock_body} runner = CliRunner() - result = runner.invoke(custom_invoke, [ - '--endpoint-name', 'ep', - '--body', '{"x": 1}' - ]) + result = runner.invoke( + custom_invoke, ["--endpoint-name", "ep", "--body", '{"x": 1}'] + ) assert result.exit_code == 0, result.output assert '"ok": true' in result.output -@patch('sagemaker.hyperpod.cli.commands.inference.boto3') +@patch("sagemaker.hyperpod.cli.commands.inference.boto3") def test_custom_invoke_invalid_json(mock_boto3): runner = CliRunner() - result = runner.invoke(custom_invoke, ['--endpoint-name', 'ep', '--body', 'bad']) + result = runner.invoke(custom_invoke, ["--endpoint-name", "ep", "--body", "bad"]) assert result.exit_code != 0 - assert 'must be valid JSON' in result.output + assert "must be valid JSON" in result.output -@patch('sagemaker.hyperpod.common.cli_decorators._namespace_exists') -@patch('sagemaker.hyperpod.cli.commands.inference.HPEndpoint') +@patch("sagemaker.hyperpod.common.cli_decorators._namespace_exists") +@patch("sagemaker.hyperpod.cli.commands.inference.HPEndpoint") def test_custom_list(mock_hp, mock_namespace_exists): mock_namespace_exists.return_value = True inst = Mock() inst.list.return_value = [Mock(metadata=Mock(model_dump=lambda: {"name": "e"}))] mock_hp.model_construct.return_value = inst runner = CliRunner() - result = runner.invoke(custom_list, ['--namespace', 'ns']) + result = runner.invoke(custom_list, ["--namespace", "ns"]) assert result.exit_code == 0 - inst.list.assert_called_once_with('ns') + inst.list.assert_called_once_with("ns") -@patch('sagemaker.hyperpod.common.cli_decorators._namespace_exists') -@patch('sagemaker.hyperpod.cli.commands.inference.HPEndpoint') +@patch("sagemaker.hyperpod.common.cli_decorators._namespace_exists") +@patch("sagemaker.hyperpod.cli.commands.inference.HPEndpoint") def test_custom_describe(mock_hp, mock_namespace_exists): mock_namespace_exists.return_value = True inst = Mock() inst.get.return_value = Mock(model_dump=lambda: {"name": "e"}) mock_hp.model_construct.return_value = inst runner = CliRunner() - result = runner.invoke(custom_describe, ['--name', 'n', '--namespace', 'ns']) + result = runner.invoke(custom_describe, ["--name", "n", "--namespace", "ns"]) assert result.exit_code == 0 - inst.get.assert_called_once_with('n', 'ns') + inst.get.assert_called_once_with("n", "ns") -@patch('sagemaker.hyperpod.common.cli_decorators._namespace_exists') -@patch('sagemaker.hyperpod.cli.commands.inference.HPEndpoint') +@patch("sagemaker.hyperpod.common.cli_decorators._namespace_exists") +@patch("sagemaker.hyperpod.cli.commands.inference.HPEndpoint") def test_custom_delete(mock_hp, mock_namespace_exists): mock_namespace_exists.return_value = True inst = Mock() @@ -267,81 +311,190 @@ def test_custom_delete(mock_hp, mock_namespace_exists): inst.get.return_value = ep mock_hp.model_construct.return_value = inst runner = CliRunner() - result = runner.invoke(custom_delete, ['--name', 'n', '--namespace', 'ns']) + result = runner.invoke(custom_delete, ["--name", "n", "--namespace", "ns"]) assert result.exit_code == 0 ep.delete.assert_called_once() -@patch('sagemaker.hyperpod.cli.commands.inference.HPEndpoint') +@patch("sagemaker.hyperpod.cli.commands.inference.HPEndpoint") def 
test_custom_get_operator_logs(mock_hp): - inst = Mock(get_operator_logs=Mock(return_value='ol')) + inst = Mock(get_operator_logs=Mock(return_value="ol")) mock_hp.model_construct.return_value = inst runner = CliRunner() - result = runner.invoke(custom_get_operator_logs, ['--since-hours', '2']) + result = runner.invoke(custom_get_operator_logs, ["--since-hours", "2"]) assert result.exit_code == 0 - assert 'ol' in result.output + assert "ol" in result.output # --------- Default Namespace Tests --------- -@patch('sagemaker.hyperpod.cli.commands.inference.HPJumpStartEndpoint') + +@patch("sagemaker.hyperpod.cli.commands.inference.HPJumpStartEndpoint") def test_js_list_default_namespace(mock_hp): inst = Mock(list=Mock(return_value=[])) mock_hp.model_construct.return_value = inst runner = CliRunner() result = runner.invoke(js_list, []) assert result.exit_code == 0 - inst.list.assert_called_once_with('default') + inst.list.assert_called_once_with("default") -@patch('sagemaker.hyperpod.cli.commands.inference.HPEndpoint') + +@patch("sagemaker.hyperpod.cli.commands.inference.HPEndpoint") def test_custom_list_default_namespace(mock_hp): inst = Mock(list=Mock(return_value=[])) mock_hp.model_construct.return_value = inst runner = CliRunner() result = runner.invoke(custom_list, []) assert result.exit_code == 0 - inst.list.assert_called_once_with('default') + inst.list.assert_called_once_with("default") + -@patch('sagemaker.hyperpod.common.cli_decorators._namespace_exists') -@patch('sagemaker.hyperpod.cli.commands.inference.HPJumpStartEndpoint') +@patch("sagemaker.hyperpod.common.cli_decorators._namespace_exists") +@patch("sagemaker.hyperpod.cli.commands.inference.HPJumpStartEndpoint") def test_js_list_pods(mock_hp, mock_namespace_exists): mock_namespace_exists.return_value = True inst = Mock(list_pods=Mock(return_value="pods")) mock_hp.model_construct.return_value = inst runner = CliRunner() - result = runner.invoke(js_list_pods, ['--namespace', 'ns', '--endpoint-name', 'js-endpoint']) + result = runner.invoke( + js_list_pods, ["--namespace", "ns", "--endpoint-name", "js-endpoint"] + ) assert result.exit_code == 0 - assert 'pods' in result.output + assert "pods" in result.output -@patch('sagemaker.hyperpod.common.cli_decorators._namespace_exists') -@patch('sagemaker.hyperpod.cli.commands.inference.HPEndpoint') + +@patch("sagemaker.hyperpod.common.cli_decorators._namespace_exists") +@patch("sagemaker.hyperpod.cli.commands.inference.HPEndpoint") def test_custom_list_pods(mock_hp, mock_namespace_exists): mock_namespace_exists.return_value = True inst = Mock(list_pods=Mock(return_value="pods")) mock_hp.model_construct.return_value = inst runner = CliRunner() - result = runner.invoke(custom_list_pods, ['--namespace', 'ns', '--endpoint-name', 'custom-endpoint']) + result = runner.invoke( + custom_list_pods, ["--namespace", "ns", "--endpoint-name", "custom-endpoint"] + ) assert result.exit_code == 0 - assert 'pods' in result.output + assert "pods" in result.output + -@patch('sagemaker.hyperpod.common.cli_decorators._namespace_exists') -@patch('sagemaker.hyperpod.cli.commands.inference.HPJumpStartEndpoint') +@patch("sagemaker.hyperpod.common.cli_decorators._namespace_exists") +@patch("sagemaker.hyperpod.cli.commands.inference.HPJumpStartEndpoint") def test_js_get_logs(mock_hp, mock_namespace_exists): mock_namespace_exists.return_value = True inst = Mock(get_logs=Mock(return_value="logs")) mock_hp.model_construct.return_value = inst runner = CliRunner() - result = runner.invoke(js_get_logs, ['--pod-name', 'p', 
'--namespace', 'ns']) + result = runner.invoke(js_get_logs, ["--pod-name", "p", "--namespace", "ns"]) assert result.exit_code == 0 - assert 'logs' in result.output + assert "logs" in result.output -@patch('sagemaker.hyperpod.common.cli_decorators._namespace_exists') -@patch('sagemaker.hyperpod.cli.commands.inference.HPEndpoint') + +@patch("sagemaker.hyperpod.common.cli_decorators._namespace_exists") +@patch("sagemaker.hyperpod.cli.commands.inference.HPEndpoint") def test_custom_get_logs(mock_hp, mock_namespace_exists): mock_namespace_exists.return_value = True - inst = Mock(get_logs=Mock(return_value='l')) + inst = Mock(get_logs=Mock(return_value="l")) mock_hp.model_construct.return_value = inst runner = CliRunner() - result = runner.invoke(custom_get_logs, ['--pod-name', 'p', '--namespace', 'ns']) + result = runner.invoke(custom_get_logs, ["--pod-name", "p", "--namespace", "ns"]) assert result.exit_code == 0 - assert 'l' in result.output + assert "l" in result.output + + +@patch("sys.argv", ["pytest", "--version", "1.1"]) +def test_custom_create_with_intelligent_routing_and_kv_cache(): + """Test custom_create with intelligent routing and KV cache options.""" + + # Patch BEFORE reloading the module + with patch( + "sagemaker.hyperpod.cli.inference_utils.load_schema_for_version" + ) as mock_load_schema, patch( + "sagemaker.hyperpod.cli.commands.inference.HPEndpoint" + ) as mock_endpoint_class: + # Set up the schema mock first + mock_load_schema.return_value = { + "properties": { + "instance_type": {"type": "string"}, + "model_name": {"type": "string"}, + "model_source_type": {"type": "string", "enum": ["s3", "fsx"]}, + "s3_bucket_name": {"type": "string"}, + "s3_region": {"type": "string"}, + "image_uri": {"type": "string"}, + "container_port": {"type": "integer"}, + "model_volume_mount_name": {"type": "string"}, + "intelligent_routing_enabled": {"type": "boolean"}, + "routing_strategy": {"type": "string"}, + "enable_l1_cache": {"type": "boolean"}, + "enable_l2_cache": {"type": "boolean"}, + "l2_cache_backend": {"type": "string"}, + "l2_cache_local_url": {"type": "string"}, + }, + "required": [ + "instance_type", + "model_name", + "model_source_type", + "s3_bucket_name", + "s3_region", + "image_uri", + "container_port", + "model_volume_mount_name", + ], + } + + # Set up the registry mock + mock_model_class = Mock() + mock_model_instance = Mock() + domain_obj = Mock() + domain_obj.create = Mock() + mock_model_instance.to_domain.return_value = domain_obj + mock_model_class.return_value = mock_model_instance + mock_endpoint_class.model_construct.return_value = domain_obj + + with patch.object(creg, "SCHEMA_REGISTRY", new={"1.1": mock_model_class}): + # NOW reload the module with all patches in place + if "sagemaker.hyperpod.cli.commands.inference" in sys.modules: + importlib.reload( + sys.modules["sagemaker.hyperpod.cli.commands.inference"] + ) + + from sagemaker.hyperpod.cli.commands.inference import custom_create + + runner = CliRunner() + result = runner.invoke( + custom_create, + [ + "--version", + "1.1", + "--instance-type", + "ml.g5.xlarge", + "--model-name", + "test-model", + "--model-source-type", + "s3", + "--s3-bucket-name", + "test-bucket", + "--s3-region", + "us-west-2", + "--image-uri", + "test-image:latest", + "--container-port", + "8080", + "--model-volume-mount-name", + "model-volume", + "--intelligent-routing-enabled", + "true", + "--routing-strategy", + "prefixaware", + "--enable-l1-cache", + "true", + "--enable-l2-cache", + "true", + "--l2-cache-backend", + 
"redis/sagemaker", + "--l2-cache-local-url", + "redis://redis.redis-system.svc.cluster.local:6379", + ], + ) + + assert result.exit_code == 0, result.output + domain_obj.create.assert_called_once_with(debug=False) diff --git a/test/unit_tests/inference/test_hp_endpoint.py b/test/unit_tests/inference/test_hp_endpoint.py index 74bf6b7c..10a69a72 100644 --- a/test/unit_tests/inference/test_hp_endpoint.py +++ b/test/unit_tests/inference/test_hp_endpoint.py @@ -3,8 +3,15 @@ from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint from sagemaker.hyperpod.inference.config.hp_endpoint_config import ( CloudWatchTrigger, + CloudWatchTriggerList, + PrometheusTrigger, + PrometheusTriggerList, Dimensions, AutoScalingSpec, + IntelligentRoutingSpec, + KvCacheSpec, + L2CacheSpec, + LoadBalancer, Metrics, S3Storage, ModelSourceConfig, @@ -83,6 +90,30 @@ def setUp(self): # Create metrics metrics = Metrics(enabled=True) + # Create intelligent routing spec + intelligent_routing_spec = IntelligentRoutingSpec( + enabled=True, + routing_strategy="prefixaware", + auto_scaling_spec=auto_scaling_spec + ) + + # Create KV cache spec + l2_cache_spec = L2CacheSpec( + l2_cache_backend="redis", + l2_cache_local_url="redis://localhost:6379" + ) + kv_cache_spec = KvCacheSpec( + enable_l1_cache=True, + enable_l2_cache=True, + l2_cache_spec=l2_cache_spec + ) + + # Create load balancer + load_balancer = LoadBalancer( + health_check_path="/health", + routing_algorithm="least_outstanding_requests" + ) + self.endpoint = HPEndpoint( endpoint_name="s3-test-endpoint-name", instance_type="ml.g5.xlarge", @@ -91,6 +122,9 @@ def setUp(self): model_source_config=model_source_config, worker=worker, auto_scaling_spec=auto_scaling_spec, + intelligent_routing_spec=intelligent_routing_spec, + kv_cache_spec=kv_cache_spec, + load_balancer=load_balancer, metrics=metrics, )