diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/Chart.lock b/helm_chart/HyperPodHelmChart/charts/inference-operator/Chart.lock index b943caab..8c71acfd 100644 --- a/helm_chart/HyperPodHelmChart/charts/inference-operator/Chart.lock +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/Chart.lock @@ -13,9 +13,9 @@ dependencies: version: 1.13.2 - name: cert-manager repository: https://charts.jetstack.io - version: v1.18.0 + version: v1.18.2 - name: keda repository: https://kedacore.github.io/charts version: 2.17.1 -digest: sha256:5f877809dfd7c4d13b13f3de92e0824c28f80ed3abcf7c54f11764d9aeabbeba -generated: "2025-06-19T22:21:36.075156362Z" +digest: sha256:f54ece80a00cb4da98440551765d9c660a0704d6b59f4f9030a5a9e86eab4eea +generated: "2025-10-27T17:20:29.746399171Z" diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/Chart.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/Chart.yaml index 48c2b979..3717fd6c 100644 --- a/helm_chart/HyperPodHelmChart/charts/inference-operator/Chart.yaml +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/Chart.yaml @@ -15,13 +15,13 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 0.1.0 +version: 1.0.0 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to # follow Semantic Versioning. They should reflect the version the application is using. # It is recommended to use it with quotes. -appVersion: "1.16.0" +appVersion: "2.0" dependencies: - name: aws-mountpoint-s3-csi-driver @@ -45,7 +45,7 @@ dependencies: condition: alb.enabled - name: cert-manager alias: cert-manager - version: v1.18.0 + version: v1.18.2 repository: "https://charts.jetstack.io" condition: cert-manager.enabled - name: keda diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/crd/inference.sagemaker.aws.amazon.com_inferenceendpointconfigs.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/crd/inference.sagemaker.aws.amazon.com_inferenceendpointconfigs.yaml index de3f762e..7f43c89a 100644 --- a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/crd/inference.sagemaker.aws.amazon.com_inferenceendpointconfigs.yaml +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/crd/inference.sagemaker.aws.amazon.com_inferenceendpointconfigs.yaml @@ -1,11 +1,21 @@ ---- apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: + cert-manager.io/inject-ca-from: '{{ .Values.shortPrefix }}-system/serving-cert' controller-gen.kubebuilder.io/version: v0.16.4 name: inferenceendpointconfigs.inference.sagemaker.aws.amazon.com spec: + conversion: + strategy: Webhook + webhook: + clientConfig: + service: + name: '{{ .Values.namePrefix }}-conversion-webhook' + namespace: '{{ .Values.shortPrefix }}-system' + path: /convert + conversionReviewVersions: + - v1 group: inference.sagemaker.aws.amazon.com names: kind: InferenceEndpointConfig @@ -14,6 +24,1365 @@ spec: singular: inferenceendpointconfig scope: Namespaced versions: + - name: v1 + schema: + openAPIV3Schema: + description: InferenceEndpointConfig is the Schema for the inferenceendpointconfigs + API. 
+ properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: InferenceEndpointConfigSpec defines the desired state of + InferenceEndpointConfig. + properties: + InitialReplicaCount: + description: |- + Number of desired pods. This is a pointer to distinguish between explicit + zero and not specified. Defaults to 1. + format: int32 + type: integer + autoScalingSpec: + properties: + cloudWatchTrigger: + description: CloudWatch metric trigger to use for autoscaling + properties: + activationTargetValue: + default: 0 + description: Activation Value for CloudWatch metric to scale + from 0 to 1. Only applicable if minReplicaCount = 0 + type: number + dimensions: + description: Dimensions for Cloudwatch metrics + items: + properties: + name: + description: CloudWatch Metric dimension name + type: string + value: + description: CloudWatch Metric dimension value + type: string + required: + - name + - value + type: object + type: array + metricCollectionPeriod: + default: 300 + description: Defines the Period for CloudWatch query + format: int32 + type: integer + metricCollectionStartTime: + default: 300 + description: Defines the StartTime for CloudWatch query + format: int32 + type: integer + metricName: + description: Metric name to query for Cloudwatch trigger + type: string + metricStat: + default: Average + description: Statistics metric to be used by Trigger. Used + to define Stat for CloudWatch query. Default is Average. + type: string + metricType: + default: Average + description: 'The type of metric to be used by HPA. Enum: + AverageValue - Uses average value of metric per pod, Value + - Uses absolute metric value' + enum: + - Value + - Average + type: string + minValue: + default: 0 + description: Minimum metric value used in case of empty response + from CloudWatch. Default is 0. + type: number + name: + description: Name for the CloudWatch trigger + type: string + namespace: + description: AWS CloudWatch namespace for metric + type: string + targetValue: + description: TargetValue for CloudWatch metric + type: number + useCachedMetrics: + default: true + description: Enable caching of metric values during polling + interval. Default is true + type: boolean + type: object + cloudWatchTriggerList: + description: Multiple CloudWatch metric triggers to use for autoscaling. + Takes priority over CloudWatchTrigger if both are provided. + items: + properties: + activationTargetValue: + default: 0 + description: Activation Value for CloudWatch metric to scale + from 0 to 1. 
Only applicable if minReplicaCount = 0 + type: number + dimensions: + description: Dimensions for Cloudwatch metrics + items: + properties: + name: + description: CloudWatch Metric dimension name + type: string + value: + description: CloudWatch Metric dimension value + type: string + required: + - name + - value + type: object + type: array + metricCollectionPeriod: + default: 300 + description: Defines the Period for CloudWatch query + format: int32 + type: integer + metricCollectionStartTime: + default: 300 + description: Defines the StartTime for CloudWatch query + format: int32 + type: integer + metricName: + description: Metric name to query for Cloudwatch trigger + type: string + metricStat: + default: Average + description: Statistics metric to be used by Trigger. Used + to define Stat for CloudWatch query. Default is Average. + type: string + metricType: + default: Average + description: 'The type of metric to be used by HPA. Enum: + AverageValue - Uses average value of metric per pod, Value + - Uses absolute metric value' + enum: + - Value + - Average + type: string + minValue: + default: 0 + description: Minimum metric value used in case of empty + response from CloudWatch. Default is 0. + type: number + name: + description: Name for the CloudWatch trigger + type: string + namespace: + description: AWS CloudWatch namespace for metric + type: string + targetValue: + description: TargetValue for CloudWatch metric + type: number + useCachedMetrics: + default: true + description: Enable caching of metric values during polling + interval. Default is true + type: boolean + type: object + maxItems: 100 + type: array + cooldownPeriod: + default: 300 + description: The period to wait after the last trigger reported + active before scaling the resource back to 0. Default 300 seconds. + format: int32 + minimum: 0 + type: integer + initialCooldownPeriod: + default: 300 + description: The delay before the cooldownPeriod starts after + the initial creation of the ScaledObject. Default 300 seconds. + format: int32 + minimum: 0 + type: integer + maxReplicaCount: + default: 5 + description: The maximum number of model pods to scale to. Default + 5. + format: int32 + minimum: 0 + type: integer + minReplicaCount: + default: 1 + description: The minimum number of model pods to scale down to. + Default 1. + format: int32 + minimum: 0 + type: integer + pollingInterval: + default: 30 + description: This is the interval to check each trigger on. Default + 30 seconds. + format: int32 + minimum: 0 + type: integer + prometheusTrigger: + description: Prometheus metric trigger to use for autoscaling + properties: + activationTargetValue: + default: 0 + description: Activation Value for Prometheus metric to scale + from 0 to 1. Only applicable if minReplicaCount = 0 + type: number + customHeaders: + description: Custom headers to include while querying the + prometheus endpoint. + type: string + metricType: + default: Average + description: 'The type of metric to be used by HPA. Enum: + AverageValue - Uses average value of metric per pod, Value + - Uses absolute metric value' + enum: + - Value + - Average + type: string + name: + description: Name for the Prometheus trigger + type: string + namespace: + description: Namespace for namespaced queries + type: string + query: + description: PromQLQuery for the metric. 
+ type: string + serverAddress: + description: Server address for AMP workspace + pattern: ^https:\/\/aps-workspaces\.[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*\.amazonaws\.com\/workspaces\/ws-[a-zA-Z0-9-]+\/[a-zA-Z0-9-]+$|^$ + type: string + targetValue: + description: Target metric value for scaling + type: number + useCachedMetrics: + default: true + description: Enable caching of metric values during polling + interval. Default is true + type: boolean + type: object + prometheusTriggerList: + description: Multiple Prometheus metric triggers to use for autoscaling. + Takes priority over PrometheusTrigger if both are provided. + items: + properties: + activationTargetValue: + default: 0 + description: Activation Value for Prometheus metric to scale + from 0 to 1. Only applicable if minReplicaCount = 0 + type: number + customHeaders: + description: Custom headers to include while querying the + prometheus endpoint. + type: string + metricType: + default: Average + description: 'The type of metric to be used by HPA. Enum: + AverageValue - Uses average value of metric per pod, Value + - Uses absolute metric value' + enum: + - Value + - Average + type: string + name: + description: Name for the Prometheus trigger + type: string + namespace: + description: Namespace for namespaced queries + type: string + query: + description: PromQLQuery for the metric. + type: string + serverAddress: + description: Server address for AMP workspace + pattern: ^https:\/\/aps-workspaces\.[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*\.amazonaws\.com\/workspaces\/ws-[a-zA-Z0-9-]+\/[a-zA-Z0-9-]+$|^$ + type: string + targetValue: + description: Target metric value for scaling + type: number + useCachedMetrics: + default: true + description: Enable caching of metric values during polling + interval. Default is true + type: boolean + type: object + maxItems: 100 + type: array + scaleDownStabilizationTime: + default: 300 + description: The time window to stabilize for HPA before scaling + down. Default 300 seconds. + format: int32 + minimum: 0 + type: integer + scaleUpStabilizationTime: + default: 0 + description: The time window to stabilize for HPA before scaling + up. Default 0 seconds. + format: int32 + minimum: 0 + type: integer + type: object + endpointName: + description: |- + Name used for Sagemaker Endpoint + Name of sagemaker endpoint. Defaults to empty string which represents that Sagemaker endpoint will not be created. + maxLength: 63 + pattern: ^$|^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}$ + type: string + instanceType: + description: Instance Type to deploy the model on + pattern: ^ml\..* + type: string + intelligentRoutingSpec: + description: |- + Configuration for intelligent routing + This feature is currently not supported for existing deployments. + Adding this configuration to an existing deployment will be rejected. + properties: + autoScalingSpec: + properties: + cloudWatchTrigger: + description: CloudWatch metric trigger to use for autoscaling + properties: + activationTargetValue: + default: 0 + description: Activation Value for CloudWatch metric to + scale from 0 to 1. 
Only applicable if minReplicaCount + = 0 + type: number + dimensions: + description: Dimensions for Cloudwatch metrics + items: + properties: + name: + description: CloudWatch Metric dimension name + type: string + value: + description: CloudWatch Metric dimension value + type: string + required: + - name + - value + type: object + type: array + metricCollectionPeriod: + default: 300 + description: Defines the Period for CloudWatch query + format: int32 + type: integer + metricCollectionStartTime: + default: 300 + description: Defines the StartTime for CloudWatch query + format: int32 + type: integer + metricName: + description: Metric name to query for Cloudwatch trigger + type: string + metricStat: + default: Average + description: Statistics metric to be used by Trigger. + Used to define Stat for CloudWatch query. Default is + Average. + type: string + metricType: + default: Average + description: 'The type of metric to be used by HPA. Enum: + AverageValue - Uses average value of metric per pod, + Value - Uses absolute metric value' + enum: + - Value + - Average + type: string + minValue: + default: 0 + description: Minimum metric value used in case of empty + response from CloudWatch. Default is 0. + type: number + name: + description: Name for the CloudWatch trigger + type: string + namespace: + description: AWS CloudWatch namespace for metric + type: string + targetValue: + description: TargetValue for CloudWatch metric + type: number + useCachedMetrics: + default: true + description: Enable caching of metric values during polling + interval. Default is true + type: boolean + type: object + cloudWatchTriggerList: + description: Multiple CloudWatch metric triggers to use for + autoscaling. Takes priority over CloudWatchTrigger if both + are provided. + items: + properties: + activationTargetValue: + default: 0 + description: Activation Value for CloudWatch metric + to scale from 0 to 1. Only applicable if minReplicaCount + = 0 + type: number + dimensions: + description: Dimensions for Cloudwatch metrics + items: + properties: + name: + description: CloudWatch Metric dimension name + type: string + value: + description: CloudWatch Metric dimension value + type: string + required: + - name + - value + type: object + type: array + metricCollectionPeriod: + default: 300 + description: Defines the Period for CloudWatch query + format: int32 + type: integer + metricCollectionStartTime: + default: 300 + description: Defines the StartTime for CloudWatch query + format: int32 + type: integer + metricName: + description: Metric name to query for Cloudwatch trigger + type: string + metricStat: + default: Average + description: Statistics metric to be used by Trigger. + Used to define Stat for CloudWatch query. Default + is Average. + type: string + metricType: + default: Average + description: 'The type of metric to be used by HPA. + Enum: AverageValue - Uses average value of metric + per pod, Value - Uses absolute metric value' + enum: + - Value + - Average + type: string + minValue: + default: 0 + description: Minimum metric value used in case of empty + response from CloudWatch. Default is 0. + type: number + name: + description: Name for the CloudWatch trigger + type: string + namespace: + description: AWS CloudWatch namespace for metric + type: string + targetValue: + description: TargetValue for CloudWatch metric + type: number + useCachedMetrics: + default: true + description: Enable caching of metric values during + polling interval. 
Default is true + type: boolean + type: object + maxItems: 100 + type: array + cooldownPeriod: + default: 300 + description: The period to wait after the last trigger reported + active before scaling the resource back to 0. Default 300 + seconds. + format: int32 + minimum: 0 + type: integer + initialCooldownPeriod: + default: 300 + description: The delay before the cooldownPeriod starts after + the initial creation of the ScaledObject. Default 300 seconds. + format: int32 + minimum: 0 + type: integer + maxReplicaCount: + default: 5 + description: The maximum number of model pods to scale to. + Default 5. + format: int32 + minimum: 0 + type: integer + minReplicaCount: + default: 1 + description: The minimum number of model pods to scale down + to. Default 1. + format: int32 + minimum: 0 + type: integer + pollingInterval: + default: 30 + description: This is the interval to check each trigger on. + Default 30 seconds. + format: int32 + minimum: 0 + type: integer + prometheusTrigger: + description: Prometheus metric trigger to use for autoscaling + properties: + activationTargetValue: + default: 0 + description: Activation Value for Prometheus metric to + scale from 0 to 1. Only applicable if minReplicaCount + = 0 + type: number + customHeaders: + description: Custom headers to include while querying + the prometheus endpoint. + type: string + metricType: + default: Average + description: 'The type of metric to be used by HPA. Enum: + AverageValue - Uses average value of metric per pod, + Value - Uses absolute metric value' + enum: + - Value + - Average + type: string + name: + description: Name for the Prometheus trigger + type: string + namespace: + description: Namespace for namespaced queries + type: string + query: + description: PromQLQuery for the metric. + type: string + serverAddress: + description: Server address for AMP workspace + pattern: ^https:\/\/aps-workspaces\.[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*\.amazonaws\.com\/workspaces\/ws-[a-zA-Z0-9-]+\/[a-zA-Z0-9-]+$|^$ + type: string + targetValue: + description: Target metric value for scaling + type: number + useCachedMetrics: + default: true + description: Enable caching of metric values during polling + interval. Default is true + type: boolean + type: object + prometheusTriggerList: + description: Multiple Prometheus metric triggers to use for + autoscaling. Takes priority over PrometheusTrigger if both + are provided. + items: + properties: + activationTargetValue: + default: 0 + description: Activation Value for Prometheus metric + to scale from 0 to 1. Only applicable if minReplicaCount + = 0 + type: number + customHeaders: + description: Custom headers to include while querying + the prometheus endpoint. + type: string + metricType: + default: Average + description: 'The type of metric to be used by HPA. + Enum: AverageValue - Uses average value of metric + per pod, Value - Uses absolute metric value' + enum: + - Value + - Average + type: string + name: + description: Name for the Prometheus trigger + type: string + namespace: + description: Namespace for namespaced queries + type: string + query: + description: PromQLQuery for the metric. 
+ type: string + serverAddress: + description: Server address for AMP workspace + pattern: ^https:\/\/aps-workspaces\.[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*\.amazonaws\.com\/workspaces\/ws-[a-zA-Z0-9-]+\/[a-zA-Z0-9-]+$|^$ + type: string + targetValue: + description: Target metric value for scaling + type: number + useCachedMetrics: + default: true + description: Enable caching of metric values during + polling interval. Default is true + type: boolean + type: object + maxItems: 100 + type: array + scaleDownStabilizationTime: + default: 300 + description: The time window to stabilize for HPA before scaling + down. Default 300 seconds. + format: int32 + minimum: 0 + type: integer + scaleUpStabilizationTime: + default: 0 + description: The time window to stabilize for HPA before scaling + up. Default 0 seconds. + format: int32 + minimum: 0 + type: integer + type: object + enabled: + default: false + description: Once set, the enabled field cannot be modified + type: boolean + routingStrategy: + default: prefixaware + enum: + - prefixaware + - kvaware + - session + - roundrobin + type: string + type: object + invocationEndpoint: + default: invocations + description: |- + The invocation endpoint of the model server. http://:/ would be pre-populated based on the other fields. + Please fill in the path after http://:/ specific to your model server. + type: string + kvCacheSpec: + description: |- + Configuration for KV Cache specification + By default L1CacheOffloading will be enabled + properties: + cacheConfigFile: + description: KVCache configuration file path. If specified, override + other configurations provided via spec + type: string + enableL1Cache: + default: true + description: Enable CPU offloading + type: boolean + enableL2Cache: + default: false + type: boolean + l2CacheSpec: + description: Configuration for providing L2 Cache offloading + properties: + l2CacheBackend: + description: L2 cache backend type. Required when L2CacheSpec + is provided. + pattern: (?i)redis + type: string + l2CacheLocalUrl: + description: Provide the L2 cache URL to local storage + type: string + type: object + type: object + loadBalancer: + description: Configuration for Application Load Balancer + properties: + healthCheckPath: + default: /ping + description: Health check path for the ALB target group. Defaults + to /ping if not specified. + pattern: ^/.* + type: string + routingAlgorithm: + default: least_outstanding_requests + description: Routing algorithm for the ALB target group (least_oustanding_requests + or round_robin) + enum: + - least_outstanding_requests + - round_robin + type: string + type: object + metrics: + description: Configuration for metrics collection and exposure + properties: + enabled: + default: true + description: Enable metrics collection for this model deployment + type: boolean + metricsScrapeIntervalSeconds: + default: 15 + description: Scrape interval in seconds for metrics collection + from sidecar and model container. + format: int32 + maximum: 300 + minimum: 5 + type: integer + modelMetrics: + description: Configuration for model container metrics scraping + properties: + path: + default: /metrics + description: Path where the model exposes metrics + pattern: ^/[a-zA-Z0-9\-_.\/]*$ + type: string + port: + default: 8080 + description: Port where the model exposes metrics. If not + specified, a default port will be used. 
+ format: int32 + maximum: 65535 + minimum: 1024 + type: integer + type: object + type: object + modelName: + description: Name of model that will be created on Sagemaker + maxLength: 63 + pattern: ^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62} + type: string + modelSourceConfig: + properties: + fsxStorage: + properties: + dnsName: + description: FSX File System DNS Name + type: string + fileSystemId: + description: FSX File System ID + type: string + mountName: + description: FSX File System Mount Name + type: string + required: + - fileSystemId + type: object + modelLocation: + description: Specific location where the model data exists + type: string + modelSourceType: + enum: + - fsx + - s3 + type: string + prefetchEnabled: + default: false + description: In case the model seems to fit within the instance's + memory (VRAM), this option can be used to pre-fetch the model + to RAM and then the inference server will load to the GPU/CPU + device thereafter. + type: boolean + s3Storage: + properties: + bucketName: + description: S3 bucket location + type: string + region: + description: S3 bucket region + type: string + required: + - bucketName + - region + type: object + required: + - modelSourceType + type: object + modelVersion: + description: Version of the model used in creating sagemaker endpoint + type: string + replicas: + default: 1 + description: The desired number of inference server replicas. Default + 1. + format: int32 + type: integer + tags: + description: Mentions the tags to be added to the Sagemaker Endpoint + items: + properties: + name: + type: string + value: + type: string + required: + - name + - value + type: object + type: array + tlsConfig: + description: Configurations for TLS + properties: + tlsCertificateOutputS3Uri: + pattern: ^s3://([^/]+)/?(.*)$ + type: string + type: object + worker: + description: Details of the worker + properties: + args: + description: Defines the Arguments to the entrypoint. + items: + type: string + type: array + command: + description: Defines the Command which is Entrypoint array. Not + executed within a shell. + items: + type: string + type: array + environmentVariables: + description: |- + List of environment variables to set in the container. + Cannot be updated. + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. Must be a + C_IDENTIFIER. + type: string + value: + description: |- + Variable references $(VAR_NAME) are expanded + using the previously defined environment variables in the container and + any service environment variables. If a variable cannot be resolved, + the reference in the input string will be unchanged. Double $$ are reduced + to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. + "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". + Escaped references will never be expanded, regardless of whether the variable + exists or not. + Defaults to "". + type: string + valueFrom: + description: Source for the environment variable's value. + Cannot be used if value is not empty. + properties: + configMapKeyRef: + description: Selects a key of a ConfigMap. + properties: + key: + description: The key to select. + type: string + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. 
+ More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the ConfigMap or its + key must be defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + fieldRef: + description: |- + Selects a field of the pod: supports metadata.name, metadata.namespace, `metadata.labels['']`, `metadata.annotations['']`, + spec.nodeName, spec.serviceAccountName, status.hostIP, status.podIP, status.podIPs. + properties: + apiVersion: + description: Version of the schema the FieldPath + is written in terms of, defaults to "v1". + type: string + fieldPath: + description: Path of the field to select in the + specified API version. + type: string + required: + - fieldPath + type: object + x-kubernetes-map-type: atomic + resourceFieldRef: + description: |- + Selects a resource of the container: only resources limits and requests + (limits.cpu, limits.memory, limits.ephemeral-storage, requests.cpu, requests.memory and requests.ephemeral-storage) are currently supported. + properties: + containerName: + description: 'Container name: required for volumes, + optional for env vars' + type: string + divisor: + anyOf: + - type: integer + - type: string + description: Specifies the output format of the + exposed resources, defaults to "1" + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + resource: + description: 'Required: resource to select' + type: string + required: + - resource + type: object + x-kubernetes-map-type: atomic + secretKeyRef: + description: Selects a key of a secret in the pod's + namespace + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + optional: + description: Specify whether the Secret or its key + must be defined + type: boolean + required: + - key + type: object + x-kubernetes-map-type: atomic + type: object + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + image: + description: The name of the inference server image to be used + type: string + modelInvocationPort: + description: Defines the port at which the model server will listen + to the invocation requests. + properties: + containerPort: + description: Port on which the model server will be listening + format: int32 + maximum: 65535 + minimum: 1 + type: integer + name: + default: http + description: |- + This is name for the port within the deployed container where the model will listen. + This will be referred to by the Load Balancer Service. + This must be an IANA_SVC_NAME (for eg. http) and unique within the pod. 
+ pattern: ^http$|^grpc$ + type: string + required: + - containerPort + type: object + modelVolumeMount: + description: Defines the volume where model will be loaded + properties: + mountPath: + default: /opt/ml/model + description: This is the path within the container where the + model data will be available for the inference server to + load it to GPU,CPU or other device + type: string + name: + description: Name of the model volume mount + type: string + required: + - name + type: object + resources: + description: Defines the Resources in terms of CPU, GPU, Memory + needed for the model to be deployed + properties: + claims: + description: |- + Claims lists the names of resources, defined in spec.resourceClaims, + that are used by this container. + + This is an alpha field and requires enabling the + DynamicResourceAllocation feature gate. + + This field is immutable. It can only be set for containers. + items: + description: ResourceClaim references one entry in PodSpec.ResourceClaims. + properties: + name: + description: |- + Name must match the name of one entry in pod.spec.resourceClaims of + the Pod where this field is used. It makes that resource available + inside a container. + type: string + request: + description: |- + Request is the name chosen for a request in the referenced claim. + If empty, everything from the claim is made available, otherwise + only the result of this request. + type: string + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + workingDir: + description: Defines the working directory of container. + type: string + required: + - image + - modelInvocationPort + - modelVolumeMount + - resources + type: object + required: + - instanceType + - modelName + - modelSourceConfig + - worker + type: object + status: + description: ModelDeploymentStatus defines the observed state of ModelDeployment + properties: + conditions: + description: Detailed conditions representing the state of the deployment + items: + description: Condition contains details for one aspect of the current + state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. 
+ format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + deploymentStatus: + description: Details of the native kubernetes deployment that hosts + the model + properties: + deploymentObjectOverallState: + description: Overall State of the Deployment Object + type: string + lastUpdated: + description: Last Update Time + format: date-time + type: string + message: + description: Message populated in the root CRD while updating + the status of underlying Deployment + type: string + name: + description: Name of the Deployment Object + type: string + reason: + description: Reason populated in the root CRD while updating the + status of underlying Deployment + type: string + status: + description: Status of the Deployment Object + properties: + availableReplicas: + description: Total number of available pods (ready for at + least minReadySeconds) targeted by this deployment. + format: int32 + type: integer + collisionCount: + description: |- + Count of hash collisions for the Deployment. The Deployment controller uses this + field as a collision avoidance mechanism when it needs to create the name for the + newest ReplicaSet. + format: int32 + type: integer + conditions: + description: Represents the latest available observations + of a deployment's current state. + items: + description: DeploymentCondition describes the state of + a deployment at a certain point. + properties: + lastTransitionTime: + description: Last time the condition transitioned from + one status to another. + format: date-time + type: string + lastUpdateTime: + description: The last time this condition was updated. + format: date-time + type: string + message: + description: A human readable message indicating details + about the transition. + type: string + reason: + description: The reason for the condition's last transition. + type: string + status: + description: Status of the condition, one of True, False, + Unknown. + type: string + type: + description: Type of deployment condition. 
+ type: string + required: + - status + - type + type: object + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map + observedGeneration: + description: The generation observed by the deployment controller. + format: int64 + type: integer + readyReplicas: + description: readyReplicas is the number of pods targeted + by this Deployment with a Ready Condition. + format: int32 + type: integer + replicas: + description: Total number of non-terminated pods targeted + by this deployment (their labels match the selector). + format: int32 + type: integer + unavailableReplicas: + description: |- + Total number of unavailable pods targeted by this deployment. This is the total number of + pods that are still required for the deployment to have 100% available capacity. They may + either be pods that are running but not yet available or pods that still have not been created. + format: int32 + type: integer + updatedReplicas: + description: Total number of non-terminated pods targeted + by this deployment that have the desired template spec. + format: int32 + type: integer + type: object + required: + - lastUpdated + - name + type: object + endpoints: + description: EndpointStatus contains the status of SageMaker endpoints + properties: + sagemaker: + description: Status of the SageMaker endpoint + properties: + configArn: + description: The Amazon Resource Name (ARN) of the endpoint + configuration. + pattern: (arn:aws[a-z\-]*:sagemaker:[a-z0-9\-]*:[0-9]{12}:endpoint-config/.*|^$) + type: string + endpointArn: + description: The Amazon Resource Name (ARN) of the SageMaker + endpoint + pattern: (arn:aws[a-z\-]*:sagemaker:[a-z0-9\-]*:[0-9]{12}:endpoint/.*|^$) + type: string + modelArn: + description: The ARN of the model created in SageMaker. + pattern: (arn:aws[a-z\-]*:sagemaker:[a-z0-9\-]*:[0-9]{12}:model/.*|^$) + type: string + state: + description: The current state of the SageMaker endpoint + type: string + required: + - state + type: object + type: object + metricsStatus: + description: Status of metrics collection + properties: + enabled: + description: Whether metrics collection is enabled + type: boolean + errorMessage: + description: Error message if metrics collection is in error state + type: string + metricsScrapeIntervalSeconds: + description: Scrape interval in seconds for metrics collection + from sidecar and model container. + format: int32 + type: integer + modelMetrics: + description: Status of model container metrics collection + properties: + path: + description: The path where metrics are available + type: string + port: + description: The port on which metrics are exposed + format: int32 + type: integer + type: object + state: + description: Current state of metrics collection + type: string + required: + - enabled + type: object + observedGeneration: + description: Latest generation reconciled by controller + format: int64 + type: integer + replicas: + description: The observed number of inference server replicas. + format: int32 + type: integer + selector: + description: LabelSelector for the deployment. 
+ type: string + state: + description: Current phase of the model deployment + enum: + - DeploymentPending + - DeploymentInProgress + - DeploymentFailed + - DeploymentComplete + - DeletionPending + - DeletionInProgress + - DeletionFailed + - DeletionComplete + type: string + tlsCertificate: + description: CertificateStatus represents the status of TLS certificates + properties: + certificateARN: + description: The Amazon Resource Name (ARN) of the ACM certificate + pattern: arn:aws[a-z\-]*:acm:[a-z0-9\-]*:[0-9]{12}:certificate/.* + type: string + certificateDomainNames: + description: The certificate domain names that is attached to + the certificate + items: + type: string + type: array + certificateName: + description: The certificate name of cert manager + type: string + importedCertificates: + description: Used for tracking the imported certificates to ACM + items: + type: string + type: array + issuerName: + description: The issuer name of cert manager + type: string + lastCertExpiryTime: + description: The last certificate expiry time + format: date-time + type: string + tlsCertificateOutputS3Bucket: + description: S3 bucket that stores the certificate that needs + to be trusted + type: string + tlsCertificateS3Keys: + description: The output tls certificate S3 key that points to + the .pem file + items: + type: string + type: array + type: object + type: object + type: object + served: true + storage: true + subresources: + scale: + labelSelectorPath: .status.selector + specReplicasPath: .spec.replicas + statusReplicasPath: .status.replicas + status: {} - name: v1alpha1 schema: openAPIV3Schema: @@ -214,6 +1583,7 @@ spec: type: object endpointName: description: |- + Name used for Sagemaker Endpoint Name of a SageMaker endpoint to be created for this InferenceEndpointConfig. The default value of empty string, when used, will skip endpoint creation. maxLength: 63 @@ -229,6 +1599,30 @@ spec: The invocation endpoint of the model server. http://:/ would be pre-populated based on the other fields. Please fill in the path after http://:/ specific to your model server. type: string + loadBalancer: + description: Configuration for Application Load Balancer + properties: + healthCheckPath: + default: /ping + description: Health check path for the ALB target group. Defaults + to /ping if not specified. + pattern: ^/.* + type: string + routingAlgorithm: + default: least_outstanding_requests + description: Routing algorithm for the ALB target group (least_oustanding_requests + or round_robin) + enum: + - least_outstanding_requests + - round_robin + type: string + type: object + maxDeployTimeInSeconds: + default: 3600 + description: Maximum allowed time in seconds for the deployment to + complete before timing out. 
Defaults to 1 hour (3600 seconds) + format: int32 + type: integer metrics: description: Configuration for metrics collection and exposure properties: @@ -284,7 +1678,7 @@ spec: - fileSystemId type: object modelLocation: - description: Sepcific location where the model data exists + description: Specific location where the model data exists type: string modelSourceType: enum: @@ -870,7 +2264,7 @@ spec: type: object type: object served: true - storage: true + storage: false subresources: scale: labelSelectorPath: .status.selector diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/crd/inference.sagemaker.aws.amazon.com_jumpstartmodels.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/crd/inference.sagemaker.aws.amazon.com_jumpstartmodels.yaml index 342de2bb..68ea257e 100644 --- a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/crd/inference.sagemaker.aws.amazon.com_jumpstartmodels.yaml +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/crd/inference.sagemaker.aws.amazon.com_jumpstartmodels.yaml @@ -1,11 +1,21 @@ ---- apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: + cert-manager.io/inject-ca-from: '{{ .Values.shortPrefix }}-system/serving-cert' controller-gen.kubebuilder.io/version: v0.16.4 name: jumpstartmodels.inference.sagemaker.aws.amazon.com spec: + conversion: + strategy: Webhook + webhook: + clientConfig: + service: + name: '{{ .Values.namePrefix }}-conversion-webhook' + namespace: '{{ .Values.shortPrefix }}-system' + path: /convert + conversionReviewVersions: + - v1 group: inference.sagemaker.aws.amazon.com names: kind: JumpStartModel @@ -14,6 +24,780 @@ spec: singular: jumpstartmodel scope: Namespaced versions: + - name: v1 + schema: + openAPIV3Schema: + description: JumpStartModel is the Schema for the jumpstartmodels API. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: JumpStartModelSpec defines the desired state of JumpStartModel. + properties: + autoScalingSpec: + properties: + cloudWatchTrigger: + description: CloudWatch metric trigger to use for autoscaling + properties: + activationTargetValue: + default: 0 + description: Activation Value for CloudWatch metric to scale + from 0 to 1. 
Only applicable if minReplicaCount = 0 + type: number + dimensions: + description: Dimensions for Cloudwatch metrics + items: + properties: + name: + description: CloudWatch Metric dimension name + type: string + value: + description: CloudWatch Metric dimension value + type: string + required: + - name + - value + type: object + type: array + metricCollectionPeriod: + default: 300 + description: Defines the Period for CloudWatch query + format: int32 + type: integer + metricCollectionStartTime: + default: 300 + description: Defines the StartTime for CloudWatch query + format: int32 + type: integer + metricName: + description: Metric name to query for Cloudwatch trigger + type: string + metricStat: + default: Average + description: Statistics metric to be used by Trigger. Used + to define Stat for CloudWatch query. Default is Average. + type: string + metricType: + default: Average + description: 'The type of metric to be used by HPA. Enum: + AverageValue - Uses average value of metric per pod, Value + - Uses absolute metric value' + enum: + - Value + - Average + type: string + minValue: + default: 0 + description: Minimum metric value used in case of empty response + from CloudWatch. Default is 0. + type: number + name: + description: Name for the CloudWatch trigger + type: string + namespace: + description: AWS CloudWatch namespace for metric + type: string + targetValue: + description: TargetValue for CloudWatch metric + type: number + useCachedMetrics: + default: true + description: Enable caching of metric values during polling + interval. Default is true + type: boolean + type: object + cloudWatchTriggerList: + description: Multiple CloudWatch metric triggers to use for autoscaling. + Takes priority over CloudWatchTrigger if both are provided. + items: + properties: + activationTargetValue: + default: 0 + description: Activation Value for CloudWatch metric to scale + from 0 to 1. Only applicable if minReplicaCount = 0 + type: number + dimensions: + description: Dimensions for Cloudwatch metrics + items: + properties: + name: + description: CloudWatch Metric dimension name + type: string + value: + description: CloudWatch Metric dimension value + type: string + required: + - name + - value + type: object + type: array + metricCollectionPeriod: + default: 300 + description: Defines the Period for CloudWatch query + format: int32 + type: integer + metricCollectionStartTime: + default: 300 + description: Defines the StartTime for CloudWatch query + format: int32 + type: integer + metricName: + description: Metric name to query for Cloudwatch trigger + type: string + metricStat: + default: Average + description: Statistics metric to be used by Trigger. Used + to define Stat for CloudWatch query. Default is Average. + type: string + metricType: + default: Average + description: 'The type of metric to be used by HPA. Enum: + AverageValue - Uses average value of metric per pod, Value + - Uses absolute metric value' + enum: + - Value + - Average + type: string + minValue: + default: 0 + description: Minimum metric value used in case of empty + response from CloudWatch. Default is 0. + type: number + name: + description: Name for the CloudWatch trigger + type: string + namespace: + description: AWS CloudWatch namespace for metric + type: string + targetValue: + description: TargetValue for CloudWatch metric + type: number + useCachedMetrics: + default: true + description: Enable caching of metric values during polling + interval. 
Default is true + type: boolean + type: object + maxItems: 100 + type: array + cooldownPeriod: + default: 300 + description: The period to wait after the last trigger reported + active before scaling the resource back to 0. Default 300 seconds. + format: int32 + minimum: 0 + type: integer + initialCooldownPeriod: + default: 300 + description: The delay before the cooldownPeriod starts after + the initial creation of the ScaledObject. Default 300 seconds. + format: int32 + minimum: 0 + type: integer + maxReplicaCount: + default: 5 + description: The maximum number of model pods to scale to. Default + 5. + format: int32 + minimum: 0 + type: integer + minReplicaCount: + default: 1 + description: The minimum number of model pods to scale down to. + Default 1. + format: int32 + minimum: 0 + type: integer + pollingInterval: + default: 30 + description: This is the interval to check each trigger on. Default + 30 seconds. + format: int32 + minimum: 0 + type: integer + prometheusTrigger: + description: Prometheus metric trigger to use for autoscaling + properties: + activationTargetValue: + default: 0 + description: Activation Value for Prometheus metric to scale + from 0 to 1. Only applicable if minReplicaCount = 0 + type: number + customHeaders: + description: Custom headers to include while querying the + prometheus endpoint. + type: string + metricType: + default: Average + description: 'The type of metric to be used by HPA. Enum: + AverageValue - Uses average value of metric per pod, Value + - Uses absolute metric value' + enum: + - Value + - Average + type: string + name: + description: Name for the Prometheus trigger + type: string + namespace: + description: Namespace for namespaced queries + type: string + query: + description: PromQLQuery for the metric. + type: string + serverAddress: + description: Server address for AMP workspace + pattern: ^https:\/\/aps-workspaces\.[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*\.amazonaws\.com\/workspaces\/ws-[a-zA-Z0-9-]+\/[a-zA-Z0-9-]+$|^$ + type: string + targetValue: + description: Target metric value for scaling + type: number + useCachedMetrics: + default: true + description: Enable caching of metric values during polling + interval. Default is true + type: boolean + type: object + prometheusTriggerList: + description: Multiple Prometheus metric triggers to use for autoscaling. + Takes priority over PrometheusTrigger if both are provided. + items: + properties: + activationTargetValue: + default: 0 + description: Activation Value for Prometheus metric to scale + from 0 to 1. Only applicable if minReplicaCount = 0 + type: number + customHeaders: + description: Custom headers to include while querying the + prometheus endpoint. + type: string + metricType: + default: Average + description: 'The type of metric to be used by HPA. Enum: + AverageValue - Uses average value of metric per pod, Value + - Uses absolute metric value' + enum: + - Value + - Average + type: string + name: + description: Name for the Prometheus trigger + type: string + namespace: + description: Namespace for namespaced queries + type: string + query: + description: PromQLQuery for the metric. 
+ type: string + serverAddress: + description: Server address for AMP workspace + pattern: ^https:\/\/aps-workspaces\.[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*\.amazonaws\.com\/workspaces\/ws-[a-zA-Z0-9-]+\/[a-zA-Z0-9-]+$|^$ + type: string + targetValue: + description: Target metric value for scaling + type: number + useCachedMetrics: + default: true + description: Enable caching of metric values during polling + interval. Default is true + type: boolean + type: object + maxItems: 100 + type: array + scaleDownStabilizationTime: + default: 300 + description: The time window to stabilize for HPA before scaling + down. Default 300 seconds. + format: int32 + minimum: 0 + type: integer + scaleUpStabilizationTime: + default: 0 + description: The time window to stabilize for HPA before scaling + up. Default 0 seconds. + format: int32 + minimum: 0 + type: integer + type: object + environmentVariables: + description: Additional environment variables to be passed to the + inference server. Limited to 100 key-value pairs. + items: + properties: + name: + type: string + value: + type: string + required: + - name + - value + type: object + maxItems: 100 + type: array + loadBalancer: + description: Configuration for Application Load Balancer + properties: + healthCheckPath: + default: /ping + description: Health check path for the ALB target group. Defaults + to /ping if not specified. + pattern: ^/.* + type: string + routingAlgorithm: + default: least_outstanding_requests + description: Routing algorithm for the ALB target group (least_oustanding_requests + or round_robin) + enum: + - least_outstanding_requests + - round_robin + type: string + type: object + maxDeployTimeInSeconds: + default: 3600 + description: Maximum allowed time in seconds for the deployment to + complete before timing out. Defaults to 1 hour (3600 seconds) + format: int32 + type: integer + metrics: + description: Configuration for metrics collection and exposure + properties: + enabled: + default: true + description: Enable metrics collection for this model deployment + type: boolean + metricsScrapeIntervalSeconds: + default: 15 + description: Scrape interval in seconds for metrics collection + from sidecar and model container. + format: int32 + maximum: 300 + minimum: 5 + type: integer + modelMetrics: + description: Configuration for model container metrics scraping + properties: + path: + default: /metrics + description: Path where the model exposes metrics + pattern: ^/[a-zA-Z0-9\-_.\/]*$ + type: string + port: + default: 8080 + description: Port where the model exposes metrics. If not + specified, a default port will be used. + format: int32 + maximum: 65535 + minimum: 1024 + type: integer + type: object + type: object + model: + properties: + acceptEula: + default: false + description: For models that require a Model Access Config, specify + True or False to indicate whether model terms of use have been + accepted. + type: boolean + additionalConfigs: + items: + properties: + name: + type: string + value: + type: string + required: + - name + - value + type: object + maxItems: 10 + type: array + gatedModelDownloadRole: + description: The Amazon Resource Name (ARN) of an IAM role that + will be used to download gated model + maxLength: 2048 + minLength: 20 + pattern: ^arn:aws[a-z\-]*:iam::\d{12}:role/?[a-zA-Z_0-9+=,.@\-_/]+$ + type: string + modelHubName: + default: SageMakerPublicHub + description: The name of the model hub content. Can be an ARN + or a simple name. 
+ maxLength: 63 + pattern: ^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}$ + type: string + modelId: + description: The unique identifier of the model within the specified + hub (hubContentArn). + maxLength: 63 + pattern: ^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}$ + type: string + modelVersion: + description: The version of the model to deploy, in semantic versioning + format (e.g., 1.0.0). + maxLength: 14 + minLength: 5 + pattern: ^\d{1,4}.\d{1,4}.\d{1,4}$ + type: string + required: + - acceptEula + - modelId + type: object + replicas: + default: 1 + description: The desired number of inference server replicas. Default + 1. + format: int32 + type: integer + sageMakerEndpoint: + properties: + name: + default: "" + description: Name of sagemaker endpoint. Defaults to empty string + which represents that Sagemaker endpoint will not be created. + maxLength: 63 + pattern: ^$|^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}$ + type: string + type: object + server: + properties: + executionRole: + description: The Amazon Resource Name (ARN) of an IAM role that + will be used to deploy and manage the inference server + maxLength: 2048 + minLength: 20 + pattern: ^arn:aws[a-z\-]*:iam::\d{12}:role/?[a-zA-Z_0-9+=,.@\-_/]+$ + type: string + instanceType: + description: The EC2 instance type to use for the inference server. + Must be one of the supported types. + pattern: ^ml\..* + type: string + required: + - instanceType + type: object + tlsConfig: + properties: + tlsCertificateOutputS3Uri: + pattern: ^s3://([^/]+)/?(.*)$ + type: string + type: object + required: + - model + - server + type: object + status: + description: ModelDeploymentStatus defines the observed state of ModelDeployment + properties: + conditions: + description: Detailed conditions representing the state of the deployment + items: + description: Condition contains details for one aspect of the current + state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. 
+ maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + deploymentStatus: + description: Details of the native kubernetes deployment that hosts + the model + properties: + deploymentObjectOverallState: + description: Overall State of the Deployment Object + type: string + lastUpdated: + description: Last Update Time + format: date-time + type: string + message: + description: Message populated in the root CRD while updating + the status of underlying Deployment + type: string + name: + description: Name of the Deployment Object + type: string + reason: + description: Reason populated in the root CRD while updating the + status of underlying Deployment + type: string + status: + description: Status of the Deployment Object + properties: + availableReplicas: + description: Total number of available pods (ready for at + least minReadySeconds) targeted by this deployment. + format: int32 + type: integer + collisionCount: + description: |- + Count of hash collisions for the Deployment. The Deployment controller uses this + field as a collision avoidance mechanism when it needs to create the name for the + newest ReplicaSet. + format: int32 + type: integer + conditions: + description: Represents the latest available observations + of a deployment's current state. + items: + description: DeploymentCondition describes the state of + a deployment at a certain point. + properties: + lastTransitionTime: + description: Last time the condition transitioned from + one status to another. + format: date-time + type: string + lastUpdateTime: + description: The last time this condition was updated. + format: date-time + type: string + message: + description: A human readable message indicating details + about the transition. + type: string + reason: + description: The reason for the condition's last transition. + type: string + status: + description: Status of the condition, one of True, False, + Unknown. + type: string + type: + description: Type of deployment condition. + type: string + required: + - status + - type + type: object + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map + observedGeneration: + description: The generation observed by the deployment controller. + format: int64 + type: integer + readyReplicas: + description: readyReplicas is the number of pods targeted + by this Deployment with a Ready Condition. + format: int32 + type: integer + replicas: + description: Total number of non-terminated pods targeted + by this deployment (their labels match the selector). + format: int32 + type: integer + unavailableReplicas: + description: |- + Total number of unavailable pods targeted by this deployment. This is the total number of + pods that are still required for the deployment to have 100% available capacity. They may + either be pods that are running but not yet available or pods that still have not been created. + format: int32 + type: integer + updatedReplicas: + description: Total number of non-terminated pods targeted + by this deployment that have the desired template spec. 
+ format: int32 + type: integer + type: object + required: + - lastUpdated + - name + type: object + endpoints: + description: EndpointStatus contains the status of SageMaker endpoints + properties: + sagemaker: + description: Status of the SageMaker endpoint + properties: + configArn: + description: The Amazon Resource Name (ARN) of the endpoint + configuration. + pattern: (arn:aws[a-z\-]*:sagemaker:[a-z0-9\-]*:[0-9]{12}:endpoint-config/.*|^$) + type: string + endpointArn: + description: The Amazon Resource Name (ARN) of the SageMaker + endpoint + pattern: (arn:aws[a-z\-]*:sagemaker:[a-z0-9\-]*:[0-9]{12}:endpoint/.*|^$) + type: string + modelArn: + description: The ARN of the model created in SageMaker. + pattern: (arn:aws[a-z\-]*:sagemaker:[a-z0-9\-]*:[0-9]{12}:model/.*|^$) + type: string + state: + description: The current state of the SageMaker endpoint + type: string + required: + - state + type: object + type: object + metricsStatus: + description: Status of metrics collection + properties: + enabled: + description: Whether metrics collection is enabled + type: boolean + errorMessage: + description: Error message if metrics collection is in error state + type: string + metricsScrapeIntervalSeconds: + description: Scrape interval in seconds for metrics collection + from sidecar and model container. + format: int32 + type: integer + modelMetrics: + description: Status of model container metrics collection + properties: + path: + description: The path where metrics are available + type: string + port: + description: The port on which metrics are exposed + format: int32 + type: integer + type: object + state: + description: Current state of metrics collection + type: string + required: + - enabled + type: object + observedGeneration: + description: Latest generation reconciled by controller + format: int64 + type: integer + replicas: + description: The observed number of inference server replicas. + format: int32 + type: integer + selector: + description: LabelSelector for the deployment. 
+ type: string + state: + description: Current phase of the model deployment + enum: + - DeploymentPending + - DeploymentInProgress + - DeploymentFailed + - DeploymentComplete + - DeletionPending + - DeletionInProgress + - DeletionFailed + - DeletionComplete + type: string + tlsCertificate: + description: CertificateStatus represents the status of TLS certificates + properties: + certificateARN: + description: The Amazon Resource Name (ARN) of the ACM certificate + pattern: arn:aws[a-z\-]*:acm:[a-z0-9\-]*:[0-9]{12}:certificate/.* + type: string + certificateDomainNames: + description: The certificate domain names that is attached to + the certificate + items: + type: string + type: array + certificateName: + description: The certificate name of cert manager + type: string + importedCertificates: + description: Used for tracking the imported certificates to ACM + items: + type: string + type: array + issuerName: + description: The issuer name of cert manager + type: string + lastCertExpiryTime: + description: The last certificate expiry time + format: date-time + type: string + tlsCertificateOutputS3Bucket: + description: S3 bucket that stores the certificate that needs + to be trusted + type: string + tlsCertificateS3Keys: + description: The output tls certificate S3 key that points to + the .pem file + items: + type: string + type: array + type: object + type: object + type: object + served: true + storage: true + subresources: + scale: + labelSelectorPath: .status.selector + specReplicasPath: .spec.replicas + statusReplicasPath: .status.replicas + status: {} - name: v1alpha1 schema: openAPIV3Schema: @@ -219,6 +1003,24 @@ spec: type: object maxItems: 100 type: array + loadBalancer: + description: Configuration for Application Load Balancer + properties: + healthCheckPath: + default: /ping + description: Health check path for the ALB target group. Defaults + to /ping if not specified. + pattern: ^/.* + type: string + routingAlgorithm: + default: least_outstanding_requests + description: Routing algorithm for the ALB target group (least_oustanding_requests + or round_robin) + enum: + - least_outstanding_requests + - round_robin + type: string + type: object maxDeployTimeInSeconds: default: 3600 description: Maximum allowed time in seconds for the deployment to @@ -320,8 +1122,9 @@ spec: properties: name: default: "" - description: Name of a SageMaker endpoint to be created for this JumpStartModel. - The default value of empty string, when used, will skip endpoint creation. + description: |- + Name of a SageMaker endpoint to be created for this JumpStartModel. 
+ The default value of empty string, when used, will skip endpoint creatio maxLength: 63 pattern: ^$|^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}$ type: string @@ -642,7 +1445,7 @@ spec: type: object type: object served: true - storage: true + storage: false subresources: scale: labelSelectorPath: .status.selector diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/crd/inference.sagemaker.aws.amazon.com_sagemakerendpointregistrations.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/crd/inference.sagemaker.aws.amazon.com_sagemakerendpointregistrations.yaml index 0ff9aca8..80f1c56a 100644 --- a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/crd/inference.sagemaker.aws.amazon.com_sagemakerendpointregistrations.yaml +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/crd/inference.sagemaker.aws.amazon.com_sagemakerendpointregistrations.yaml @@ -1,11 +1,21 @@ ---- apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: + cert-manager.io/inject-ca-from: '{{ .Values.shortPrefix }}-system/serving-cert' controller-gen.kubebuilder.io/version: v0.16.4 name: sagemakerendpointregistrations.inference.sagemaker.aws.amazon.com spec: + conversion: + strategy: Webhook + webhook: + clientConfig: + service: + name: '{{ .Values.namePrefix }}-conversion-webhook' + namespace: '{{ .Values.shortPrefix }}-system' + path: /convert + conversionReviewVersions: + - v1 group: inference.sagemaker.aws.amazon.com names: kind: SageMakerEndpointRegistration @@ -14,7 +24,7 @@ spec: singular: sagemakerendpointregistration scope: Namespaced versions: - - name: v1alpha1 + - name: v1 schema: openAPIV3Schema: description: SageMakerEndpointRegistration is the Schema for the sagemakerendpointregistrations @@ -88,6 +98,10 @@ spec: description: InstanceType is the ML compute instance type used for EndpointConfig creation type: string + invocationEndpoint: + default: invocations + description: The invocation endpoint path used by the model server + type: string loadBalancerHostName: description: Needed to embed the LB Host Name type: string @@ -248,3 +262,241 @@ spec: storage: true subresources: status: {} + - name: v1alpha1 + schema: + openAPIV3Schema: + description: SageMakerEndpointRegistration is the Schema for the sagemakerendpointregistrations + API + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: SageMakerEndpointRegistrationSpec defines the desired state + of SageMakerEndpointRegistration + properties: + eksClusterDetails: + properties: + arn: + description: Stores cluster ARN + type: string + clusterSecurityGroupId: + description: Stores ClusterSecurityGroup of the EKS Cluster + type: string + name: + description: Stores cluster name + type: string + securityGroupIds: + description: Stores AdditionalSecurityGroupIds of the EKS Cluster + items: + type: string + type: array + subnetIds: + description: Stores SubnetIDs of the EKS Cluster + items: + type: string + type: array + vpcId: + description: Stores VPC Id of the EKS Cluster + type: string + required: + - arn + - clusterSecurityGroupId + - name + - securityGroupIds + - subnetIds + - vpcId + type: object + executionRole: + description: The Amazon Resource Name (ARN) of an IAM role that will + be used to create model, endpoint config, and the endpoint + maxLength: 2048 + minLength: 20 + pattern: ^arn:aws[a-z\-]*:iam::\d{12}:role/?[a-zA-Z_0-9+=,.@\-_/]+$ + type: string + imageUri: + description: The ImageUri where inference code is stored + maxLength: 255 + type: string + instanceType: + description: InstanceType is the ML compute instance type used for + EndpointConfig creation + type: string + invocationEndpoint: + default: invocations + description: The invocation endpoint path used by the model server + type: string + loadBalancerHostName: + description: Needed to embed the LB Host Name + type: string + name: + description: Name used for AWS resource creation + maxLength: 63 + pattern: ^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62} + type: string + restApiId: + description: REST API Gateway identifier that proxies requests to + the HyperPod endpoint (via NLB/ALB) + type: string + tlsConfig: + properties: + tlsCertificateOutputS3Bucket: + description: S3 bucket that stores the certificate that needs + to be trusted + type: string + tlsCertificateS3Keys: + description: The output tls certificate S3 key that points to + the .pem file + items: + type: string + type: array + tlsServerNameOverride: + description: The server name override for tls certificate selection + type: string + required: + - tlsCertificateOutputS3Bucket + - tlsCertificateS3Keys + type: object + required: + - eksClusterDetails + - executionRole + - imageUri + - instanceType + - loadBalancerHostName + - name + - restApiId + - tlsConfig + type: object + status: + description: SageMakerEndpointRegistrationStatus defines the observed + state of SageMakerEndpointRegistration + properties: + conditions: + description: Detailed conditions representing the state of the deployment + items: + description: Condition contains details for one aspect of the current + state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. 
+ For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + endpoint: + description: Endpoint Metadata + properties: + arn: + description: The Amazon Resource Name (ARN) of the SageMaker endpoint + pattern: (arn:aws[a-z\-]*:sagemaker:[a-z0-9\-]*:[0-9]{12}:endpoint/.*|^$) + type: string + configArn: + description: The Amazon Resource Name (ARN) of the endpoint configuration. + pattern: (arn:aws[a-z\-]*:sagemaker:[a-z0-9\-]*:[0-9]{12}:endpoint-config/.*|^$) + type: string + lastModifiedTime: + description: The last modified time of SageMaker endpoint. + format: date-time + type: string + modelArn: + description: The ARN of the model created in SageMaker. 
+ pattern: (arn:aws[a-z\-]*:sagemaker:[a-z0-9\-]*:[0-9]{12}:model/.*|^$) + type: string + required: + - arn + - configArn + - modelArn + type: object + loadBalancer: + description: LoadBalancer Metadata + properties: + hostName: + description: Hostname of LoadBalancer + type: string + required: + - hostName + type: object + observedGeneration: + description: Latest generation reconciled by controller + format: int64 + type: integer + state: + description: Current phase of the Endpoint creation Step + enum: + - CreationInProgress + - CreationFailed + - CreationCompleted + - DeletionInProgress + - DeletionFailed + - DeletionCompleted + - UpdateInProgress + - UpdateFailed + - UpdateCompleted + type: string + required: + - state + type: object + type: object + served: true + storage: false + subresources: + status: {} diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/manager/manager.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/manager/manager.yaml index daf62016..9fe34cdb 100644 --- a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/manager/manager.yaml +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/manager/manager.yaml @@ -55,9 +55,14 @@ spec: - --metrics-bind-address=:8443 - --leader-elect - --health-probe-bind-address=:8081 + - --webhook-cert-path=/tmp/k8s-webhook-server/serving-certs image: "{{ .Values.image.repository }}/hyperpod-inference-operator:{{ .Values.image.tag }}" imagePullPolicy: {{ .Values.image.pullPolicy }} name: manager + ports: + - containerPort: 9443 + name: webhook-server + protocol: TCP securityContext: allowPrivilegeEscalation: false capabilities: @@ -73,8 +78,10 @@ spec: httpGet: path: /healthz port: 8081 + initialDelaySeconds: 180 failureThreshold: 120 periodSeconds: 60 + timeoutSeconds: 5 readinessProbe: httpGet: path: /readyz @@ -90,6 +97,10 @@ spec: requests: cpu: 10m memory: 64Mi + volumeMounts: + - mountPath: /tmp/k8s-webhook-server/serving-certs + name: webhook-certs + readOnly: true env: - name: HYPERPOD_CLUSTER_ARN value: {{ .Values.hyperpodClusterArn }} @@ -103,5 +114,15 @@ spec: value: {{ .Values.eksClusterName }} - name: TLS_CERTIFICATE_OUTPUT_S3URI value: {{ .Values.tlsCertificateS3Bucket }} + - name: ENABLE_WEBHOOKS + value: "{{ .Values.enableWebhooks }}" + - name: CHART_VERSION + value: {{ .Chart.Version | quote }} + - name: APP_VERSION + value: {{ .Chart.AppVersion | quote }} serviceAccountName: {{ .Values.namePrefix }}-controller-manager terminationGracePeriodSeconds: 10 + volumes: + - name: webhook-certs + secret: + secretName: webhook-server-cert \ No newline at end of file diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/network-policy/allow-webhook-traffic.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/network-policy/allow-webhook-traffic.yaml new file mode 100644 index 00000000..d0119130 --- /dev/null +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/network-policy/allow-webhook-traffic.yaml @@ -0,0 +1,26 @@ +# This NetworkPolicy allows ingress traffic to your webhook server running +# as part of the controller-manager from specific namespaces and pods. 
CR(s) which uses webhooks +# will only work when applied in namespaces labeled with 'webhook: enabled' +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + labels: + app.kubernetes.io/name: {{ .Values.namePrefix }} + name: allow-webhook-traffic + namespace: {{ .Values.shortPrefix }}-system +spec: + podSelector: + matchLabels: + control-plane: {{ .Values.namePrefix }}-controller-manager + app.kubernetes.io/name: {{ .Values.namePrefix }} + policyTypes: + - Ingress + ingress: + # This allows ingress traffic from any namespace with the label webhook: enabled + - from: + - namespaceSelector: + matchLabels: + webhook: enabled # Only from namespaces with this label + ports: + - port: 443 + protocol: TCP diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/rbac/inferenceendpointconfig_editor_role.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/rbac/inferenceendpointconfig_editor_role.yaml index 3e4c59f1..453d3503 100644 --- a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/rbac/inferenceendpointconfig_editor_role.yaml +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/rbac/inferenceendpointconfig_editor_role.yaml @@ -4,7 +4,6 @@ kind: ClusterRole metadata: labels: app.kubernetes.io/name: {{ .Values.namePrefix }} - app.kubernetes.io/managed-by: kustomize name: {{ .Values.namePrefix }}-inferenceendpointconfig-editor-role namespace: {{ .Values.shortPrefix }}-system rules: diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/rbac/jumpstartmodel_editor_role.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/rbac/jumpstartmodel_editor_role.yaml index 4d0736cc..7485ea5a 100644 --- a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/rbac/jumpstartmodel_editor_role.yaml +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/rbac/jumpstartmodel_editor_role.yaml @@ -4,7 +4,6 @@ kind: ClusterRole metadata: labels: app.kubernetes.io/name: {{ .Values.namePrefix }} - app.kubernetes.io/managed-by: kustomize name: {{ .Values.namePrefix }}-jumpstartmodels-editor-role namespace: {{ .Values.shortPrefix }}-system rules: diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/rbac/pod_reader_cluster_role.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/rbac/pod_reader_cluster_role.yaml new file mode 100644 index 00000000..97d3d0b0 --- /dev/null +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/rbac/pod_reader_cluster_role.yaml @@ -0,0 +1,11 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: pod-reader +rules: +- apiGroups: [""] + resources: ["pods"] + verbs: ["get", "watch", "list"] +- apiGroups: [""] + resources: ["services"] + verbs: ["get", "watch", "list"] diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/rbac/pod_reader_cluster_role_binding.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/rbac/pod_reader_cluster_role_binding.yaml new file mode 100644 index 00000000..0d862c6a --- /dev/null +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/rbac/pod_reader_cluster_role_binding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: pod-reader-binding +subjects: +- kind: ServiceAccount + name: pod-reader + namespace: hyperpod-inference-system +roleRef: + kind: ClusterRole + name: pod-reader + apiGroup: rbac.authorization.k8s.io diff --git 
a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/rbac/pod_reader_service_account.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/rbac/pod_reader_service_account.yaml new file mode 100644 index 00000000..c5717f4a --- /dev/null +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/rbac/pod_reader_service_account.yaml @@ -0,0 +1,5 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: pod-reader + namespace: hyperpod-inference-system diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/rbac/sagemakerendpointregistration_editor_role.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/rbac/sagemakerendpointregistration_editor_role.yaml index 7009a510..298ebcc2 100644 --- a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/rbac/sagemakerendpointregistration_editor_role.yaml +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/rbac/sagemakerendpointregistration_editor_role.yaml @@ -3,7 +3,7 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: labels: - app.kubernetes.io/name: AWSCrescendoInferenceOperator + app.kubernetes.io/name: {{ .Values.namePrefix }} name: {{ .Values.namePrefix }}-sagemakerendpointregistration-editor-role namespace: {{ .Values.shortPrefix }}-system rules: diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/rbac/sagemakerendpointregistration_viewer_role.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/rbac/sagemakerendpointregistration_viewer_role.yaml index 795ae656..77472ab3 100644 --- a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/rbac/sagemakerendpointregistration_viewer_role.yaml +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/rbac/sagemakerendpointregistration_viewer_role.yaml @@ -3,7 +3,7 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: labels: - app.kubernetes.io/name: AWSCrescendoInferenceOperator + app.kubernetes.io/name: {{ .Values.namePrefix }} name: {{ .Values.namePrefix }}-sagemakerendpointregistration-viewer-role namespace: {{ .Values.shortPrefix }}-system rules: diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/huggingface-llm-phi-3-mini-4k-instruct.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/huggingface-llm-phi-3-mini-4k-instruct.yaml deleted file mode 100644 index 6ad65b64..00000000 --- a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/huggingface-llm-phi-3-mini-4k-instruct.yaml +++ /dev/null @@ -1,22 +0,0 @@ -apiVersion: inference.sagemaker.aws.amazon.com/v1alpha1 -kind: ModelDeployment -metadata: - labels: - app.kubernetes.io/name: huggingface-llm-phi-3-mini-4k-instruct-app - name: modeldeployment-sample -spec: - sageMakerEndpoint: - name: sample-sagemaker-endpoint-phi-3 - model: - modelHubName: SageMakerPublicHub - modelId: huggingface-llm-phi-3-mini-4k-instruct - modelVersion: 1.2.2 - server: - instanceType: ml.g5.8xlarge - maxAutoScaleReplicas: 2 - minAutoScaleReplicas: 1 - namespace: inference-namespace - environmentVariables: - - name: SAMPLE_ENV_VAR - value: "sample_value" - maxDeployTimeInSeconds: 1800 diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/huggingface-text2text-flan-t5-xl.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/huggingface-text2text-flan-t5-xl.yaml deleted file mode 100644 index 11736305..00000000 --- 
a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/huggingface-text2text-flan-t5-xl.yaml +++ /dev/null @@ -1,23 +0,0 @@ -apiVersion: inference.sagemaker.aws.amazon.com/v1alpha1 -kind: ModelDeployment -metadata: - labels: - app.kubernetes.io/name: huggingface-text2text-flan-t5-xl-app - name: modeldeployment-sample -spec: - sageMakerEndpoint: - name: sample-sagemaker-endpoint-t2t-flan - model: - acceptEula: true - modelHubName: SageMakerPublicHub - modelId: huggingface-text2text-flan-t5-xl - modelVersion: 1.0.0 - server: - instanceType: ml.g5.8xlarge - maxAutoScaleReplicas: 2 - minAutoScaleReplicas: 1 - namespace: inference-namespace - environmentVariables: - - name: SAMPLE_ENV_VAR - value: "sample_value" - maxDeployTimeInSeconds: 1800 diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1alpha1_deepseek_model15b.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1_deepseek_model15b.yaml similarity index 90% rename from helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1alpha1_deepseek_model15b.yaml rename to helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1_deepseek_model15b.yaml index 9214dc5a..5a065ed7 100644 --- a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1alpha1_deepseek_model15b.yaml +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1_deepseek_model15b.yaml @@ -1,4 +1,4 @@ -apiVersion: inference.sagemaker.aws.amazon.com/v1alpha1 +apiVersion: inference.sagemaker.aws.amazon.com/v1 kind: JumpStartModel metadata: labels: diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1_llama31_8b_intelligent_routing.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1_llama31_8b_intelligent_routing.yaml new file mode 100644 index 00000000..eb423d22 --- /dev/null +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1_llama31_8b_intelligent_routing.yaml @@ -0,0 +1,45 @@ +apiVersion: inference.sagemaker.aws.amazon.com/v1 +kind: InferenceEndpointConfig +metadata: + name: llama-8b-intel-routing + namespace: ns-team-a +spec: + endpointName: llama-8b-sme + modelName: Llama-3.1-8B-Instruct + instanceType: ml.g5.8xlarge + invocationEndpoint: v1/chat/completions + intelligentRoutingSpec: + enabled: true + routingStrategy: prefixaware + modelSourceConfig: + modelSourceType: s3 + s3Storage: + bucketName: + region: us-west-2 + modelLocation: llama31_8b + prefetchEnabled: false + kvCacheSpec: + enableL1Cache: true + tlsConfig: # optional field, default value from operator deployment used if tlsConfig is omitted + tlsCertificateOutputS3Uri: # e.g.: s3://tls-certs-bucket/certs + worker: + resources: + limits: + nvidia.com/gpu: "1" + requests: + cpu: "6" + memory: 30Gi + nvidia.com/gpu: "1" + image: lmcache/vllm-openai:v0.3.7 + args: + - "/opt/ml/model" + - "--max-model-len" + - "4096" + modelInvocationPort: + containerPort: 8000 + name: http + modelVolumeMount: + name: model-weights + mountPath: /opt/ml/model + loadBalancer: + healthCheckPath: /health \ No newline at end of file diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1_llama31_8b_kv_cache_l1_l2.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1_llama31_8b_kv_cache_l1_l2.yaml new file mode 100644 index 00000000..0d7e7c06 --- /dev/null +++ 
b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1_llama31_8b_kv_cache_l1_l2.yaml @@ -0,0 +1,44 @@ +apiVersion: inference.sagemaker.aws.amazon.com/v1 +kind: InferenceEndpointConfig +metadata: + name: llama-8b-kv-cache-l1-l2 + namespace: ns-team-a +spec: + endpointName: llama-8b-sme + modelName: Llama-3.1-8B-Instruct + instanceType: ml.g5.8xlarge + invocationEndpoint: v1/chat/completions + modelSourceConfig: + modelSourceType: s3 + s3Storage: + bucketName: + region: us-west-2 + modelLocation: llama31_8b + prefetchEnabled: false + kvCacheSpec: + enableL1Cache: true + enableL2Cache: true + l2CacheSpec: + l2CacheBackend: redis + l2CacheLocalUrl: # e.g.: redis://redis.ns-team-a.svc.cluster.local:6379 + tlsConfig: # optional field, default value from operator deployment used if tlsConfig is omitted + tlsCertificateOutputS3Uri: # e.g.: s3://tls-certs-bucket/certs + worker: + resources: + limits: + nvidia.com/gpu: "1" + requests: + cpu: "6" + memory: 30Gi + nvidia.com/gpu: "1" + image: lmcache/vllm-openai:v0.3.7 + args: + - "/opt/ml/model" + - "--max-model-len" + - "4096" + modelInvocationPort: + containerPort: 8000 + name: http + modelVolumeMount: + name: model-weights + mountPath: /opt/ml/model \ No newline at end of file diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1alpha1_modeldeployment.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1_modeldeployment.yaml similarity index 91% rename from helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1alpha1_modeldeployment.yaml rename to helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1_modeldeployment.yaml index 85b10844..89376139 100644 --- a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1alpha1_modeldeployment.yaml +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1_modeldeployment.yaml @@ -1,4 +1,4 @@ -apiVersion: inference.sagemaker.aws.amazon.com/v1alpha1 +apiVersion: inference.sagemaker.aws.amazon.com/v1 kind: ModelDeployment metadata: labels: diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1alpha1_s3_deepseek15b.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1_s3_deepseek15b.yaml similarity index 96% rename from helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1alpha1_s3_deepseek15b.yaml rename to helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1_s3_deepseek15b.yaml index 5857cabd..e6e9c16c 100644 --- a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1alpha1_s3_deepseek15b.yaml +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1_s3_deepseek15b.yaml @@ -1,4 +1,4 @@ -apiVersion: inference.sagemaker.aws.amazon.com/v1alpha1 +apiVersion: inference.sagemaker.aws.amazon.com/v1 kind: InferenceEndpointConfig metadata: name: deepseeks3 diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1alpha1_s3_modeldeployment.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1_s3_modeldeployment.yaml similarity index 96% rename from helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1alpha1_s3_modeldeployment.yaml rename to helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1_s3_modeldeployment.yaml index 87f81ecb..deb7cbb4 100644 --- 
a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1alpha1_s3_modeldeployment.yaml +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1_s3_modeldeployment.yaml @@ -1,4 +1,4 @@ -apiVersion: inference.sagemaker.aws.amazon.com/v1alpha1 +apiVersion: inference.sagemaker.aws.amazon.com/v1 kind: InferenceEndpointConfig metadata: name: testing-custom-deployment-inf diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1alpha1_deepseek_model7b.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1alpha1_deepseek_model7b.yaml deleted file mode 100644 index 7420f1f1..00000000 --- a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1alpha1_deepseek_model7b.yaml +++ /dev/null @@ -1,22 +0,0 @@ -apiVersion: inference.sagemaker.aws.amazon.com/v1alpha1 -kind: ModelDeployment -metadata: - labels: - app.kubernetes.io/name: deepseek-llm-r1-distill-qwen-7b-app - name: deepseek-sample -spec: - sageMakerEndpoint: - name: deepsek7bsme - model: - modelHubName: SageMakerPublicHub - modelId: deepseek-llm-r1-distill-qwen-7b - modelVersion: 2.0.4 - server: - instanceType: ml.g5.48xlarge - maxAutoScaleReplicas: 2 - minAutoScaleReplicas: 1 - namespace: inference-namespace - environmentVariables: - - name: SAMPLE_ENV_VAR - value: "sample_value" - maxDeployTimeInSeconds: 1800 diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1alpha1_mistral_model.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1alpha1_mistral_model.yaml deleted file mode 100644 index c9208fcb..00000000 --- a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/samples/v1alpha1_mistral_model.yaml +++ /dev/null @@ -1,24 +0,0 @@ -apiVersion: inference.sagemaker.aws.amazon.com/v1alpha1 -kind: ModelDeployment -metadata: - labels: - app.kubernetes.io/name: huggingface-llm-mistral-7b-instruct-app - kueue.x-k8s.io/priority-class: real-time-inference - name: sample-mistral - namespace: ns-team-a -spec: - sageMakerEndpoint: - name: sample-sagemaker-endpoint - model: - acceptEula: true - modelHubName: SageMakerPublicHub - modelId: huggingface-llm-mistral-7b-instruct - server: - instanceType: ml.g5.8xlarge - maxAutoScaleReplicas: 2 - minAutoScaleReplicas: 1 - namespace: inference-namespace - environmentVariables: - - name: SAMPLE_ENV_VAR - value: "sample_value" - maxDeployTimeInSeconds: 1800 \ No newline at end of file diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/webhook/certificate-webhook.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/webhook/certificate-webhook.yaml new file mode 100644 index 00000000..52696fef --- /dev/null +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/webhook/certificate-webhook.yaml @@ -0,0 +1,17 @@ +# The following manifests contain a self-signed issuer CR and a certificate CR. 
+# More documentation can be found at https://docs.cert-manager.io +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + labels: + app.kubernetes.io/name: {{ .Values.namePrefix }} + name: serving-cert + namespace: {{ .Values.shortPrefix }}-system +spec: + dnsNames: + - {{ .Values.namePrefix }}-conversion-webhook.{{ .Values.shortPrefix }}-system.svc + - {{ .Values.namePrefix }}-conversion-webhook.{{ .Values.shortPrefix }}-system.svc.cluster.local + issuerRef: + kind: Issuer + name: selfsigned-issuer + secretName: webhook-server-cert diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/webhook/issuer.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/webhook/issuer.yaml new file mode 100644 index 00000000..c5d47eb9 --- /dev/null +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/webhook/issuer.yaml @@ -0,0 +1,12 @@ +# The following manifest contains a self-signed issuer CR. +# More information can be found at https://docs.cert-manager.io +# WARNING: Targets CertManager v1.0. Check https://cert-manager.io/docs/installation/upgrading/ for breaking changes. +apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + labels: + app.kubernetes.io/name: {{ .Values.namePrefix }} + name: selfsigned-issuer + namespace: {{ .Values.shortPrefix }}-system +spec: + selfSigned: {} diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/config/webhook/service.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/webhook/service.yaml new file mode 100644 index 00000000..05f2ba95 --- /dev/null +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/config/webhook/service.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 # dummy change +kind: Service +metadata: + labels: + app.kubernetes.io/name: {{ .Values.namePrefix }} + name: {{ .Values.namePrefix }}-conversion-webhook + namespace: {{ .Values.shortPrefix }}-system +spec: + ports: + - port: 443 + protocol: TCP + targetPort: 9443 + selector: + control-plane: {{ .Values.namePrefix }}-controller-manager \ No newline at end of file diff --git a/helm_chart/HyperPodHelmChart/charts/inference-operator/values.yaml b/helm_chart/HyperPodHelmChart/charts/inference-operator/values.yaml index d9a1a374..868b7765 100644 --- a/helm_chart/HyperPodHelmChart/charts/inference-operator/values.yaml +++ b/helm_chart/HyperPodHelmChart/charts/inference-operator/values.yaml @@ -21,7 +21,7 @@ image: ap-southeast-4: 311141544681.dkr.ecr.ap-southeast-4.amazonaws.com ap-southeast-3: 158128612970.dkr.ecr.ap-southeast-3.amazonaws.com eu-south-2: 025050981094.dkr.ecr.eu-south-2.amazonaws.com - tag: v1.0.0 + tag: v2.0 pullPolicy: Always repository: hyperpodClusterArn: @@ -29,6 +29,7 @@ executionRoleArn: jumpstartGatedModelDownloadRoleArn: "" stage: "prod" tlsCertificateS3Bucket: +enableWebhooks: true s3: enabled: true @@ -62,6 +63,7 @@ fsx: alb: enabled: true + enableServiceMutatorWebhook: false clusterName: "" region: "" vpcId: "" @@ -99,3 +101,6 @@ components: metricsService: enabled: true path: "config/metrics" + webhook: + enabled: true + path: "config/webhook" diff --git a/hyperpod-custom-inference-template/hyperpod_custom_inference_template/__init__.py b/hyperpod-custom-inference-template/hyperpod_custom_inference_template/__init__.py index 68054b98..65490521 100644 --- a/hyperpod-custom-inference-template/hyperpod_custom_inference_template/__init__.py +++ b/hyperpod-custom-inference-template/hyperpod_custom_inference_template/__init__.py @@ -9,4 +9,4 @@ # or in the "license" file accompanying this 
file. This file is # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific -# language governing permissions and limitations under the License. \ No newline at end of file +# language governing permissions and limitations under the License. diff --git a/hyperpod-custom-inference-template/hyperpod_custom_inference_template/registry.py b/hyperpod-custom-inference-template/hyperpod_custom_inference_template/registry.py index 1da3df96..5fbb3832 100644 --- a/hyperpod-custom-inference-template/hyperpod_custom_inference_template/registry.py +++ b/hyperpod-custom-inference-template/hyperpod_custom_inference_template/registry.py @@ -10,13 +10,18 @@ # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific # language governing permissions and limitations under the License. -from hyperpod_custom_inference_template.v1_0 import model as v1 -from hyperpod_custom_inference_template.v1_0.template import TEMPLATE_CONTENT as v1_template +from hyperpod_custom_inference_template.v1_0 import model as v1_0 +from hyperpod_custom_inference_template.v1_1 import model as v1_1 +from hyperpod_custom_inference_template.v1_0.template import ( + TEMPLATE_CONTENT as v1_0_template, +) +from hyperpod_custom_inference_template.v1_1.template import ( + TEMPLATE_CONTENT as v1_1_template, +) SCHEMA_REGISTRY = { - "1.0": v1.FlatHPEndpoint, + "1.0": v1_0.FlatHPEndpoint, + "1.1": v1_1.FlatHPEndpoint, } -TEMPLATE_REGISTRY = { - "1.0": v1_template -} +TEMPLATE_REGISTRY = {"1.0": v1_0_template, "1.1": v1_1_template} diff --git a/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_0/__init__.py b/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_0/__init__.py index 68054b98..65490521 100644 --- a/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_0/__init__.py +++ b/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_0/__init__.py @@ -9,4 +9,4 @@ # or in the "license" file accompanying this file. This file is # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific -# language governing permissions and limitations under the License. \ No newline at end of file +# language governing permissions and limitations under the License. 
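Note: the following is an illustrative usage sketch and is not part of this diff. It shows how the version registries updated above might be consumed by calling code; the flat field names and sample values are assumptions drawn from the FlatHPEndpoint schema defined in this package, not a documented entry point.

# Minimal sketch: pick a schema version, validate a flat config, and convert it
# into the nested HPEndpoint domain object. Registry and class names come from
# this package; the concrete values below are illustrative only.
from hyperpod_custom_inference_template.registry import SCHEMA_REGISTRY, TEMPLATE_REGISTRY

def build_endpoint(version: str, flat_config: dict):
    """Validate a flat config against the requested schema version and convert it."""
    schema_cls = SCHEMA_REGISTRY[version]   # e.g. "1.0" or "1.1"
    flat = schema_cls(**flat_config)        # pydantic validation (extra="forbid")
    return flat.to_domain()                 # nested HPEndpoint object

# Example flat config for an S3-backed model; s3_bucket_name and s3_region are
# required by the model validator whenever model_source_type is "s3", and either
# metadata_name or endpoint_name must be set.
example = {
    "endpoint_name": "my-endpoint",
    "instance_type": "ml.g5.8xlarge",
    "model_name": "my-model",
    "model_source_type": "s3",
    "s3_bucket_name": "my-model-bucket",
    "s3_region": "us-west-2",
    "image_uri": "lmcache/vllm-openai:v0.3.7",
    "container_port": 8000,
    "model_volume_mount_name": "model-weights",
}

hp_endpoint = build_endpoint("1.1", example)
template_text = TEMPLATE_REGISTRY["1.1"]  # template string registered for the same version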
diff --git a/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_0/model.py b/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_0/model.py index 2e0e544e..1ec8b5c3 100644 --- a/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_0/model.py +++ b/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_0/model.py @@ -27,7 +27,7 @@ Worker, Dimensions, AutoScalingSpec, - CloudWatchTrigger + CloudWatchTrigger, ) from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint from sagemaker.hyperpod.common.config.metadata import Metadata @@ -37,12 +37,10 @@ class FlatHPEndpoint(BaseModel): model_config = ConfigDict(extra="forbid") namespace: Optional[str] = Field( - default=None, - description="Kubernetes namespace", - min_length=1 + default=None, description="Kubernetes namespace", min_length=1 ) - metadata_name: Optional[str] = Field( + metadata_name: Optional[str] = Field( None, alias="metadata_name", description="Name of the custom endpoint object", @@ -75,14 +73,15 @@ class FlatHPEndpoint(BaseModel): # metrics.* metrics_enabled: Optional[bool] = Field( - False, alias="metrics_enabled", + False, + alias="metrics_enabled", description="Enable metrics collection", ) # model_name and version model_name: str = Field( - ..., - alias="model_name", + ..., + alias="model_name", description="Name of model to create on SageMaker", min_length=1, max_length=63, @@ -100,15 +99,18 @@ class FlatHPEndpoint(BaseModel): # model_source_config.* model_source_type: Literal["fsx", "s3"] = Field( - ..., alias="model_source_type", + ..., + alias="model_source_type", description="Source type: fsx or s3", ) model_location: Optional[str] = Field( - None, alias="model_location", + None, + alias="model_location", description="Specific model data location", ) prefetch_enabled: Optional[bool] = Field( - False, alias="prefetch_enabled", + False, + alias="prefetch_enabled", description="Whether to pre-fetch model data", ) @@ -122,11 +124,12 @@ class FlatHPEndpoint(BaseModel): # worker.* image_uri: str = Field( - ..., alias="image_uri", + ..., + alias="image_uri", description="Inference server image name", ) container_port: int = Field( - ..., + ..., alias="container_port", description="Port on which the model server listens", ge=1, @@ -138,7 +141,8 @@ class FlatHPEndpoint(BaseModel): description="Path inside container for model volume", ) model_volume_mount_name: str = Field( - ..., alias="model_volume_mount_name", + ..., + alias="model_volume_mount_name", description="Name of the model volume mount", ) @@ -149,7 +153,7 @@ class FlatHPEndpoint(BaseModel): description="FSX File System DNS Name", ) fsx_file_system_id: Optional[str] = Field( - None, + None, alias="fsx_file_system_id", description="FSX File System ID", ) @@ -161,23 +165,23 @@ class FlatHPEndpoint(BaseModel): # S3Storage s3_bucket_name: Optional[str] = Field( - None, + None, alias="s3_bucket_name", description="S3 bucket location", ) s3_region: Optional[str] = Field( - None, + None, alias="s3_region", description="S3 bucket region", ) # Resources - resources_limits: Optional[Dict[str, Union[int,str]]] = Field( + resources_limits: Optional[Dict[str, Union[int, str]]] = Field( None, alias="resources_limits", description="Resource limits for the worker", ) - resources_requests: Optional[Dict[str, Union[int,str]]] = Field( + resources_requests: Optional[Dict[str, Union[int, str]]] = Field( None, alias="resources_requests", description="Resource requests for the worker", @@ -187,28 
+191,25 @@ class FlatHPEndpoint(BaseModel): dimensions: Optional[Dict[str, str]] = Field( None, alias="dimensions", - description="CloudWatch Metric dimensions as key–value pairs" + description="CloudWatch Metric dimensions as key–value pairs", ) # CloudWatch Trigger metric_collection_period: Optional[int] = Field( - 300, - description="Defines the Period for CloudWatch query" + 300, description="Defines the Period for CloudWatch query" ) metric_collection_start_time: Optional[int] = Field( - 300, - description="Defines the StartTime for CloudWatch query" + 300, description="Defines the StartTime for CloudWatch query" ) metric_name: Optional[str] = Field( - None, - description="Metric name to query for CloudWatch trigger" + None, description="Metric name to query for CloudWatch trigger" ) metric_stat: Optional[str] = Field( "Average", description=( "Statistics metric to be used by Trigger. " "Defines the Stat for the CloudWatch query. Default is Average." - ) + ), ) metric_type: Optional[Literal["Value", "Average"]] = Field( "Average", @@ -216,33 +217,30 @@ class FlatHPEndpoint(BaseModel): "The type of metric to be used by HPA. " "`Average` – Uses average value per pod; " "`Value` – Uses absolute metric value." - ) + ), ) min_value: Optional[float] = Field( 0, description=( "Minimum metric value used in case of empty response " "from CloudWatch. Default is 0." - ) + ), ) cloud_watch_trigger_name: Optional[str] = Field( - None, - description="Name for the CloudWatch trigger" + None, description="Name for the CloudWatch trigger" ) cloud_watch_trigger_namespace: Optional[str] = Field( - None, - description="AWS CloudWatch namespace for the metric" + None, description="AWS CloudWatch namespace for the metric" ) target_value: Optional[float] = Field( - None, - description="Target value for the CloudWatch metric" + None, description="Target value for the CloudWatch metric" ) use_cached_metrics: Optional[bool] = Field( True, description=( "Enable caching of metric values during polling interval. " "Default is true." - ) + ), ) invocation_endpoint: Optional[str] = Field( @@ -250,21 +248,25 @@ class FlatHPEndpoint(BaseModel): description=( "The invocation endpoint of the model server. http://:/ would be pre-populated based on the other fields. 
" "Please fill in the path after http://:/ specific to your model server.", - ) + ), ) - @model_validator(mode='after') + @model_validator(mode="after") def validate_model_source_config(self): """Validate that required fields are provided based on model_source_type""" if self.model_source_type == "s3": if not self.s3_bucket_name or not self.s3_region: - raise ValueError("s3_bucket_name and s3_region are required when model_source_type is 's3'") + raise ValueError( + "s3_bucket_name and s3_region are required when model_source_type is 's3'" + ) elif self.model_source_type == "fsx": if not self.fsx_file_system_id: - raise ValueError("fsx_file_system_id is required when model_source_type is 'fsx'") + raise ValueError( + "fsx_file_system_id is required when model_source_type is 'fsx'" + ) return self - @model_validator(mode='after') + @model_validator(mode="after") def validate_name(self): if not self.metadata_name and not self.endpoint_name: raise ValueError("Either metadata_name or endpoint_name must be provided") @@ -273,21 +275,20 @@ def validate_name(self): def to_domain(self) -> HPEndpoint: if self.endpoint_name and not self.metadata_name: self.metadata_name = self.endpoint_name - + metadata = Metadata(name=self.metadata_name, namespace=self.namespace) env_vars = None if self.env: env_vars = [ - EnvironmentVariables(name=k, value=v) - for k, v in self.env.items() + EnvironmentVariables(name=k, value=v) for k, v in self.env.items() ] dim_vars: list[Dimensions] = [] if self.dimensions: for name, value in self.dimensions.items(): dim_vars.append(Dimensions(name=name, value=value)) - + cloud_watch_trigger = CloudWatchTrigger( dimensions=dim_vars, metric_collection_period=self.metric_collection_period, @@ -300,12 +301,10 @@ def to_domain(self) -> HPEndpoint: namespace=self.cloud_watch_trigger_namespace, target_value=self.target_value, use_cached_metrics=self.use_cached_metrics, - ) - - auto_scaling_spec = AutoScalingSpec( - cloud_watch_trigger = cloud_watch_trigger ) + auto_scaling_spec = AutoScalingSpec(cloud_watch_trigger=cloud_watch_trigger) + # nested metrics metrics = Metrics( enabled=self.metrics_enabled, @@ -336,7 +335,9 @@ def to_domain(self) -> HPEndpoint: fsx_storage=fsx, ) - tls = TlsConfig(tls_certificate_output_s3_uri=self.tls_certificate_output_s3_uri) + tls = TlsConfig( + tls_certificate_output_s3_uri=self.tls_certificate_output_s3_uri + ) invocation_port = ModelInvocationPort( container_port=self.container_port, @@ -368,4 +369,4 @@ def to_domain(self) -> HPEndpoint: worker=worker, invocation_endpoint=self.invocation_endpoint, auto_scaling_spec=auto_scaling_spec - ) \ No newline at end of file + ) diff --git a/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_0/template.py b/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_0/template.py index 63b06fb0..4c981770 100644 --- a/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_0/template.py +++ b/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_0/template.py @@ -85,4 +85,4 @@ invocationEndpoint: {{ invocation_endpoint }} -""" \ No newline at end of file +""" diff --git a/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_1/__init__.py b/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_1/__init__.py new file mode 100644 index 00000000..65490521 --- /dev/null +++ b/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_1/__init__.py @@ -0,0 +1,12 @@ +# Copyright Amazon.com, Inc. 
or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. diff --git a/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_1/model.py b/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_1/model.py new file mode 100644 index 00000000..bc586c42 --- /dev/null +++ b/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_1/model.py @@ -0,0 +1,442 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from pydantic import BaseModel, Field, model_validator, ConfigDict +from typing import Optional, List, Dict, Union, Literal + +from sagemaker.hyperpod.inference.config.hp_endpoint_config import ( + Metrics, + FsxStorage, + S3Storage, + ModelSourceConfig, + TlsConfig, + EnvironmentVariables, + ModelInvocationPort, + ModelVolumeMount, + Resources, + Worker, + Dimensions, + AutoScalingSpec, + CloudWatchTrigger, + IntelligentRoutingSpec, + KvCacheSpec, + L2CacheSpec, +) +from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint +from sagemaker.hyperpod.common.config.metadata import Metadata + + +class FlatHPEndpoint(BaseModel): + model_config = ConfigDict(extra="forbid") + + namespace: Optional[str] = Field( + default=None, description="Kubernetes namespace", min_length=1 + ) + + metadata_name: Optional[str] = Field( + None, + alias="metadata_name", + description="Name of the custom endpoint object", + max_length=63, + pattern=r"^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}$", + ) + + # endpoint_name + endpoint_name: Optional[str] = Field( + None, + alias="endpoint_name", + description="Name of SageMaker endpoint; empty string means no creation", + max_length=63, + pattern=r"^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}$", + ) + + # Environment variables map + env: Optional[Dict[str, str]] = Field( + None, + alias="env", + description="Map of environment variable names to their values", + ) + + instance_type: str = Field( + ..., + alias="instance_type", + description="EC2 instance type for the inference server", + pattern=r"^ml\..*", + ) + + # metrics.* + metrics_enabled: Optional[bool] = Field( + False, + alias="metrics_enabled", + description="Enable metrics collection", + ) + + # model_name and version + model_name: str = Field( + ..., + alias="model_name", + description="Name of model to create on SageMaker", + min_length=1, + max_length=63, + pattern=r"^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}$", + ) + + model_version: Optional[str] = Field( + None, + alias="model_version", + description="Version of the model for the endpoint", + min_length=5, + max_length=14, + 
pattern=r"^\d{1,4}\.\d{1,4}\.\d{1,4}$", + ) + + # model_source_config.* + model_source_type: Literal["fsx", "s3"] = Field( + ..., + alias="model_source_type", + description="Source type: fsx or s3", + ) + model_location: Optional[str] = Field( + None, + alias="model_location", + description="Specific model data location", + ) + prefetch_enabled: Optional[bool] = Field( + False, + alias="prefetch_enabled", + description="Whether to pre-fetch model data", + ) + + # tls_config + tls_certificate_output_s3_uri: Optional[str] = Field( + None, + alias="tls_certificate_output_s3_uri", + description="S3 URI for TLS certificate output", + pattern=r"^s3://([^/]+)/?(.*)$", + ) + + # worker.* + image_uri: str = Field( + ..., + alias="image_uri", + description="Inference server image name", + ) + container_port: int = Field( + ..., + alias="container_port", + description="Port on which the model server listens", + ge=1, + le=65535, + ) + model_volume_mount_path: Optional[str] = Field( + "/opt/ml/model", + alias="model_volume_mount_path", + description="Path inside container for model volume", + ) + model_volume_mount_name: str = Field( + ..., + alias="model_volume_mount_name", + description="Name of the model volume mount", + ) + + # FSXStorage + fsx_dns_name: Optional[str] = Field( + None, + alias="fsx_dns_name", + description="FSX File System DNS Name", + ) + fsx_file_system_id: Optional[str] = Field( + None, + alias="fsx_file_system_id", + description="FSX File System ID", + ) + fsx_mount_name: Optional[str] = Field( + None, + alias="fsx_mount_name", + description="FSX File System Mount Name", + ) + + # S3Storage + s3_bucket_name: Optional[str] = Field( + None, + alias="s3_bucket_name", + description="S3 bucket location", + ) + s3_region: Optional[str] = Field( + None, + alias="s3_region", + description="S3 bucket region", + ) + + # Resources + resources_limits: Optional[Dict[str, Union[int, str]]] = Field( + None, + alias="resources_limits", + description="Resource limits for the worker", + ) + resources_requests: Optional[Dict[str, Union[int, str]]] = Field( + None, + alias="resources_requests", + description="Resource requests for the worker", + ) + + # Dimensions + dimensions: Optional[Dict[str, str]] = Field( + None, + alias="dimensions", + description="CloudWatch Metric dimensions as key–value pairs", + ) + + # CloudWatch Trigger + metric_collection_period: Optional[int] = Field( + 300, description="Defines the Period for CloudWatch query" + ) + metric_collection_start_time: Optional[int] = Field( + 300, description="Defines the StartTime for CloudWatch query" + ) + metric_name: Optional[str] = Field( + None, description="Metric name to query for CloudWatch trigger" + ) + metric_stat: Optional[str] = Field( + "Average", + description=( + "Statistics metric to be used by Trigger. " + "Defines the Stat for the CloudWatch query. Default is Average." + ), + ) + metric_type: Optional[Literal["Value", "Average"]] = Field( + "Average", + description=( + "The type of metric to be used by HPA. " + "`Average` – Uses average value per pod; " + "`Value` – Uses absolute metric value." + ), + ) + min_value: Optional[float] = Field( + 0, + description=( + "Minimum metric value used in case of empty response " + "from CloudWatch. Default is 0." 
+        ),
+    )
+    cloud_watch_trigger_name: Optional[str] = Field(
+        None, description="Name for the CloudWatch trigger"
+    )
+    cloud_watch_trigger_namespace: Optional[str] = Field(
+        None, description="AWS CloudWatch namespace for the metric"
+    )
+    target_value: Optional[float] = Field(
+        None, description="Target value for the CloudWatch metric"
+    )
+    use_cached_metrics: Optional[bool] = Field(
+        True,
+        description=(
+            "Enable caching of metric values during polling interval. "
+            "Default is true."
+        ),
+    )
+
+    invocation_endpoint: Optional[str] = Field(
+        default="invocations",
+        description=(
+            "The invocation endpoint of the model server. http://:/ would be pre-populated based on the other fields. "
+            "Please fill in the path after http://:/ specific to your model server."
+        ),
+    )
+
+    # Intelligent Routing flattened fields
+    intelligent_routing_enabled: Optional[bool] = Field(
+        None,
+        alias="intelligent_routing_enabled",
+        description="Enable intelligent routing",
+    )
+    routing_strategy: Optional[
+        Literal["prefixaware", "kvaware", "session", "roundrobin"]
+    ] = Field(
+        None,
+        alias="routing_strategy",
+        description="Routing strategy for intelligent routing",
+    )
+
+    # KV Cache flattened fields
+    enable_l1_cache: Optional[bool] = Field(
+        None,
+        alias="enable_l1_cache",
+        description="Enable L1 cache (CPU offloading)",
+    )
+    enable_l2_cache: Optional[bool] = Field(
+        None,
+        alias="enable_l2_cache",
+        description="Enable L2 cache",
+    )
+    l2_cache_backend: Optional[str] = Field(
+        None,
+        alias="l2_cache_backend",
+        description="L2 cache backend type",
+    )
+    l2_cache_local_url: Optional[str] = Field(
+        None,
+        alias="l2_cache_local_url",
+        description="L2 cache URL to local storage",
+    )
+    cache_config_file: Optional[str] = Field(
+        None,
+        alias="cache_config_file",
+        description="KV cache configuration file path",
+    )
+
+    @model_validator(mode="after")
+    def validate_model_source_config(self):
+        """Validate that required fields are provided based on model_source_type"""
+        if self.model_source_type == "s3":
+            if not self.s3_bucket_name or not self.s3_region:
+                raise ValueError(
+                    "s3_bucket_name and s3_region are required when model_source_type is 's3'"
+                )
+        elif self.model_source_type == "fsx":
+            if not self.fsx_file_system_id:
+                raise ValueError(
+                    "fsx_file_system_id is required when model_source_type is 'fsx'"
+                )
+        return self
+
+    @model_validator(mode="after")
+    def validate_name(self):
+        if not self.metadata_name and not self.endpoint_name:
+            raise ValueError("Either metadata_name or endpoint_name must be provided")
+        return self
+
+    def to_domain(self) -> HPEndpoint:
+        if self.endpoint_name and not self.metadata_name:
+            self.metadata_name = self.endpoint_name
+
+        metadata = Metadata(name=self.metadata_name, namespace=self.namespace)
+
+        env_vars = None
+        if self.env:
+            env_vars = [
+                EnvironmentVariables(name=k, value=v) for k, v in self.env.items()
+            ]
+
+        dim_vars: list[Dimensions] = []
+        if self.dimensions:
+            for name, value in self.dimensions.items():
+                dim_vars.append(Dimensions(name=name, value=value))
+
+        cloud_watch_trigger = CloudWatchTrigger(
+            dimensions=dim_vars,
+            metric_collection_period=self.metric_collection_period,
+            metric_collection_start_time=self.metric_collection_start_time,
+            metric_name=self.metric_name,
+            metric_stat=self.metric_stat,
+            metric_type=self.metric_type,
+            min_value=self.min_value,
+            name=self.cloud_watch_trigger_name,
+            namespace=self.cloud_watch_trigger_namespace,
+            target_value=self.target_value,
use_cached_metrics=self.use_cached_metrics, + ) + + auto_scaling_spec = AutoScalingSpec(cloud_watch_trigger=cloud_watch_trigger) + + # nested metrics + metrics = Metrics( + enabled=self.metrics_enabled, + ) + + # Validate storage choice and build nested storage config + if self.model_source_type == "s3": + s3 = S3Storage( + bucket_name=self.s3_bucket_name, + region=self.s3_region, + ) + fsx = None + elif self.model_source_type == "fsx": + fsx = FsxStorage( + dns_name=self.fsx_dns_name, + file_system_id=self.fsx_file_system_id, + mount_name=self.fsx_mount_name, + ) + s3 = None + else: + raise ValueError(f"Unsupported model_source_type: {self.model_source_type}") + + source = ModelSourceConfig( + model_location=self.model_location, + model_source_type=self.model_source_type, + prefetch_enabled=self.prefetch_enabled, + s3_storage=s3, + fsx_storage=fsx, + ) + + tls = TlsConfig( + tls_certificate_output_s3_uri=self.tls_certificate_output_s3_uri + ) + + invocation_port = ModelInvocationPort( + container_port=self.container_port, + ) + volume_mount = ModelVolumeMount( + mount_path=self.model_volume_mount_path, + name=self.model_volume_mount_name, + ) + resources = Resources( + limits=self.resources_limits, + requests=self.resources_requests, + ) + worker = Worker( + environment_variables=env_vars, + image=self.image_uri, + model_invocation_port=invocation_port, + model_volume_mount=volume_mount, + resources=resources, + ) + # Build intelligent routing spec from flattened fields + intelligent_routing_spec = None + if self.intelligent_routing_enabled is not None: + intelligent_routing_spec = IntelligentRoutingSpec( + enabled=self.intelligent_routing_enabled, + routing_strategy=self.routing_strategy, + ) + + # Build KV cache spec from flattened fields + kv_cache_spec = None + if any([self.enable_l1_cache, self.enable_l2_cache, self.cache_config_file]): + l2_cache_spec = None + if self.l2_cache_backend or self.l2_cache_local_url: + l2_cache_spec = L2CacheSpec( + l2_cache_backend=self.l2_cache_backend, + l2_cache_local_url=self.l2_cache_local_url, + ) + + kv_cache_spec = KvCacheSpec( + enable_l1_cache=self.enable_l1_cache, + enable_l2_cache=self.enable_l2_cache, + l2_cache_spec=l2_cache_spec, + cache_config_file=self.cache_config_file, + ) + + return HPEndpoint( + metadata=metadata, + endpoint_name=self.endpoint_name, + instance_type=self.instance_type, + metrics=metrics, + model_name=self.model_name, + model_source_config=source, + model_version=self.model_version, + tls_config=tls, + worker=worker, + invocation_endpoint=self.invocation_endpoint, + auto_scaling_spec=auto_scaling_spec, + intelligent_routing_spec=intelligent_routing_spec, + kv_cache_spec=kv_cache_spec, + ) diff --git a/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_1/schema.json b/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_1/schema.json new file mode 100644 index 00000000..89af6406 --- /dev/null +++ b/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_1/schema.json @@ -0,0 +1,568 @@ +{ + "additionalProperties": false, + "properties": { + "namespace": { + "anyOf": [ + { + "minLength": 1, + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Kubernetes namespace", + "title": "Namespace" + }, + "metadata_name": { + "anyOf": [ + { + "maxLength": 63, + "pattern": "^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}$", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Name of the custom endpoint object", + 
"title": "Metadata Name" + }, + "endpoint_name": { + "anyOf": [ + { + "maxLength": 63, + "pattern": "^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}$", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Name of SageMaker endpoint; empty string means no creation", + "title": "Endpoint Name" + }, + "env": { + "anyOf": [ + { + "additionalProperties": { + "type": "string" + }, + "type": "object" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Map of environment variable names to their values", + "title": "Env" + }, + "instance_type": { + "description": "EC2 instance type for the inference server", + "pattern": "^ml\\..*", + "title": "Instance Type", + "type": "string" + }, + "metrics_enabled": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "default": false, + "description": "Enable metrics collection", + "title": "Metrics Enabled" + }, + "model_name": { + "description": "Name of model to create on SageMaker", + "maxLength": 63, + "minLength": 1, + "pattern": "^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}$", + "title": "Model Name", + "type": "string" + }, + "model_version": { + "anyOf": [ + { + "maxLength": 14, + "minLength": 5, + "pattern": "^\\d{1,4}\\.\\d{1,4}\\.\\d{1,4}$", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Version of the model for the endpoint", + "title": "Model Version" + }, + "model_source_type": { + "description": "Source type: fsx or s3", + "enum": [ + "fsx", + "s3" + ], + "title": "Model Source Type", + "type": "string" + }, + "model_location": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Specific model data location", + "title": "Model Location" + }, + "prefetch_enabled": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "default": false, + "description": "Whether to pre-fetch model data", + "title": "Prefetch Enabled" + }, + "tls_certificate_output_s3_uri": { + "anyOf": [ + { + "pattern": "^s3://([^/]+)/?(.*)$", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "S3 URI for TLS certificate output", + "title": "Tls Certificate Output S3 Uri" + }, + "image_uri": { + "description": "Inference server image name", + "title": "Image Uri", + "type": "string" + }, + "container_port": { + "description": "Port on which the model server listens", + "maximum": 65535, + "minimum": 1, + "title": "Container Port", + "type": "integer" + }, + "model_volume_mount_path": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": "/opt/ml/model", + "description": "Path inside container for model volume", + "title": "Model Volume Mount Path" + }, + "model_volume_mount_name": { + "description": "Name of the model volume mount", + "title": "Model Volume Mount Name", + "type": "string" + }, + "fsx_dns_name": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "FSX File System DNS Name", + "title": "Fsx Dns Name" + }, + "fsx_file_system_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "FSX File System ID", + "title": "Fsx File System Id" + }, + "fsx_mount_name": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "FSX File System Mount Name", + "title": "Fsx Mount Name" + }, + "s3_bucket_name": { + "anyOf": [ + { + "type": "string" + }, + { + "type": 
"null" + } + ], + "default": null, + "description": "S3 bucket location", + "title": "S3 Bucket Name" + }, + "s3_region": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "S3 bucket region", + "title": "S3 Region" + }, + "resources_limits": { + "anyOf": [ + { + "additionalProperties": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "string" + } + ] + }, + "type": "object" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Resource limits for the worker", + "title": "Resources Limits" + }, + "resources_requests": { + "anyOf": [ + { + "additionalProperties": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "string" + } + ] + }, + "type": "object" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Resource requests for the worker", + "title": "Resources Requests" + }, + "dimensions": { + "anyOf": [ + { + "additionalProperties": { + "type": "string" + }, + "type": "object" + }, + { + "type": "null" + } + ], + "default": null, + "description": "CloudWatch Metric dimensions as key\u2013value pairs", + "title": "Dimensions" + }, + "metric_collection_period": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": 300, + "description": "Defines the Period for CloudWatch query", + "title": "Metric Collection Period" + }, + "metric_collection_start_time": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": 300, + "description": "Defines the StartTime for CloudWatch query", + "title": "Metric Collection Start Time" + }, + "metric_name": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Metric name to query for CloudWatch trigger", + "title": "Metric Name" + }, + "metric_stat": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": "Average", + "description": "Statistics metric to be used by Trigger. Defines the Stat for the CloudWatch query. Default is Average.", + "title": "Metric Stat" + }, + "metric_type": { + "anyOf": [ + { + "enum": [ + "Value", + "Average" + ], + "type": "string" + }, + { + "type": "null" + } + ], + "default": "Average", + "description": "The type of metric to be used by HPA. `Average` \u2013 Uses average value per pod; `Value` \u2013 Uses absolute metric value.", + "title": "Metric Type" + }, + "min_value": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "default": 0, + "description": "Minimum metric value used in case of empty response from CloudWatch. Default is 0.", + "title": "Min Value" + }, + "cloud_watch_trigger_name": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Name for the CloudWatch trigger", + "title": "Cloud Watch Trigger Name" + }, + "cloud_watch_trigger_namespace": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "AWS CloudWatch namespace for the metric", + "title": "Cloud Watch Trigger Namespace" + }, + "target_value": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Target value for the CloudWatch metric", + "title": "Target Value" + }, + "use_cached_metrics": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "default": true, + "description": "Enable caching of metric values during polling interval. 
Default is true.", + "title": "Use Cached Metrics" + }, + "invocation_endpoint": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": "invocations", + "description": "The invocation endpoint of the model server. http://:/ would be pre-populated based on the other fields. Please fill in the path after http://:/ specific to your model server.", + "title": "Invocation Endpoint" + }, + "intelligent_routing_enabled": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Enable intelligent routing", + "title": "Intelligent Routing Enabled" + }, + "routing_strategy": { + "anyOf": [ + { + "enum": [ + "prefixaware", + "kvaware", + "session", + "roundrobin" + ], + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Routing strategy for intelligent routing", + "title": "Routing Strategy" + }, + "enable_l1_cache": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Enable L1 cache (CPU offloading)", + "title": "Enable L1 Cache" + }, + "enable_l2_cache": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Enable L2 cache", + "title": "Enable L2 Cache" + }, + "l2_cache_backend": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "L2 cache backend type", + "title": "L2 Cache Backend" + }, + "l2_cache_local_url": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "L2 cache URL to local storage", + "title": "L2 Cache Local Url" + }, + "cache_config_file": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "KV cache configuration file path", + "title": "Cache Config File" + } + }, + "required": [ + "instance_type", + "model_name", + "model_source_type", + "image_uri", + "container_port", + "model_volume_mount_name" + ], + "title": "FlatHPEndpoint", + "type": "object" +} diff --git a/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_1/template.py b/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_1/template.py new file mode 100644 index 00000000..ef4be13d --- /dev/null +++ b/hyperpod-custom-inference-template/hyperpod_custom_inference_template/v1_1/template.py @@ -0,0 +1,100 @@ +TEMPLATE_CONTENT = """ +apiVersion: hyperpod.sagemaker.aws/v1 +kind: InferenceEndpointConfig +metadata: + name: {{ metadata_name or endpoint_name }} + namespace: {{ namespace }} +spec: + endpointName: {{ endpoint_name }} + instanceType: {{ instance_type }} + modelName: {{ model_name }} + modelVersion: {{ model_version or "" }} + + metrics: + enabled: {{ metrics_enabled or False }} + + modelSourceConfig: + modelSourceType: {{ model_source_type }} + modelLocation: {{ model_location or "" }} + prefetchEnabled: {{ prefetch_enabled or False }} +{%- if model_source_type == "s3" %} + s3Storage: + bucketName: {{ s3_bucket_name }} + region: {{ s3_region }} +{%- elif model_source_type == "fsx" %} + fsxStorage: + dnsName: {{ fsx_dns_name }} + fileSystemId: {{ fsx_file_system_id }} + mountName: {{ fsx_mount_name or "" }} +{%- endif %} + + tlsConfig: + tlsCertificateOutputS3Uri: {{ tls_certificate_output_s3_uri or "" }} + + worker: + environmentVariables: + {%- if env %} + {%- for key, val in env.items() %} + - name: {{ key }} + value: "{{ val }}" + {%- endfor %} + {%- else %} + [] + {%- 
endif %} + image: {{ image_uri }} + modelInvocationPort: + containerPort: {{ container_port }} + modelVolumeMount: + name: {{ model_volume_mount_name }} + mountPath: {{ model_volume_mount_path }} + resources: +{%- if resources_limits %} + limits: +{%- for key, val in resources_limits.items() %} + {{ key }}: {{ val }} +{%- endfor %} +{%- else %} + {} +{%- endif %} +{%- if resources_requests %} + requests: +{%- for key, val in resources_requests.items() %} + {{ key }}: {{ val }} +{%- endfor %} +{%- endif %} + + autoScalingSpec: + cloudWatchTrigger: +{%- if dimensions %} + dimensions: +{%- for dim_key, dim_val in dimensions.items() %} + - name: {{ dim_key }} + value: {{ dim_val }} +{%- endfor %} +{%- endif %} + metricCollectionPeriod: {{ metric_collection_period }} + metricCollectionStartTime: {{ metric_collection_start_time }} + metricName: {{ metric_name or "" }} + metricStat: {{ metric_stat }} + metricType: {{ metric_type }} + minValue: {{ min_value }} + name: {{ cloud_watch_trigger_name or "" }} + namespace: {{ cloud_watch_trigger_namespace or "" }} + targetValue: {{ target_value or "" }} + useCachedMetrics: {{ use_cached_metrics or False }} + + invocationEndpoint: "{{ invocation_endpoint }}" + +{% if intelligent_routing_enabled is not none %} intelligentRoutingSpec: + enabled: {{ intelligent_routing_enabled }} +{% if routing_strategy is not none %} routingStrategy: "{{ routing_strategy }}"{% endif %}{% endif %} +{% if enable_l1_cache is not none or enable_l2_cache is not none or cache_config_file is not none %} kvCacheSpec: +{% if enable_l1_cache is not none %} enableL1Cache: {{ enable_l1_cache }}{% endif %} +{% if enable_l2_cache is not none %} enableL2Cache: {{ enable_l2_cache }}{% endif %} +{% if l2_cache_backend is not none or l2_cache_local_url is not none %} l2CacheSpec: +{% if l2_cache_backend is not none %} l2CacheBackend: "{{ l2_cache_backend }}"{% endif %} +{% if l2_cache_local_url is not none %} l2CacheLocalUrl: "{{ l2_cache_local_url }}"{% endif %} +{% endif %} +{% if cache_config_file is not none %} cacheConfigFile: "{{ cache_config_file }}"{% endif %} +{% endif %} +""" \ No newline at end of file diff --git a/src/sagemaker/hyperpod/inference/config/constants.py b/src/sagemaker/hyperpod/inference/config/constants.py index 0f166b1e..60e4542b 100644 --- a/src/sagemaker/hyperpod/inference/config/constants.py +++ b/src/sagemaker/hyperpod/inference/config/constants.py @@ -2,9 +2,9 @@ DEFAULT_MAX_DEPLOY_TIME_IN_SECONDS = 3600 DEFAULT_MODEL_METRIC_PATH = "/metrics" DEFAULT_METRICS_SCRAPE_INTERBAL_SECONDS = 15 -INFERENCE_FULL_API_VERSION = "inference.sagemaker.aws.amazon.com/v1alpha1" +INFERENCE_FULL_API_VERSION = "inference.sagemaker.aws.amazon.com/v1" INFERENCE_GROUP = "inference.sagemaker.aws.amazon.com" -INFERENCE_API_VERSION = "v1alpha1" +INFERENCE_API_VERSION = "v1" JUMPSTART_MODEL_KIND = "JumpStartModel" JUMPSTART_MODEL_PLURAL = "jumpstartmodels" INFERENCE_ENDPOINT_CONFIG_KIND = "InferenceEndpointConfig" diff --git a/src/sagemaker/hyperpod/inference/config/hp_endpoint_config.py b/src/sagemaker/hyperpod/inference/config/hp_endpoint_config.py index 8baf23de..33471286 100644 --- a/src/sagemaker/hyperpod/inference/config/hp_endpoint_config.py +++ b/src/sagemaker/hyperpod/inference/config/hp_endpoint_config.py @@ -70,6 +70,65 @@ class CloudWatchTrigger(BaseModel): ) +class CloudWatchTriggerList(BaseModel): + model_config = ConfigDict(extra="forbid") + + activationTargetValue: Optional[float] = Field( + default=0, + alias="activation_target_value", + description="Activation Value 
for CloudWatch metric to scale from 0 to 1. Only applicable if minReplicaCount = 0", + ) + dimensions: Optional[List[Dimensions]] = Field( + default=None, description="Dimensions for Cloudwatch metrics" + ) + metricCollectionPeriod: Optional[int] = Field( + default=300, + alias="metric_collection_period", + description="Defines the Period for CloudWatch query", + ) + metricCollectionStartTime: Optional[int] = Field( + default=300, + alias="metric_collection_start_time", + description="Defines the StartTime for CloudWatch query", + ) + metricName: Optional[str] = Field( + default=None, + alias="metric_name", + description="Metric name to query for Cloudwatch trigger", + ) + metricStat: Optional[str] = Field( + default="Average", + alias="metric_stat", + description="Statistics metric to be used by Trigger. Used to define Stat for CloudWatch query. Default is Average.", + ) + metricType: Optional[Literal["Value", "Average"]] = Field( + default="Average", + alias="metric_type", + description="The type of metric to be used by HPA. Enum: AverageValue - Uses average value of metric per pod, Value - Uses absolute metric value", + ) + minValue: Optional[float] = Field( + default=0, + alias="min_value", + description="Minimum metric value used in case of empty response from CloudWatch. Default is 0.", + ) + name: Optional[str] = Field( + default=None, description="Name for the CloudWatch trigger" + ) + namespace: Optional[str] = Field( + default=None, description="AWS CloudWatch namespace for metric" + ) + targetValue: Optional[float] = Field( + default=None, + alias="target_value", + description="TargetValue for CloudWatch metric", + ) + useCachedMetrics: Optional[bool] = Field( + default=True, + alias="use_cached_metrics", + description="Enable caching of metric values during polling interval. Default is true", + ) + + class PrometheusTrigger(BaseModel): """Prometheus metric trigger to use for autoscaling""" @@ -116,6 +175,50 @@ class PrometheusTrigger(BaseModel): ) +class PrometheusTriggerList(BaseModel): + model_config = ConfigDict(extra="forbid") + + activationTargetValue: Optional[float] = Field( + default=0, + alias="activation_target_value", + description="Activation Value for Prometheus metric to scale from 0 to 1. Only applicable if minReplicaCount = 0", + ) + customHeaders: Optional[str] = Field( + default=None, + alias="custom_headers", + description="Custom headers to include while querying the prometheus endpoint.", + ) + metricType: Optional[Literal["Value", "Average"]] = Field( + default="Average", + alias="metric_type", + description="The type of metric to be used by HPA. Enum: AverageValue - Uses average value of metric per pod, Value - Uses absolute metric value", + ) + name: Optional[str] = Field( + default=None, description="Name for the Prometheus trigger" + ) + namespace: Optional[str] = Field( + default=None, description="Namespace for namespaced queries" + ) + query: Optional[str] = Field( + default=None, description="PromQLQuery for the metric." + ) + serverAddress: Optional[str] = Field( + default=None, + alias="server_address", + description="Server address for AMP workspace", + ) + targetValue: Optional[float] = Field( + default=None, + alias="target_value", + description="Target metric value for scaling", + ) + useCachedMetrics: Optional[bool] = Field( + default=True, + alias="use_cached_metrics", + description="Enable caching of metric values during polling interval. 
Default is true", + ) + + class AutoScalingSpec(BaseModel): model_config = ConfigDict(extra="forbid") @@ -124,6 +227,11 @@ class AutoScalingSpec(BaseModel): alias="cloud_watch_trigger", description="CloudWatch metric trigger to use for autoscaling", ) + cloudWatchTriggerList: Optional[List[CloudWatchTriggerList]] = Field( + default=None, + alias="cloud_watch_trigger_list", + description="Multiple CloudWatch metric triggers to use for autoscaling. Takes priority over CloudWatchTrigger if both are provided.", + ) cooldownPeriod: Optional[int] = Field( default=300, alias="cooldown_period", @@ -154,6 +262,11 @@ class AutoScalingSpec(BaseModel): alias="prometheus_trigger", description="Prometheus metric trigger to use for autoscaling", ) + prometheusTriggerList: Optional[List[PrometheusTriggerList]] = Field( + default=None, + alias="prometheus_trigger_list", + description="Multiple Prometheus metric triggers to use for autoscaling. Takes priority over PrometheusTrigger if both are provided.", + ) scaleDownStabilizationTime: Optional[int] = Field( default=300, alias="scale_down_stabilization_time", @@ -166,6 +279,79 @@ class AutoScalingSpec(BaseModel): ) +class IntelligentRoutingSpec(BaseModel): + """Configuration for intelligent routing This feature is currently not supported for existing deployments. Adding this configuration to an existing deployment will be rejected.""" + + model_config = ConfigDict(extra="forbid") + + autoScalingSpec: Optional[AutoScalingSpec] = Field( + default=None, alias="auto_scaling_spec" + ) + enabled: Optional[bool] = Field( + default=False, description="Once set, the enabled field cannot be modified" + ) + routingStrategy: Optional[ + Literal["prefixaware", "kvaware", "session", "roundrobin"] + ] = Field(default="prefixaware", alias="routing_strategy") + + +class L2CacheSpec(BaseModel): + """Configuration for providing L2 Cache offloading""" + + model_config = ConfigDict(extra="forbid") + + l2CacheBackend: Optional[str] = Field( + default=None, + alias="l2_cache_backend", + description="L2 cache backend type. Required when L2CacheSpec is provided.", + ) + l2CacheLocalUrl: Optional[str] = Field( + default=None, + alias="l2_cache_local_url", + description="Provide the L2 cache URL to local storage", + ) + + +class KvCacheSpec(BaseModel): + """Configuration for KV Cache specification By default L1CacheOffloading will be enabled""" + + model_config = ConfigDict(extra="forbid") + + cacheConfigFile: Optional[str] = Field( + default=None, + alias="cache_config_file", + description="KVCache configuration file path. If specified, override other configurations provided via spec", + ) + enableL1Cache: Optional[bool] = Field( + default=True, alias="enable_l1_cache", description="Enable CPU offloading" + ) + enableL2Cache: Optional[bool] = Field(default=False, alias="enable_l2_cache") + l2CacheSpec: Optional[L2CacheSpec] = Field( + default=None, + alias="l2_cache_spec", + description="Configuration for providing L2 Cache offloading", + ) + + +class LoadBalancer(BaseModel): + """Configuration for Application Load Balancer""" + + model_config = ConfigDict(extra="forbid") + + healthCheckPath: Optional[str] = Field( + default="/ping", + alias="health_check_path", + description="Health check path for the ALB target group. 
Defaults to /ping if not specified.", + ) + routingAlgorithm: Optional[Literal["least_outstanding_requests", "round_robin"]] = ( + Field( + default="least_outstanding_requests", + alias="routing_algorithm", + description="Routing algorithm for the ALB target group (least_oustanding_requests or round_robin)", + ) + ) + + class ModelMetrics(BaseModel): """Configuration for model container metrics scraping""" @@ -433,6 +619,13 @@ class Worker(BaseModel): model_config = ConfigDict(extra="forbid") + args: Optional[List[str]] = Field( + default=None, description="Defines the Arguments to the entrypoint." + ) + command: Optional[List[str]] = Field( + default=None, + description="Defines the Command which is Entrypoint array. Not executed within a shell.", + ) environmentVariables: Optional[List[EnvironmentVariables]] = Field( default=None, alias="environment_variables", @@ -450,6 +643,11 @@ class Worker(BaseModel): resources: Resources = Field( description="Defines the Resources in terms of CPU, GPU, Memory needed for the model to be deployed" ) + workingDir: Optional[str] = Field( + default=None, + alias="working_dir", + description="Defines the working directory of container.", + ) class _HPEndpoint(BaseModel): @@ -468,16 +666,31 @@ class _HPEndpoint(BaseModel): endpointName: Optional[str] = Field( default=None, alias="endpoint_name", - description="Name of a SageMaker endpoint to be created for this InferenceEndpointConfig. The default value of empty string, when used, will skip endpoint creation.", + description="Name used for Sagemaker Endpoint Name of sagemaker endpoint. Defaults to empty string which represents that Sagemaker endpoint will not be created.", ) instanceType: str = Field( alias="instance_type", description="Instance Type to deploy the model on" ) + intelligentRoutingSpec: Optional[IntelligentRoutingSpec] = Field( + default=None, + alias="intelligent_routing_spec", + description="Configuration for intelligent routing This feature is currently not supported for existing deployments. Adding this configuration to an existing deployment will be rejected.", + ) invocationEndpoint: Optional[str] = Field( default="invocations", alias="invocation_endpoint", description="The invocation endpoint of the model server. http://:/ would be pre-populated based on the other fields. 
Please fill in the path after http://:/ specific to your model server.", ) + kvCacheSpec: Optional[KvCacheSpec] = Field( + default=None, + alias="kv_cache_spec", + description="Configuration for KV Cache specification By default L1CacheOffloading will be enabled", + ) + loadBalancer: Optional[LoadBalancer] = Field( + default=None, + alias="load_balancer", + description="Configuration for Application Load Balancer", + ) metrics: Optional[Metrics] = Field( default=None, description="Configuration for metrics collection and exposure" ) @@ -641,7 +854,7 @@ class Endpoints(BaseModel): ) -class ModelMetrics(BaseModel): +class ModelMetricsStatus(BaseModel): """Status of model container metrics collection""" model_config = ConfigDict(extra="forbid") @@ -670,7 +883,7 @@ class MetricsStatus(BaseModel): alias="metrics_scrape_interval_seconds", description="Scrape interval in seconds for metrics collection from sidecar and model container.", ) - modelMetrics: Optional[ModelMetrics] = Field( + modelMetrics: Optional[ModelMetricsStatus] = Field( default=None, alias="model_metrics", description="Status of model container metrics collection", @@ -773,4 +986,4 @@ class InferenceEndpointConfigStatus(BaseModel): default=None, alias="tls_certificate", description="CertificateStatus represents the status of TLS certificates", - ) + ) \ No newline at end of file diff --git a/test/unit_tests/cli/test_inference.py b/test/unit_tests/cli/test_inference.py index 4f21b405..c9e3e695 100644 --- a/test/unit_tests/cli/test_inference.py +++ b/test/unit_tests/cli/test_inference.py @@ -9,31 +9,45 @@ # Import the non-create commands that don't need special handling from sagemaker.hyperpod.cli.commands.inference import ( - js_create, custom_create, custom_invoke, - js_list, custom_list, - js_describe, custom_describe, - js_delete, custom_delete, - js_list_pods, custom_list_pods, - js_get_logs, custom_get_logs, - js_get_operator_logs, custom_get_operator_logs + js_create, + custom_create, + custom_invoke, + js_list, + custom_list, + js_describe, + custom_describe, + js_delete, + custom_delete, + js_list_pods, + custom_list_pods, + js_get_logs, + custom_get_logs, + js_get_operator_logs, + custom_get_operator_logs, ) + # --------- JumpStart Commands --------- -@patch('sys.argv', ['pytest', '--version', '1.0']) +@patch("sys.argv", ["pytest", "--version", "1.0"]) def test_js_create_with_required_args(): """ Test js_create with all required options via CLI runner, mocking schema and endpoint. 
""" # Reload the inference module with mocked sys.argv - if 'sagemaker.hyperpod.cli.commands.inference' in sys.modules: - importlib.reload(sys.modules['sagemaker.hyperpod.cli.commands.inference']) + if "sagemaker.hyperpod.cli.commands.inference" in sys.modules: + importlib.reload(sys.modules["sagemaker.hyperpod.cli.commands.inference"]) from sagemaker.hyperpod.cli.commands.inference import js_create - with patch('sagemaker.hyperpod.cli.inference_utils.load_schema_for_version') as mock_load_schema, \ - patch('sagemaker.hyperpod.cli.commands.inference.HPJumpStartEndpoint') as mock_endpoint_class, \ - patch('sagemaker.hyperpod.common.cli_decorators._is_valid_jumpstart_model_id') as mock_model_validation, \ - patch('sagemaker.hyperpod.common.cli_decorators._namespace_exists') as mock_namespace_exists: + with patch( + "sagemaker.hyperpod.cli.inference_utils.load_schema_for_version" + ) as mock_load_schema, patch( + "sagemaker.hyperpod.cli.commands.inference.HPJumpStartEndpoint" + ) as mock_endpoint_class, patch( + "sagemaker.hyperpod.common.cli_decorators._is_valid_jumpstart_model_id" + ) as mock_model_validation, patch( + "sagemaker.hyperpod.common.cli_decorators._namespace_exists" + ) as mock_namespace_exists: # Mock enhanced error handling mock_model_validation.return_value = True # Allow test model-id @@ -43,9 +57,9 @@ def test_js_create_with_required_args(): mock_load_schema.return_value = { "properties": { "model_id": {"type": "string"}, - "instance_type": {"type": "string"} + "instance_type": {"type": "string"}, }, - "required": ["model_id", "instance_type"] + "required": ["model_id", "instance_type"], } # Prepare mock model-to-domain mapping mock_model_class = Mock() @@ -57,16 +71,24 @@ def test_js_create_with_required_args(): mock_endpoint_class.model_construct.return_value = domain_obj jreg.SCHEMA_REGISTRY.clear() - jreg.SCHEMA_REGISTRY['1.0'] = mock_model_class + jreg.SCHEMA_REGISTRY["1.0"] = mock_model_class runner = CliRunner() - result = runner.invoke(js_create, [ - '--namespace', 'test-ns', - '--version', '1.0', - '--model-id', 'test-model-id', - '--instance-type', 'ml.t2.micro', - '--endpoint-name', 'test-endpoint' - ]) + result = runner.invoke( + js_create, + [ + "--namespace", + "test-ns", + "--version", + "1.0", + "--model-id", + "test-model-id", + "--instance-type", + "ml.t2.micro", + "--endpoint-name", + "test-endpoint", + ], + ) assert result.exit_code == 0, result.output domain_obj.create.assert_called_once_with(debug=False) @@ -76,37 +98,37 @@ def test_js_create_missing_required_args(): runner = CliRunner() result = runner.invoke(js_create, []) assert result.exit_code != 0 - assert 'Missing option' in result.output + assert "Missing option" in result.output -@patch('sagemaker.hyperpod.common.cli_decorators._namespace_exists') -@patch('sagemaker.hyperpod.cli.commands.inference.HPJumpStartEndpoint') +@patch("sagemaker.hyperpod.common.cli_decorators._namespace_exists") +@patch("sagemaker.hyperpod.cli.commands.inference.HPJumpStartEndpoint") def test_js_list(mock_hp, mock_namespace_exists): mock_namespace_exists.return_value = True inst = Mock() inst.list.return_value = [Mock(metadata=Mock(model_dump=lambda: {"name": "e"}))] mock_hp.model_construct.return_value = inst runner = CliRunner() - result = runner.invoke(js_list, ['--namespace', 'ns']) + result = runner.invoke(js_list, ["--namespace", "ns"]) assert result.exit_code == 0 - inst.list.assert_called_once_with('ns') + inst.list.assert_called_once_with("ns") 
-@patch('sagemaker.hyperpod.common.cli_decorators._namespace_exists') -@patch('sagemaker.hyperpod.cli.commands.inference.HPJumpStartEndpoint') +@patch("sagemaker.hyperpod.common.cli_decorators._namespace_exists") +@patch("sagemaker.hyperpod.cli.commands.inference.HPJumpStartEndpoint") def test_js_describe(mock_hp, mock_namespace_exists): mock_namespace_exists.return_value = True inst = Mock() inst.get.return_value = Mock(model_dump=lambda: {"name": "e"}) mock_hp.model_construct.return_value = inst runner = CliRunner() - result = runner.invoke(js_describe, ['--name', 'n', '--namespace', 'ns']) + result = runner.invoke(js_describe, ["--name", "n", "--namespace", "ns"]) assert result.exit_code == 0 - inst.get.assert_called_once_with('n', 'ns') + inst.get.assert_called_once_with("n", "ns") -@patch('sagemaker.hyperpod.common.cli_decorators._namespace_exists') -@patch('sagemaker.hyperpod.cli.commands.inference.HPJumpStartEndpoint') +@patch("sagemaker.hyperpod.common.cli_decorators._namespace_exists") +@patch("sagemaker.hyperpod.cli.commands.inference.HPJumpStartEndpoint") def test_js_delete(mock_hp, mock_namespace_exists): mock_namespace_exists.return_value = True inst = Mock() @@ -115,38 +137,42 @@ def test_js_delete(mock_hp, mock_namespace_exists): inst.get.return_value = ep mock_hp.model_construct.return_value = inst runner = CliRunner() - result = runner.invoke(js_delete, ['--name', 'n', '--namespace', 'ns']) + result = runner.invoke(js_delete, ["--name", "n", "--namespace", "ns"]) assert result.exit_code == 0 ep.delete.assert_called_once() -@patch('sagemaker.hyperpod.cli.commands.inference.HPJumpStartEndpoint') +@patch("sagemaker.hyperpod.cli.commands.inference.HPJumpStartEndpoint") def test_js_get_operator_logs(mock_hp): inst = Mock(get_operator_logs=Mock(return_value="ol")) mock_hp.model_construct.return_value = inst runner = CliRunner() - result = runner.invoke(js_get_operator_logs, ['--since-hours', '2']) + result = runner.invoke(js_get_operator_logs, ["--since-hours", "2"]) assert result.exit_code == 0 - assert 'ol' in result.output + assert "ol" in result.output # --------- Custom Commands --------- -@patch('sys.argv', ['pytest', '--version', '1.0']) + +@patch("sys.argv", ["pytest", "--version", "1.0"]) def test_custom_create_with_required_args(): """ Test custom_create with all required options via CLI runner, mocking schema and endpoint. 
""" # Reload the inference module with mocked sys.argv - if 'sagemaker.hyperpod.cli.commands.inference' in sys.modules: - importlib.reload(sys.modules['sagemaker.hyperpod.cli.commands.inference']) + if "sagemaker.hyperpod.cli.commands.inference" in sys.modules: + importlib.reload(sys.modules["sagemaker.hyperpod.cli.commands.inference"]) from sagemaker.hyperpod.cli.commands.inference import custom_create - with patch('sagemaker.hyperpod.cli.inference_utils.load_schema_for_version') as mock_load_schema, \ - patch('sagemaker.hyperpod.cli.commands.inference.HPEndpoint') as mock_endpoint_class: + with patch( + "sagemaker.hyperpod.cli.inference_utils.load_schema_for_version" + ) as mock_load_schema, patch( + "sagemaker.hyperpod.cli.commands.inference.HPEndpoint" + ) as mock_endpoint_class: - # Mock schema loading to include storage flags + # Mock schema loading to include storage flags (v1.0 - no intelligent routing/KV cache) mock_load_schema.return_value = { "properties": { "instance_type": {"type": "string"}, @@ -156,13 +182,18 @@ def test_custom_create_with_required_args(): "s3_region": {"type": "string"}, "image_uri": {"type": "string"}, "container_port": {"type": "integer"}, - "model_volume_mount_name": {"type": "string"} + "model_volume_mount_name": {"type": "string"}, }, "required": [ - "instance_type", "model_name", "model_source_type", - "s3_bucket_name", "s3_region", - "image_uri", "container_port", "model_volume_mount_name" - ] + "instance_type", + "model_name", + "model_source_type", + "s3_bucket_name", + "s3_region", + "image_uri", + "container_port", + "model_volume_mount_name", + ], } # Prepare mock model class mock_model_class = Mock() @@ -175,21 +206,35 @@ def test_custom_create_with_required_args(): # Patch the registry mapping creg.SCHEMA_REGISTRY.clear() - creg.SCHEMA_REGISTRY['1.0'] = mock_model_class + creg.SCHEMA_REGISTRY["1.0"] = mock_model_class runner = CliRunner() - result = runner.invoke(custom_create, [ - '--namespace', 'test-ns', - '--version', '1.0', - '--instance-type', 'ml.t2.micro', - '--model-name', 'test-model', - '--model-source-type', 's3', - '--s3-bucket-name', 'test-bucket', - '--s3-region', 'us-west-2', - '--image-uri', 'test-image:latest', - '--container-port', '8080', - '--model-volume-mount-name', 'model-volume', - '--endpoint-name', 'test-endpoint' - ]) + result = runner.invoke( + custom_create, + [ + "--namespace", + "test-ns", + "--version", + "1.0", + "--instance-type", + "ml.t2.micro", + "--model-name", + "test-model", + "--model-source-type", + "s3", + "--s3-bucket-name", + "test-bucket", + "--s3-region", + "us-west-2", + "--image-uri", + "test-image:latest", + "--container-port", + "8080", + "--model-volume-mount-name", + "model-volume", + "--endpoint-name", + "test-endpoint", + ], + ) assert result.exit_code == 0, result.output domain_obj.create.assert_called_once_with(debug=False) @@ -199,11 +244,11 @@ def test_custom_create_missing_required_args(): runner = CliRunner() result = runner.invoke(custom_create, []) assert result.exit_code != 0 - assert 'Missing option' in result.output + assert "Missing option" in result.output -@patch('sagemaker.hyperpod.cli.commands.inference.Endpoint.get') -@patch('sagemaker.hyperpod.cli.commands.inference.boto3') +@patch("sagemaker.hyperpod.cli.commands.inference.Endpoint.get") +@patch("sagemaker.hyperpod.cli.commands.inference.boto3") def test_custom_invoke_success(mock_boto3, mock_endpoint_get): mock_endpoint = Mock() mock_endpoint.endpoint_status = "InService" @@ -211,54 +256,53 @@ def 
test_custom_invoke_success(mock_boto3, mock_endpoint_get): mock_body = Mock() mock_body.read.return_value.decode.return_value = '{"ok": true}' - mock_boto3.client.return_value.invoke_endpoint.return_value = {'Body': mock_body} + mock_boto3.client.return_value.invoke_endpoint.return_value = {"Body": mock_body} runner = CliRunner() - result = runner.invoke(custom_invoke, [ - '--endpoint-name', 'ep', - '--body', '{"x": 1}' - ]) + result = runner.invoke( + custom_invoke, ["--endpoint-name", "ep", "--body", '{"x": 1}'] + ) assert result.exit_code == 0, result.output assert '"ok": true' in result.output -@patch('sagemaker.hyperpod.cli.commands.inference.boto3') +@patch("sagemaker.hyperpod.cli.commands.inference.boto3") def test_custom_invoke_invalid_json(mock_boto3): runner = CliRunner() - result = runner.invoke(custom_invoke, ['--endpoint-name', 'ep', '--body', 'bad']) + result = runner.invoke(custom_invoke, ["--endpoint-name", "ep", "--body", "bad"]) assert result.exit_code != 0 - assert 'must be valid JSON' in result.output + assert "must be valid JSON" in result.output -@patch('sagemaker.hyperpod.common.cli_decorators._namespace_exists') -@patch('sagemaker.hyperpod.cli.commands.inference.HPEndpoint') +@patch("sagemaker.hyperpod.common.cli_decorators._namespace_exists") +@patch("sagemaker.hyperpod.cli.commands.inference.HPEndpoint") def test_custom_list(mock_hp, mock_namespace_exists): mock_namespace_exists.return_value = True inst = Mock() inst.list.return_value = [Mock(metadata=Mock(model_dump=lambda: {"name": "e"}))] mock_hp.model_construct.return_value = inst runner = CliRunner() - result = runner.invoke(custom_list, ['--namespace', 'ns']) + result = runner.invoke(custom_list, ["--namespace", "ns"]) assert result.exit_code == 0 - inst.list.assert_called_once_with('ns') + inst.list.assert_called_once_with("ns") -@patch('sagemaker.hyperpod.common.cli_decorators._namespace_exists') -@patch('sagemaker.hyperpod.cli.commands.inference.HPEndpoint') +@patch("sagemaker.hyperpod.common.cli_decorators._namespace_exists") +@patch("sagemaker.hyperpod.cli.commands.inference.HPEndpoint") def test_custom_describe(mock_hp, mock_namespace_exists): mock_namespace_exists.return_value = True inst = Mock() inst.get.return_value = Mock(model_dump=lambda: {"name": "e"}) mock_hp.model_construct.return_value = inst runner = CliRunner() - result = runner.invoke(custom_describe, ['--name', 'n', '--namespace', 'ns']) + result = runner.invoke(custom_describe, ["--name", "n", "--namespace", "ns"]) assert result.exit_code == 0 - inst.get.assert_called_once_with('n', 'ns') + inst.get.assert_called_once_with("n", "ns") -@patch('sagemaker.hyperpod.common.cli_decorators._namespace_exists') -@patch('sagemaker.hyperpod.cli.commands.inference.HPEndpoint') +@patch("sagemaker.hyperpod.common.cli_decorators._namespace_exists") +@patch("sagemaker.hyperpod.cli.commands.inference.HPEndpoint") def test_custom_delete(mock_hp, mock_namespace_exists): mock_namespace_exists.return_value = True inst = Mock() @@ -267,81 +311,190 @@ def test_custom_delete(mock_hp, mock_namespace_exists): inst.get.return_value = ep mock_hp.model_construct.return_value = inst runner = CliRunner() - result = runner.invoke(custom_delete, ['--name', 'n', '--namespace', 'ns']) + result = runner.invoke(custom_delete, ["--name", "n", "--namespace", "ns"]) assert result.exit_code == 0 ep.delete.assert_called_once() -@patch('sagemaker.hyperpod.cli.commands.inference.HPEndpoint') +@patch("sagemaker.hyperpod.cli.commands.inference.HPEndpoint") def 
test_custom_get_operator_logs(mock_hp): - inst = Mock(get_operator_logs=Mock(return_value='ol')) + inst = Mock(get_operator_logs=Mock(return_value="ol")) mock_hp.model_construct.return_value = inst runner = CliRunner() - result = runner.invoke(custom_get_operator_logs, ['--since-hours', '2']) + result = runner.invoke(custom_get_operator_logs, ["--since-hours", "2"]) assert result.exit_code == 0 - assert 'ol' in result.output + assert "ol" in result.output # --------- Default Namespace Tests --------- -@patch('sagemaker.hyperpod.cli.commands.inference.HPJumpStartEndpoint') + +@patch("sagemaker.hyperpod.cli.commands.inference.HPJumpStartEndpoint") def test_js_list_default_namespace(mock_hp): inst = Mock(list=Mock(return_value=[])) mock_hp.model_construct.return_value = inst runner = CliRunner() result = runner.invoke(js_list, []) assert result.exit_code == 0 - inst.list.assert_called_once_with('default') + inst.list.assert_called_once_with("default") -@patch('sagemaker.hyperpod.cli.commands.inference.HPEndpoint') + +@patch("sagemaker.hyperpod.cli.commands.inference.HPEndpoint") def test_custom_list_default_namespace(mock_hp): inst = Mock(list=Mock(return_value=[])) mock_hp.model_construct.return_value = inst runner = CliRunner() result = runner.invoke(custom_list, []) assert result.exit_code == 0 - inst.list.assert_called_once_with('default') + inst.list.assert_called_once_with("default") + -@patch('sagemaker.hyperpod.common.cli_decorators._namespace_exists') -@patch('sagemaker.hyperpod.cli.commands.inference.HPJumpStartEndpoint') +@patch("sagemaker.hyperpod.common.cli_decorators._namespace_exists") +@patch("sagemaker.hyperpod.cli.commands.inference.HPJumpStartEndpoint") def test_js_list_pods(mock_hp, mock_namespace_exists): mock_namespace_exists.return_value = True inst = Mock(list_pods=Mock(return_value="pods")) mock_hp.model_construct.return_value = inst runner = CliRunner() - result = runner.invoke(js_list_pods, ['--namespace', 'ns', '--endpoint-name', 'js-endpoint']) + result = runner.invoke( + js_list_pods, ["--namespace", "ns", "--endpoint-name", "js-endpoint"] + ) assert result.exit_code == 0 - assert 'pods' in result.output + assert "pods" in result.output -@patch('sagemaker.hyperpod.common.cli_decorators._namespace_exists') -@patch('sagemaker.hyperpod.cli.commands.inference.HPEndpoint') + +@patch("sagemaker.hyperpod.common.cli_decorators._namespace_exists") +@patch("sagemaker.hyperpod.cli.commands.inference.HPEndpoint") def test_custom_list_pods(mock_hp, mock_namespace_exists): mock_namespace_exists.return_value = True inst = Mock(list_pods=Mock(return_value="pods")) mock_hp.model_construct.return_value = inst runner = CliRunner() - result = runner.invoke(custom_list_pods, ['--namespace', 'ns', '--endpoint-name', 'custom-endpoint']) + result = runner.invoke( + custom_list_pods, ["--namespace", "ns", "--endpoint-name", "custom-endpoint"] + ) assert result.exit_code == 0 - assert 'pods' in result.output + assert "pods" in result.output + -@patch('sagemaker.hyperpod.common.cli_decorators._namespace_exists') -@patch('sagemaker.hyperpod.cli.commands.inference.HPJumpStartEndpoint') +@patch("sagemaker.hyperpod.common.cli_decorators._namespace_exists") +@patch("sagemaker.hyperpod.cli.commands.inference.HPJumpStartEndpoint") def test_js_get_logs(mock_hp, mock_namespace_exists): mock_namespace_exists.return_value = True inst = Mock(get_logs=Mock(return_value="logs")) mock_hp.model_construct.return_value = inst runner = CliRunner() - result = runner.invoke(js_get_logs, ['--pod-name', 'p', 
'--namespace', 'ns']) + result = runner.invoke(js_get_logs, ["--pod-name", "p", "--namespace", "ns"]) assert result.exit_code == 0 - assert 'logs' in result.output + assert "logs" in result.output -@patch('sagemaker.hyperpod.common.cli_decorators._namespace_exists') -@patch('sagemaker.hyperpod.cli.commands.inference.HPEndpoint') + +@patch("sagemaker.hyperpod.common.cli_decorators._namespace_exists") +@patch("sagemaker.hyperpod.cli.commands.inference.HPEndpoint") def test_custom_get_logs(mock_hp, mock_namespace_exists): mock_namespace_exists.return_value = True - inst = Mock(get_logs=Mock(return_value='l')) + inst = Mock(get_logs=Mock(return_value="l")) mock_hp.model_construct.return_value = inst runner = CliRunner() - result = runner.invoke(custom_get_logs, ['--pod-name', 'p', '--namespace', 'ns']) + result = runner.invoke(custom_get_logs, ["--pod-name", "p", "--namespace", "ns"]) assert result.exit_code == 0 - assert 'l' in result.output + assert "l" in result.output + + +@patch("sys.argv", ["pytest", "--version", "1.1"]) +def test_custom_create_with_intelligent_routing_and_kv_cache(): + """Test custom_create with intelligent routing and KV cache options.""" + + # Patch BEFORE reloading the module + with patch( + "sagemaker.hyperpod.cli.inference_utils.load_schema_for_version" + ) as mock_load_schema, patch( + "sagemaker.hyperpod.cli.commands.inference.HPEndpoint" + ) as mock_endpoint_class: + # Set up the schema mock first + mock_load_schema.return_value = { + "properties": { + "instance_type": {"type": "string"}, + "model_name": {"type": "string"}, + "model_source_type": {"type": "string", "enum": ["s3", "fsx"]}, + "s3_bucket_name": {"type": "string"}, + "s3_region": {"type": "string"}, + "image_uri": {"type": "string"}, + "container_port": {"type": "integer"}, + "model_volume_mount_name": {"type": "string"}, + "intelligent_routing_enabled": {"type": "boolean"}, + "routing_strategy": {"type": "string"}, + "enable_l1_cache": {"type": "boolean"}, + "enable_l2_cache": {"type": "boolean"}, + "l2_cache_backend": {"type": "string"}, + "l2_cache_local_url": {"type": "string"}, + }, + "required": [ + "instance_type", + "model_name", + "model_source_type", + "s3_bucket_name", + "s3_region", + "image_uri", + "container_port", + "model_volume_mount_name", + ], + } + + # Set up the registry mock + mock_model_class = Mock() + mock_model_instance = Mock() + domain_obj = Mock() + domain_obj.create = Mock() + mock_model_instance.to_domain.return_value = domain_obj + mock_model_class.return_value = mock_model_instance + mock_endpoint_class.model_construct.return_value = domain_obj + + with patch.object(creg, "SCHEMA_REGISTRY", new={"1.1": mock_model_class}): + # NOW reload the module with all patches in place + if "sagemaker.hyperpod.cli.commands.inference" in sys.modules: + importlib.reload( + sys.modules["sagemaker.hyperpod.cli.commands.inference"] + ) + + from sagemaker.hyperpod.cli.commands.inference import custom_create + + runner = CliRunner() + result = runner.invoke( + custom_create, + [ + "--version", + "1.1", + "--instance-type", + "ml.g5.xlarge", + "--model-name", + "test-model", + "--model-source-type", + "s3", + "--s3-bucket-name", + "test-bucket", + "--s3-region", + "us-west-2", + "--image-uri", + "test-image:latest", + "--container-port", + "8080", + "--model-volume-mount-name", + "model-volume", + "--intelligent-routing-enabled", + "true", + "--routing-strategy", + "prefixaware", + "--enable-l1-cache", + "true", + "--enable-l2-cache", + "true", + "--l2-cache-backend", + 
"redis/sagemaker", + "--l2-cache-local-url", + "redis://redis.redis-system.svc.cluster.local:6379", + ], + ) + + assert result.exit_code == 0, result.output + domain_obj.create.assert_called_once_with(debug=False) diff --git a/test/unit_tests/inference/test_hp_endpoint.py b/test/unit_tests/inference/test_hp_endpoint.py index 74bf6b7c..10a69a72 100644 --- a/test/unit_tests/inference/test_hp_endpoint.py +++ b/test/unit_tests/inference/test_hp_endpoint.py @@ -3,8 +3,15 @@ from sagemaker.hyperpod.inference.hp_endpoint import HPEndpoint from sagemaker.hyperpod.inference.config.hp_endpoint_config import ( CloudWatchTrigger, + CloudWatchTriggerList, + PrometheusTrigger, + PrometheusTriggerList, Dimensions, AutoScalingSpec, + IntelligentRoutingSpec, + KvCacheSpec, + L2CacheSpec, + LoadBalancer, Metrics, S3Storage, ModelSourceConfig, @@ -83,6 +90,30 @@ def setUp(self): # Create metrics metrics = Metrics(enabled=True) + # Create intelligent routing spec + intelligent_routing_spec = IntelligentRoutingSpec( + enabled=True, + routing_strategy="prefixaware", + auto_scaling_spec=auto_scaling_spec + ) + + # Create KV cache spec + l2_cache_spec = L2CacheSpec( + l2_cache_backend="redis", + l2_cache_local_url="redis://localhost:6379" + ) + kv_cache_spec = KvCacheSpec( + enable_l1_cache=True, + enable_l2_cache=True, + l2_cache_spec=l2_cache_spec + ) + + # Create load balancer + load_balancer = LoadBalancer( + health_check_path="/health", + routing_algorithm="least_outstanding_requests" + ) + self.endpoint = HPEndpoint( endpoint_name="s3-test-endpoint-name", instance_type="ml.g5.xlarge", @@ -91,6 +122,9 @@ def setUp(self): model_source_config=model_source_config, worker=worker, auto_scaling_spec=auto_scaling_spec, + intelligent_routing_spec=intelligent_routing_spec, + kv_cache_spec=kv_cache_spec, + load_balancer=load_balancer, metrics=metrics, )