diff --git a/charts/llm-d-modelservice/templates/epp-deployment.yaml b/charts/llm-d-modelservice/templates/epp-deployment.yaml
index f7c50114..acb09ae2 100644
--- a/charts/llm-d-modelservice/templates/epp-deployment.yaml
+++ b/charts/llm-d-modelservice/templates/epp-deployment.yaml
@@ -21,20 +21,22 @@ spec:
         imagePullPolicy: Always
         image: {{ required "routing.epp.image must be specified" .Values.routing.epp.image }}
         args:
-        - --poolName
+        - --pool-name
         - {{ include "llm-d-modelservice.inferencePoolName" . }}
-        - --poolNamespace
+        - --pool-namespace
         - {{ .Release.Namespace }}
-        - -v
-        - "{{ default 4 .Values.routing.epp.debugLevel }}"
+        - "--pool-group"
+        - "inference.networking.x-k8s.io"
+        - --v
+        - "{{ default 4 .Values.routing.epp.debugLevel }}"  # falls back to 4 when routing.epp.debugLevel is unset
         - --zap-encoder
-        - json
-        - --grpcPort
+        - "json"
+        - --grpc-port
         - "9002"
-        - --grpcHealthPort
+        - --grpc-health-port
         - "9003"
         {{- if .Values.routing.epp.pluginsConfigFile }}
-        - "-configFile"
+        - "--config-file"
         - "config/{{ .Values.routing.epp.pluginsConfigFile }}"
         {{- end}}
         {{- with .Values.routing.epp.env }}
diff --git a/charts/llm-d-modelservice/templates/epp-plugin-configmap.yaml b/charts/llm-d-modelservice/templates/epp-plugin-configmap.yaml
index f8cfb260..9fb24cf8 100644
--- a/charts/llm-d-modelservice/templates/epp-plugin-configmap.yaml
+++ b/charts/llm-d-modelservice/templates/epp-plugin-configmap.yaml
@@ -83,7 +83,15 @@ data:
         maxPrefixBlocksToMatch: 256
         lruCapacityPerServer: 31250
     - type: prefill-filter
+      parameters:
+        selector:
+          matchLabels:
+            llm-d.ai/role: prefill
     - type: decode-filter
+      parameters:
+        selector:
+          matchLabels:
+            llm-d.ai/role: decode
     - type: max-score-picker
     - type: pd-profile-handler
       parameters:
diff --git a/charts/llm-d-modelservice/templates/epp-role.yaml b/charts/llm-d-modelservice/templates/epp-role.yaml
index a056cbf7..e0f4f22c 100644
--- a/charts/llm-d-modelservice/templates/epp-role.yaml
+++ b/charts/llm-d-modelservice/templates/epp-role.yaml
@@ -7,7 +7,7 @@ rules:
 - apiGroups:
   - inference.networking.x-k8s.io
   resources:
-  - inferencemodels
+  - inferenceobjectives
   - inferencepools
   verbs:
   - get
diff --git a/charts/llm-d-modelservice/templates/httproute.yaml b/charts/llm-d-modelservice/templates/httproute.yaml
index 9824bd6d..36ff2b53 100644
--- a/charts/llm-d-modelservice/templates/httproute.yaml
+++ b/charts/llm-d-modelservice/templates/httproute.yaml
@@ -25,7 +25,6 @@ spec:
       - group: {{ .Values.routing.inferencePool.apiGroup }}
         kind: InferencePool
         name: {{ include "llm-d-modelservice.inferencePoolName" . }}
-        port: {{ .Values.routing.servicePort  }}
         weight: 1
       {{- if .Values.routing.httpRoute.timeouts}}
       timeouts:
diff --git a/charts/llm-d-modelservice/values.yaml b/charts/llm-d-modelservice/values.yaml
index e622b2d4..645424e5 100644
--- a/charts/llm-d-modelservice/values.yaml
+++ b/charts/llm-d-modelservice/values.yaml
@@ -169,9 +169,9 @@ routing:
 
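-    # Allow people to opt out of timeouts by unsetting the default value.
-    # They are set to 0s which in most situations defaults to the providers max timeout.
+    # Timeouts are opt-in: none are set by default. Uncomment the block below to set them.
+    # A value of 0s in most situations defaults to the provider's max timeout.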
-    timeouts:
-      backendRequest: 0s
-      request: 0s
+    # timeouts:
+    #   backendRequest: 0s
+    #   request: 0s
 
   # @schema
   # additionalProperties: true
diff --git a/examples/output-cpu.yaml b/examples/output-cpu.yaml
index c84d1db0..f60e4b91 100644
--- a/examples/output-cpu.yaml
+++ b/examples/output-cpu.yaml
@@ -105,7 +105,15 @@ data:
         maxPrefixBlocksToMatch: 256
         lruCapacityPerServer: 31250
     - type: prefill-filter
+      parameters:
+        selector:
+          matchLabels:
+            llm-d.ai/role: prefill
     - type: decode-filter
+      parameters:
+        selector:
+          matchLabels:
+            llm-d.ai/role: decode
     - type: max-score-picker
     - type: pd-profile-handler
       parameters:
@@ -134,7 +142,7 @@ rules:
 - apiGroups:
   - inference.networking.x-k8s.io
   resources:
-  - inferencemodels
+  - inferenceobjectives
   - inferencepools
   verbs:
   - get
@@ -308,19 +316,21 @@ spec:
         imagePullPolicy: Always
         image: ghcr.io/llm-d/llm-d-inference-scheduler:v0.2.1
         args:
-        - --poolName
+        - --pool-name
         - cpu-sim-llm-d-modelservice
-        - --poolNamespace
+        - --pool-namespace
         - default
-        - -v
-        - "6"
+        - "--pool-group"
+        - "inference.networking.x-k8s.io"
+        - --v
+        - "4"
         - --zap-encoder
-        - json
-        - --grpcPort
+        - "json"
+        - --grpc-port
         - "9002"
-        - --grpcHealthPort
+        - --grpc-health-port
         - "9003"
-        - "-configFile"
+        - "--config-file"
         - "config/default-config.yaml"
         ports:
         - containerPort: 9002
@@ -448,11 +458,7 @@ spec:
       - group: inference.networking.x-k8s.io
         kind: InferencePool
         name: cpu-sim-llm-d-modelservice
-        port: 8000
         weight: 1
-      timeouts:
-        backendRequest: 0s
-        request: 0s
       matches:
       - headers:
         - name: x-model-name
diff --git a/examples/output-pd.yaml b/examples/output-pd.yaml
index d5a303cd..cc299789 100644
--- a/examples/output-pd.yaml
+++ b/examples/output-pd.yaml
@@ -105,7 +105,15 @@ data:
         maxPrefixBlocksToMatch: 256
         lruCapacityPerServer: 31250
     - type: prefill-filter
+      parameters:
+        selector:
+          matchLabels:
+            llm-d.ai/role: prefill
     - type: decode-filter
+      parameters:
+        selector:
+          matchLabels:
+            llm-d.ai/role: decode
     - type: max-score-picker
     - type: pd-profile-handler
       parameters:
@@ -134,7 +142,7 @@ rules:
 - apiGroups:
   - inference.networking.x-k8s.io
   resources:
-  - inferencemodels
+  - inferenceobjectives
   - inferencepools
   verbs:
   - get
@@ -254,7 +262,7 @@ spec:
         
       containers:
         - name: vllm
-          image: ghcr.io/llm-d/llm-d:v0.2.0
+          image: ghcr.io/llm-d/llm-d-cuda:v0.3.0
           
           command: ["vllm", "serve"]
           args:
@@ -277,7 +285,7 @@ spec:
               fieldRef:
                 fieldPath: status.podIP
           - name: VLLM_NIXL_SIDE_CHANNEL_PORT
-            value: "5557"
+            value: "5600"
           - name: VLLM_LOGGING_LEVEL
             value: DEBUG
           - name: DP_SIZE
@@ -292,14 +300,16 @@ spec:
             protocol: TCP
           - containerPort: 5557
             protocol: TCP
+          - containerPort: 5600
+            protocol: TCP
           
           resources:
             limits:
-              cpu: "16"
+              cpu: "6"
               memory: 16Gi
               nvidia.com/gpu: "1"
             requests:
-              cpu: "16"
+              cpu: "6"
               memory: 16Gi
               nvidia.com/gpu: "1"
           
@@ -328,22 +338,24 @@ spec:
       containers:
       - name: epp
         imagePullPolicy: Always
-        image: ghcr.io/llm-d/llm-d-inference-scheduler:v0.2.1
+        image: ghcr.io/llm-d/llm-d-inference-scheduler:v0.3.1
         args:
-        - --poolName
+        - --pool-name
         - pd-llm-d-modelservice
-        - --poolNamespace
+        - --pool-namespace
         - default
-        - -v
+        - "--pool-group"
+        - "inference.networking.x-k8s.io"
+        - --v
         - "4"
         - --zap-encoder
-        - json
-        - --grpcPort
+        - "json"
+        - --grpc-port
         - "9002"
-        - --grpcHealthPort
+        - --grpc-health-port
         - "9003"
-        - "-configFile"
-        - "config/default-config.yaml"
+        - "--config-file"
+        - "config/default-pd-config.yaml"
         ports:
         - containerPort: 9002
           name: grpc
@@ -417,7 +429,7 @@ spec:
         
       containers:
         - name: vllm
-          image: ghcr.io/llm-d/llm-d:v0.2.0
+          image: ghcr.io/llm-d/llm-d-cuda:v0.3.0
           
           command: ["vllm", "serve"]
           args:
@@ -436,7 +448,7 @@ spec:
           - name: UCX_TLS
             value: cuda_ipc,cuda_copy,tcp
           - name: VLLM_NIXL_SIDE_CHANNEL_PORT
-            value: "5557"
+            value: "5600"
           - name: VLLM_NIXL_SIDE_CHANNEL_HOST
             valueFrom:
               fieldRef:
@@ -455,14 +467,16 @@ spec:
             protocol: TCP
           - containerPort: 5557
             protocol: TCP
+          - containerPort: 5600
+            protocol: TCP
           
           resources:
             limits:
-              cpu: "16"
+              cpu: "6"
               memory: 16Gi
               nvidia.com/gpu: "1"
             requests:
-              cpu: "16"
+              cpu: "6"
               memory: 16Gi
               nvidia.com/gpu: "1"
           
@@ -470,20 +484,6 @@ spec:
             - name: model-storage
               mountPath: /model-cache
 ---
-# Source: llm-d-modelservice/templates/inferencemodel.yaml
-apiVersion: inference.networking.x-k8s.io/v1alpha2
-kind: InferenceModel
-metadata:
-  labels:
-    llm-d.ai/inferenceServing: "true"
-    llm-d.ai/model: pd-llm-d-modelservice
-  name: pd-llm-d-modelservice
-spec:
-  criticality: Critical
-  modelName: facebook/opt-125m
-  poolRef:
-    name: pd-llm-d-modelservice
----
 # Source: llm-d-modelservice/templates/inferencepool.yaml
 apiVersion: inference.networking.x-k8s.io/v1alpha2
 kind: InferencePool
@@ -524,13 +524,9 @@ spec:
       - group: inference.networking.x-k8s.io
         kind: InferencePool
         name: pd-llm-d-modelservice
-        port: 8000
         weight: 1
-      timeouts:
-        backendRequest: 0s
-        request: 0s
       matches:
-      - headers:
-        - name: x-model-name
-          type: Exact
-          value: 'facebook/opt-125m'
+      - headers: []
+        path:
+          type: PathPrefix
+          value: /
diff --git a/examples/output-pvc-hf.yaml b/examples/output-pvc-hf.yaml
index 0dd226f1..4476525f 100644
--- a/examples/output-pvc-hf.yaml
+++ b/examples/output-pvc-hf.yaml
@@ -105,7 +105,15 @@ data:
         maxPrefixBlocksToMatch: 256
         lruCapacityPerServer: 31250
     - type: prefill-filter
+      parameters:
+        selector:
+          matchLabels:
+            llm-d.ai/role: prefill
     - type: decode-filter
+      parameters:
+        selector:
+          matchLabels:
+            llm-d.ai/role: decode
     - type: max-score-picker
     - type: pd-profile-handler
       parameters:
@@ -134,7 +142,7 @@ rules:
 - apiGroups:
   - inference.networking.x-k8s.io
   resources:
-  - inferencemodels
+  - inferenceobjectives
   - inferencepools
   verbs:
   - get
@@ -254,7 +262,7 @@ spec:
             readOnly: true
       containers:
         - name: vllm
-          image: ghcr.io/llm-d/llm-d:v0.2.0
+          image: ghcr.io/llm-d/llm-d-cuda:v0.3.0
           
           command: ["vllm", "serve"]
           args:
@@ -277,7 +285,7 @@ spec:
               fieldRef:
                 fieldPath: status.podIP
           - name: VLLM_NIXL_SIDE_CHANNEL_PORT
-            value: "5557"
+            value: "5600"
           - name: VLLM_LOGGING_LEVEL
             value: DEBUG
           - name: DP_SIZE
@@ -292,14 +300,16 @@ spec:
             protocol: TCP
           - containerPort: 5557
             protocol: TCP
+          - containerPort: 5600
+            protocol: TCP
           
           resources:
             limits:
-              cpu: "16"
+              cpu: "6"
               memory: 16Gi
               nvidia.com/gpu: "1"
             requests:
-              cpu: "16"
+              cpu: "6"
               memory: 16Gi
               nvidia.com/gpu: "1"
           
@@ -328,22 +338,24 @@ spec:
       containers:
       - name: epp
         imagePullPolicy: Always
-        image: ghcr.io/llm-d/llm-d-inference-scheduler:v0.2.1
+        image: ghcr.io/llm-d/llm-d-inference-scheduler:v0.3.1
         args:
-        - --poolName
+        - --pool-name
         - pvc-hf-llm-d-modelservice
-        - --poolNamespace
+        - --pool-namespace
         - default
-        - -v
+        - "--pool-group"
+        - "inference.networking.x-k8s.io"
+        - --v
         - "4"
         - --zap-encoder
-        - json
-        - --grpcPort
+        - "json"
+        - --grpc-port
         - "9002"
-        - --grpcHealthPort
+        - --grpc-health-port
         - "9003"
-        - "-configFile"
-        - "config/default-config.yaml"
+        - "--config-file"
+        - "config/default-pd-config.yaml"
         ports:
         - containerPort: 9002
           name: grpc
@@ -417,7 +429,7 @@ spec:
             readOnly: true
       containers:
         - name: vllm
-          image: ghcr.io/llm-d/llm-d:v0.2.0
+          image: ghcr.io/llm-d/llm-d-cuda:v0.3.0
           
           command: ["vllm", "serve"]
           args:
@@ -436,7 +448,7 @@ spec:
           - name: UCX_TLS
             value: cuda_ipc,cuda_copy,tcp
           - name: VLLM_NIXL_SIDE_CHANNEL_PORT
-            value: "5557"
+            value: "5600"
           - name: VLLM_NIXL_SIDE_CHANNEL_HOST
             valueFrom:
               fieldRef:
@@ -455,14 +467,16 @@ spec:
             protocol: TCP
           - containerPort: 5557
             protocol: TCP
+          - containerPort: 5600
+            protocol: TCP
           
           resources:
             limits:
-              cpu: "16"
+              cpu: "6"
               memory: 16Gi
               nvidia.com/gpu: "1"
             requests:
-              cpu: "16"
+              cpu: "6"
               memory: 16Gi
               nvidia.com/gpu: "1"
           
@@ -470,20 +484,6 @@ spec:
             - name: model-storage
               mountPath: /model-cache
 ---
-# Source: llm-d-modelservice/templates/inferencemodel.yaml
-apiVersion: inference.networking.x-k8s.io/v1alpha2
-kind: InferenceModel
-metadata:
-  labels:
-    llm-d.ai/inferenceServing: "true"
-    llm-d.ai/model: pvc-hf-llm-d-modelservice
-  name: pvc-hf-llm-d-modelservice
-spec:
-  criticality: Critical
-  modelName: facebook/opt-125m
-  poolRef:
-    name: pvc-hf-llm-d-modelservice
----
 # Source: llm-d-modelservice/templates/inferencepool.yaml
 apiVersion: inference.networking.x-k8s.io/v1alpha2
 kind: InferencePool
@@ -524,13 +524,9 @@ spec:
       - group: inference.networking.x-k8s.io
         kind: InferencePool
         name: pvc-hf-llm-d-modelservice
-        port: 8000
         weight: 1
-      timeouts:
-        backendRequest: 0s
-        request: 0s
       matches:
-      - headers:
-        - name: x-model-name
-          type: Exact
-          value: 'facebook/opt-125m'
+      - headers: []
+        path:
+          type: PathPrefix
+          value: /
diff --git a/examples/output-pvc.yaml b/examples/output-pvc.yaml
index cc63db83..666ad033 100644
--- a/examples/output-pvc.yaml
+++ b/examples/output-pvc.yaml
@@ -105,7 +105,15 @@ data:
         maxPrefixBlocksToMatch: 256
         lruCapacityPerServer: 31250
     - type: prefill-filter
+      parameters:
+        selector:
+          matchLabels:
+            llm-d.ai/role: prefill
     - type: decode-filter
+      parameters:
+        selector:
+          matchLabels:
+            llm-d.ai/role: decode
     - type: max-score-picker
     - type: pd-profile-handler
       parameters:
@@ -134,7 +142,7 @@ rules:
 - apiGroups:
   - inference.networking.x-k8s.io
   resources:
-  - inferencemodels
+  - inferenceobjectives
   - inferencepools
   verbs:
   - get
@@ -254,7 +262,7 @@ spec:
             readOnly: true
       containers:
         - name: vllm
-          image: ghcr.io/llm-d/llm-d:v0.2.0
+          image: ghcr.io/llm-d/llm-d-cuda:v0.3.0
           
           command: ["vllm", "serve"]
           args:
@@ -277,7 +285,7 @@ spec:
               fieldRef:
                 fieldPath: status.podIP
           - name: VLLM_NIXL_SIDE_CHANNEL_PORT
-            value: "5557"
+            value: "5600"
           - name: VLLM_LOGGING_LEVEL
             value: DEBUG
           - name: DP_SIZE
@@ -290,14 +298,16 @@ spec:
             protocol: TCP
           - containerPort: 5557
             protocol: TCP
+          - containerPort: 5600
+            protocol: TCP
           
           resources:
             limits:
-              cpu: "16"
+              cpu: "6"
               memory: 16Gi
               nvidia.com/gpu: "1"
             requests:
-              cpu: "16"
+              cpu: "6"
               memory: 16Gi
               nvidia.com/gpu: "1"
           
@@ -326,22 +336,24 @@ spec:
       containers:
       - name: epp
         imagePullPolicy: Always
-        image: ghcr.io/llm-d/llm-d-inference-scheduler:v0.2.1
+        image: ghcr.io/llm-d/llm-d-inference-scheduler:v0.3.1
         args:
-        - --poolName
+        - --pool-name
         - pvc-llm-d-modelservice
-        - --poolNamespace
+        - --pool-namespace
         - default
-        - -v
+        - "--pool-group"
+        - "inference.networking.x-k8s.io"
+        - --v
         - "4"
         - --zap-encoder
-        - json
-        - --grpcPort
+        - "json"
+        - --grpc-port
         - "9002"
-        - --grpcHealthPort
+        - --grpc-health-port
         - "9003"
-        - "-configFile"
-        - "config/default-config.yaml"
+        - "--config-file"
+        - "config/default-pd-config.yaml"
         ports:
         - containerPort: 9002
           name: grpc
@@ -415,7 +427,7 @@ spec:
             readOnly: true
       containers:
         - name: vllm
-          image: ghcr.io/llm-d/llm-d:v0.2.0
+          image: ghcr.io/llm-d/llm-d-cuda:v0.3.0
           
           command: ["vllm", "serve"]
           args:
@@ -434,7 +446,7 @@ spec:
           - name: UCX_TLS
             value: cuda_ipc,cuda_copy,tcp
           - name: VLLM_NIXL_SIDE_CHANNEL_PORT
-            value: "5557"
+            value: "5600"
           - name: VLLM_NIXL_SIDE_CHANNEL_HOST
             valueFrom:
               fieldRef:
@@ -451,14 +463,16 @@ spec:
             protocol: TCP
           - containerPort: 5557
             protocol: TCP
+          - containerPort: 5600
+            protocol: TCP
           
           resources:
             limits:
-              cpu: "16"
+              cpu: "6"
               memory: 16Gi
               nvidia.com/gpu: "1"
             requests:
-              cpu: "16"
+              cpu: "6"
               memory: 16Gi
               nvidia.com/gpu: "1"
           
@@ -466,20 +480,6 @@ spec:
             - name: model-storage
               mountPath: /model-cache
 ---
-# Source: llm-d-modelservice/templates/inferencemodel.yaml
-apiVersion: inference.networking.x-k8s.io/v1alpha2
-kind: InferenceModel
-metadata:
-  labels:
-    llm-d.ai/inferenceServing: "true"
-    llm-d.ai/model: pvc-llm-d-modelservice
-  name: pvc-llm-d-modelservice
-spec:
-  criticality: Critical
-  modelName: facebook/opt-125m
-  poolRef:
-    name: pvc-llm-d-modelservice
----
 # Source: llm-d-modelservice/templates/inferencepool.yaml
 apiVersion: inference.networking.x-k8s.io/v1alpha2
 kind: InferencePool
@@ -520,13 +520,9 @@ spec:
       - group: inference.networking.x-k8s.io
         kind: InferencePool
         name: pvc-llm-d-modelservice
-        port: 8000
         weight: 1
-      timeouts:
-        backendRequest: 0s
-        request: 0s
       matches:
-      - headers:
-        - name: x-model-name
-          type: Exact
-          value: 'facebook/opt-125m'
+      - headers: []
+        path:
+          type: PathPrefix
+          value: /
diff --git a/examples/values-pd.yaml b/examples/values-pd.yaml
index fe0e849f..403880df 100644
--- a/examples/values-pd.yaml
+++ b/examples/values-pd.yaml
@@ -26,22 +26,25 @@ routing:
 
   inferencePool:
     create: true
+    apiGroup: "inference.networking.x-k8s.io"
 
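-  # required for certain gateways (e.g. Kgateway) but not others (Istio)
-  # creating this so that it works for all gateways
+  # previously created for gateways that required it (e.g. Kgateway, not Istio);
+  # disabled here now that the EPP role targets inferenceobjectives, not inferencemodels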
   inferenceModel:
-    create: true
+    create: false
 
   httpRoute:
     create: true
-    matches:
-    - headers:
-      - name: x-model-name
-        type: Exact
-        value: "{{ .Values.modelArtifacts.name }}"
+    # matches:
+    # - headers:
+    #   - name: x-model-name
+    #     type: Exact
+    #     value: "{{ .Values.modelArtifacts.name }}"
 
   epp:
     create: true
+    pluginsConfigFile: default-pd-config.yaml
+    image: ghcr.io/llm-d/llm-d-inference-scheduler:v0.3.1
     # Inherit all from base chart values.yaml
     # env:
 
@@ -51,7 +54,7 @@ decode:
   replicas: 1
   containers:
   - name: "vllm"
-    image: "ghcr.io/llm-d/llm-d:v0.2.0"
+    image: "ghcr.io/llm-d/llm-d-cuda:v0.3.0"
     modelCommand: vllmServe
     args:
       - "--enforce-eager"
@@ -67,21 +70,23 @@ decode:
           fieldRef:
             fieldPath: status.podIP
       - name: VLLM_NIXL_SIDE_CHANNEL_PORT
-        value: "5557"
+        value: "5600"
       - name: VLLM_LOGGING_LEVEL
         value: DEBUG
     ports:
       - containerPort: 8200  # from routing.proxy.targetPort
         protocol: TCP
-      - containerPort: 5557  # NIXL side channel
+      - containerPort: 5557  # NIXL channel
+        protocol: TCP
+      - containerPort: 5600  # NIXL side channel
         protocol: TCP
     resources:
       limits:
         memory: 16Gi
-        cpu: "16"
+        cpu: "6"
         nvidia.com/gpu: "1"
       requests:
-        cpu: "16"
+        cpu: "6"
         memory: 16Gi
         nvidia.com/gpu: "1"
     mountModelVolume: true
@@ -92,7 +97,7 @@ prefill:
   replicas: 1
   containers:
   - name: "vllm"
-    image: "ghcr.io/llm-d/llm-d:v0.2.0"
+    image: "ghcr.io/llm-d/llm-d-cuda:v0.3.0"
     modelCommand: vllmServe
     args:
       - "--enforce-eager"
@@ -104,7 +109,7 @@ prefill:
       - name: UCX_TLS
         value: "cuda_ipc,cuda_copy,tcp"
       - name: VLLM_NIXL_SIDE_CHANNEL_PORT
-        value: "5557"
+        value: "5600"
       - name: VLLM_NIXL_SIDE_CHANNEL_HOST
         valueFrom:
           fieldRef:
@@ -114,15 +119,17 @@ prefill:
     ports:
       - containerPort: 8000  # from routing.servicePort
         protocol: TCP
-      - containerPort: 5557  # NIXL side channel
+      - containerPort: 5557  # NIXL channel
+        protocol: TCP
+      - containerPort: 5600  # NIXL side channel
         protocol: TCP
     resources:
       limits:
         memory: 16Gi
-        cpu: "16"
+        cpu: "6"
         nvidia.com/gpu: "1"
       requests:
-        cpu: "16"
+        cpu: "6"
         memory: 16Gi
         nvidia.com/gpu: "1"
     mountModelVolume: true