llm-d-incubation · elieserr · Oct 15, 2025 · Oct 15, 2025 · Oct 20, 2025 · Oct 20, 2025
@@ -21,20 +21,22 @@ spec:
         imagePullPolicy: Always
         image: {{ required "routing.epp.image must be specified" .Values.routing.epp.image }}
         args:
-        - --poolName
+        - --pool-name
         - {{ include "llm-d-modelservice.inferencePoolName" . }}
-        - --poolNamespace
+        - --pool-namespace
         - {{ .Release.Namespace }}
-        - -v
-        - "{{ default 4 .Values.routing.epp.debugLevel }}"
+        - "--pool-group"
+        - "inference.networking.x-k8s.io"
+        - --v
+        - "4"
         - --zap-encoder
-        - json
-        - --grpcPort
+        - "json"
+        - --grpc-port
         - "9002"
-        - --grpcHealthPort
+        - --grpc-health-port
         - "9003"
         {{- if .Values.routing.epp.pluginsConfigFile }}
-        - "-configFile"
+        - "--config-file"
         - "config/{{ .Values.routing.epp.pluginsConfigFile }}"
         {{- end}}
         {{- with .Values.routing.epp.env }}

@@ -83,7 +83,15 @@ data:
         maxPrefixBlocksToMatch: 256
         lruCapacityPerServer: 31250
     - type: prefill-filter
+      parameters:
+        selector:
+          matchLabels:
+            llm-d.ai/role: prefill
     - type: decode-filter
+      parameters:
+        selector:
+          matchLabels:
+            llm-d.ai/role: decode
     - type: max-score-picker
     - type: pd-profile-handler
       parameters:

@@ -7,7 +7,7 @@ rules:
 - apiGroups:
   - inference.networking.x-k8s.io
   resources:
-  - inferencemodels
+  - inferenceobjectives
   - inferencepools
   verbs:
   - get

@@ -25,7 +25,6 @@ spec:
       - group: {{ .Values.routing.inferencePool.apiGroup }}
         kind: InferencePool
         name: {{ include "llm-d-modelservice.inferencePoolName" . }}
-        port: {{ .Values.routing.servicePort  }}
         weight: 1
       {{- if .Values.routing.httpRoute.timeouts}}
       timeouts:

@@ -169,9 +169,9 @@ routing:
 
     # Allow people to opt out of timeouts by unsetting the default value.
     # They are set to 0s, which in most situations defaults to the provider's max timeout.
-    timeouts:
-      backendRequest: 0s
-      request: 0s
+    # timeouts:
+    #   backendRequest: 0s
+    #   request: 0s
 
   # @schema
   # additionalProperties: true

@@ -105,7 +105,15 @@ data:
         maxPrefixBlocksToMatch: 256
         lruCapacityPerServer: 31250
     - type: prefill-filter
+      parameters:
+        selector:
+          matchLabels:
+            llm-d.ai/role: prefill
     - type: decode-filter
+      parameters:
+        selector:
+          matchLabels:
+            llm-d.ai/role: decode
     - type: max-score-picker
     - type: pd-profile-handler
       parameters:
@@ -134,7 +142,7 @@ rules:
 - apiGroups:
   - inference.networking.x-k8s.io
   resources:
-  - inferencemodels
+  - inferenceobjectives
   - inferencepools
   verbs:
   - get
@@ -308,19 +316,21 @@ spec:
         imagePullPolicy: Always
         image: ghcr.io/llm-d/llm-d-inference-scheduler:v0.2.1
         args:
-        - --poolName
+        - --pool-name
         - cpu-sim-llm-d-modelservice
-        - --poolNamespace
+        - --pool-namespace
         - default
-        - -v
-        - "6"
+        - "--pool-group"
+        - "inference.networking.x-k8s.io"
+        - --v
+        - "4"
         - --zap-encoder
-        - json
-        - --grpcPort
+        - "json"
+        - --grpc-port
         - "9002"
-        - --grpcHealthPort
+        - --grpc-health-port
         - "9003"
-        - "-configFile"
+        - "--config-file"
         - "config/default-config.yaml"
         ports:
         - containerPort: 9002
@@ -448,11 +458,7 @@ spec:
       - group: inference.networking.x-k8s.io
         kind: InferencePool
         name: cpu-sim-llm-d-modelservice
-        port: 8000
         weight: 1
-      timeouts:
-        backendRequest: 0s
-        request: 0s
       matches:
       - headers:
         - name: x-model-name

@@ -105,7 +105,15 @@ data:
         maxPrefixBlocksToMatch: 256
         lruCapacityPerServer: 31250
     - type: prefill-filter
+      parameters:
+        selector:
+          matchLabels:
+            llm-d.ai/role: prefill
     - type: decode-filter
+      parameters:
+        selector:
+          matchLabels:
+            llm-d.ai/role: decode
     - type: max-score-picker
     - type: pd-profile-handler
       parameters:
@@ -134,7 +142,7 @@ rules:
 - apiGroups:
   - inference.networking.x-k8s.io
   resources:
-  - inferencemodels
+  - inferenceobjectives
   - inferencepools
   verbs:
   - get
@@ -254,7 +262,7 @@ spec:
 
       containers:
         - name: vllm
-          image: ghcr.io/llm-d/llm-d:v0.2.0
+          image: ghcr.io/llm-d/llm-d-cuda:v0.3.0
 
           command: ["vllm", "serve"]
           args:
@@ -277,7 +285,7 @@ spec:
               fieldRef:
                 fieldPath: status.podIP
           - name: VLLM_NIXL_SIDE_CHANNEL_PORT
-            value: "5557"
+            value: "5600"
           - name: VLLM_LOGGING_LEVEL
             value: DEBUG
           - name: DP_SIZE
@@ -292,14 +300,16 @@ spec:
             protocol: TCP
           - containerPort: 5557
             protocol: TCP
+          - containerPort: 5600
+            protocol: TCP
 
           resources:
             limits:
-              cpu: "16"
+              cpu: "6"
               memory: 16Gi
               nvidia.com/gpu: "1"
             requests:
-              cpu: "16"
+              cpu: "6"
               memory: 16Gi
               nvidia.com/gpu: "1"
 
@@ -328,22 +338,24 @@ spec:
       containers:
       - name: epp
         imagePullPolicy: Always
-        image: ghcr.io/llm-d/llm-d-inference-scheduler:v0.2.1
+        image: ghcr.io/llm-d/llm-d-inference-scheduler:v0.3.1
         args:
-        - --poolName
+        - --pool-name
         - pd-llm-d-modelservice
-        - --poolNamespace
+        - --pool-namespace
         - default
-        - -v
+        - "--pool-group"
+        - "inference.networking.x-k8s.io"
+        - --v
         - "4"
         - --zap-encoder
-        - json
-        - --grpcPort
+        - "json"
+        - --grpc-port
         - "9002"
-        - --grpcHealthPort
+        - --grpc-health-port
         - "9003"
-        - "-configFile"
-        - "config/default-config.yaml"
+        - "--config-file"
+        - "config/default-pd-config.yaml"
         ports:
         - containerPort: 9002
           name: grpc
@@ -417,7 +429,7 @@ spec:
 
       containers:
         - name: vllm
-          image: ghcr.io/llm-d/llm-d:v0.2.0
+          image: ghcr.io/llm-d/llm-d-cuda:v0.3.0
 
           command: ["vllm", "serve"]
           args:
@@ -436,7 +448,7 @@ spec:
           - name: UCX_TLS
             value: cuda_ipc,cuda_copy,tcp
           - name: VLLM_NIXL_SIDE_CHANNEL_PORT
-            value: "5557"
+            value: "5600"
           - name: VLLM_NIXL_SIDE_CHANNEL_HOST
             valueFrom:
               fieldRef:
@@ -455,35 +467,23 @@ spec:
             protocol: TCP
           - containerPort: 5557
             protocol: TCP
+          - containerPort: 5600
+            protocol: TCP
 
           resources:
             limits:
-              cpu: "16"
+              cpu: "6"
               memory: 16Gi
               nvidia.com/gpu: "1"
             requests:
-              cpu: "16"
+              cpu: "6"
               memory: 16Gi
               nvidia.com/gpu: "1"
 
           volumeMounts:
             - name: model-storage
               mountPath: /model-cache
 ---
-# Source: llm-d-modelservice/templates/inferencemodel.yaml
-apiVersion: inference.networking.x-k8s.io/v1alpha2
-kind: InferenceModel
-metadata:
-  labels:
-    llm-d.ai/inferenceServing: "true"
-    llm-d.ai/model: pd-llm-d-modelservice
-  name: pd-llm-d-modelservice
-spec:
-  criticality: Critical
-  modelName: facebook/opt-125m
-  poolRef:
-    name: pd-llm-d-modelservice
----
 # Source: llm-d-modelservice/templates/inferencepool.yaml
 apiVersion: inference.networking.x-k8s.io/v1alpha2
 kind: InferencePool
@@ -524,13 +524,9 @@ spec:
       - group: inference.networking.x-k8s.io
         kind: InferencePool
         name: pd-llm-d-modelservice
-        port: 8000
         weight: 1
-      timeouts:
-        backendRequest: 0s
-        request: 0s
       matches:
-      - headers:
-        - name: x-model-name
-          type: Exact
-          value: 'facebook/opt-125m'
+      - headers: []
+        path:
+          type: PathPrefix
+          value: /