diff --git a/charts/llm-d-modelservice/templates/epp-deployment.yaml b/charts/llm-d-modelservice/templates/epp-deployment.yaml index f7c50114..acb09ae2 100644 --- a/charts/llm-d-modelservice/templates/epp-deployment.yaml +++ b/charts/llm-d-modelservice/templates/epp-deployment.yaml @@ -21,20 +21,22 @@ spec: imagePullPolicy: Always image: {{ required "routing.epp.image must be specified" .Values.routing.epp.image }} args: - - --poolName + - --pool-name - {{ include "llm-d-modelservice.inferencePoolName" . }} - - --poolNamespace + - --pool-namespace - {{ .Release.Namespace }} - - -v - - "{{ default 4 .Values.routing.epp.debugLevel }}" + - "--pool-group" + - "inference.networking.x-k8s.io" + - --v + - "4" - --zap-encoder - - json - - --grpcPort + - "json" + - --grpc-port - "9002" - - --grpcHealthPort + - --grpc-health-port - "9003" {{- if .Values.routing.epp.pluginsConfigFile }} - - "-configFile" + - "--config-file" - "config/{{ .Values.routing.epp.pluginsConfigFile }}" {{- end}} {{- with .Values.routing.epp.env }} diff --git a/charts/llm-d-modelservice/templates/epp-plugin-configmap.yaml b/charts/llm-d-modelservice/templates/epp-plugin-configmap.yaml index f8cfb260..9fb24cf8 100644 --- a/charts/llm-d-modelservice/templates/epp-plugin-configmap.yaml +++ b/charts/llm-d-modelservice/templates/epp-plugin-configmap.yaml @@ -83,7 +83,15 @@ data: maxPrefixBlocksToMatch: 256 lruCapacityPerServer: 31250 - type: prefill-filter + parameters: + selector: + matchLabels: + llm-d.ai/role: prefill - type: decode-filter + parameters: + selector: + matchLabels: + llm-d.ai/role: decode - type: max-score-picker - type: pd-profile-handler parameters: diff --git a/charts/llm-d-modelservice/templates/epp-role.yaml b/charts/llm-d-modelservice/templates/epp-role.yaml index a056cbf7..e0f4f22c 100644 --- a/charts/llm-d-modelservice/templates/epp-role.yaml +++ b/charts/llm-d-modelservice/templates/epp-role.yaml @@ -7,7 +7,7 @@ rules: - apiGroups: - inference.networking.x-k8s.io resources: - - inferencemodels + - inferenceobjectives - inferencepools verbs: - get diff --git a/charts/llm-d-modelservice/templates/httproute.yaml b/charts/llm-d-modelservice/templates/httproute.yaml index 9824bd6d..36ff2b53 100644 --- a/charts/llm-d-modelservice/templates/httproute.yaml +++ b/charts/llm-d-modelservice/templates/httproute.yaml @@ -25,7 +25,6 @@ spec: - group: {{ .Values.routing.inferencePool.apiGroup }} kind: InferencePool name: {{ include "llm-d-modelservice.inferencePoolName" . }} - port: {{ .Values.routing.servicePort }} weight: 1 {{- if .Values.routing.httpRoute.timeouts}} timeouts: diff --git a/charts/llm-d-modelservice/values.yaml b/charts/llm-d-modelservice/values.yaml index e622b2d4..645424e5 100644 --- a/charts/llm-d-modelservice/values.yaml +++ b/charts/llm-d-modelservice/values.yaml @@ -169,9 +169,9 @@ routing: # Allow people to opt out of timeouts by unsetting the default value. # They are set to 0s which in most situations defaults to the providers max timeout. - timeouts: - backendRequest: 0s - request: 0s + # timeouts: + # backendRequest: 0s + # request: 0s # @schema # additionalProperties: true diff --git a/examples/output-cpu.yaml b/examples/output-cpu.yaml index c84d1db0..f60e4b91 100644 --- a/examples/output-cpu.yaml +++ b/examples/output-cpu.yaml @@ -105,7 +105,15 @@ data: maxPrefixBlocksToMatch: 256 lruCapacityPerServer: 31250 - type: prefill-filter + parameters: + selector: + matchLabels: + llm-d.ai/role: prefill - type: decode-filter + parameters: + selector: + matchLabels: + llm-d.ai/role: decode - type: max-score-picker - type: pd-profile-handler parameters: @@ -134,7 +142,7 @@ rules: - apiGroups: - inference.networking.x-k8s.io resources: - - inferencemodels + - inferenceobjectives - inferencepools verbs: - get @@ -308,19 +316,21 @@ spec: imagePullPolicy: Always image: ghcr.io/llm-d/llm-d-inference-scheduler:v0.2.1 args: - - --poolName + - --pool-name - cpu-sim-llm-d-modelservice - - --poolNamespace + - --pool-namespace - default - - -v - - "6" + - "--pool-group" + - "inference.networking.x-k8s.io" + - --v + - "4" - --zap-encoder - - json - - --grpcPort + - "json" + - --grpc-port - "9002" - - --grpcHealthPort + - --grpc-health-port - "9003" - - "-configFile" + - "--config-file" - "config/default-config.yaml" ports: - containerPort: 9002 @@ -448,11 +458,7 @@ spec: - group: inference.networking.x-k8s.io kind: InferencePool name: cpu-sim-llm-d-modelservice - port: 8000 weight: 1 - timeouts: - backendRequest: 0s - request: 0s matches: - headers: - name: x-model-name diff --git a/examples/output-pd.yaml b/examples/output-pd.yaml index d5a303cd..cc299789 100644 --- a/examples/output-pd.yaml +++ b/examples/output-pd.yaml @@ -105,7 +105,15 @@ data: maxPrefixBlocksToMatch: 256 lruCapacityPerServer: 31250 - type: prefill-filter + parameters: + selector: + matchLabels: + llm-d.ai/role: prefill - type: decode-filter + parameters: + selector: + matchLabels: + llm-d.ai/role: decode - type: max-score-picker - type: pd-profile-handler parameters: @@ -134,7 +142,7 @@ rules: - apiGroups: - inference.networking.x-k8s.io resources: - - inferencemodels + - inferenceobjectives - inferencepools verbs: - get @@ -254,7 +262,7 @@ spec: containers: - name: vllm - image: ghcr.io/llm-d/llm-d:v0.2.0 + image: ghcr.io/llm-d/llm-d-cuda:v0.3.0 command: ["vllm", "serve"] args: @@ -277,7 +285,7 @@ spec: fieldRef: fieldPath: status.podIP - name: VLLM_NIXL_SIDE_CHANNEL_PORT - value: "5557" + value: "5600" - name: VLLM_LOGGING_LEVEL value: DEBUG - name: DP_SIZE @@ -292,14 +300,16 @@ spec: protocol: TCP - containerPort: 5557 protocol: TCP + - containerPort: 5600 + protocol: TCP resources: limits: - cpu: "16" + cpu: "6" memory: 16Gi nvidia.com/gpu: "1" requests: - cpu: "16" + cpu: "6" memory: 16Gi nvidia.com/gpu: "1" @@ -328,22 +338,24 @@ spec: containers: - name: epp imagePullPolicy: Always - image: ghcr.io/llm-d/llm-d-inference-scheduler:v0.2.1 + image: ghcr.io/llm-d/llm-d-inference-scheduler:v0.3.1 args: - - --poolName + - --pool-name - pd-llm-d-modelservice - - --poolNamespace + - --pool-namespace - default - - -v + - "--pool-group" + - "inference.networking.x-k8s.io" + - --v - "4" - --zap-encoder - - json - - --grpcPort + - "json" + - --grpc-port - "9002" - - --grpcHealthPort + - --grpc-health-port - "9003" - - "-configFile" - - "config/default-config.yaml" + - "--config-file" + - "config/default-pd-config.yaml" ports: - containerPort: 9002 name: grpc @@ -417,7 +429,7 @@ spec: containers: - name: vllm - image: ghcr.io/llm-d/llm-d:v0.2.0 + image: ghcr.io/llm-d/llm-d-cuda:v0.3.0 command: ["vllm", "serve"] args: @@ -436,7 +448,7 @@ spec: - name: UCX_TLS value: cuda_ipc,cuda_copy,tcp - name: VLLM_NIXL_SIDE_CHANNEL_PORT - value: "5557" + value: "5600" - name: VLLM_NIXL_SIDE_CHANNEL_HOST valueFrom: fieldRef: @@ -455,14 +467,16 @@ spec: protocol: TCP - containerPort: 5557 protocol: TCP + - containerPort: 5600 + protocol: TCP resources: limits: - cpu: "16" + cpu: "6" memory: 16Gi nvidia.com/gpu: "1" requests: - cpu: "16" + cpu: "6" memory: 16Gi nvidia.com/gpu: "1" @@ -470,20 +484,6 @@ spec: - name: model-storage mountPath: /model-cache --- -# Source: llm-d-modelservice/templates/inferencemodel.yaml -apiVersion: inference.networking.x-k8s.io/v1alpha2 -kind: InferenceModel -metadata: - labels: - llm-d.ai/inferenceServing: "true" - llm-d.ai/model: pd-llm-d-modelservice - name: pd-llm-d-modelservice -spec: - criticality: Critical - modelName: facebook/opt-125m - poolRef: - name: pd-llm-d-modelservice ---- # Source: llm-d-modelservice/templates/inferencepool.yaml apiVersion: inference.networking.x-k8s.io/v1alpha2 kind: InferencePool @@ -524,13 +524,9 @@ spec: - group: inference.networking.x-k8s.io kind: InferencePool name: pd-llm-d-modelservice - port: 8000 weight: 1 - timeouts: - backendRequest: 0s - request: 0s matches: - - headers: - - name: x-model-name - type: Exact - value: 'facebook/opt-125m' + - headers: [] + path: + type: PathPrefix + value: / diff --git a/examples/output-pvc-hf.yaml b/examples/output-pvc-hf.yaml index 0dd226f1..4476525f 100644 --- a/examples/output-pvc-hf.yaml +++ b/examples/output-pvc-hf.yaml @@ -105,7 +105,15 @@ data: maxPrefixBlocksToMatch: 256 lruCapacityPerServer: 31250 - type: prefill-filter + parameters: + selector: + matchLabels: + llm-d.ai/role: prefill - type: decode-filter + parameters: + selector: + matchLabels: + llm-d.ai/role: decode - type: max-score-picker - type: pd-profile-handler parameters: @@ -134,7 +142,7 @@ rules: - apiGroups: - inference.networking.x-k8s.io resources: - - inferencemodels + - inferenceobjectives - inferencepools verbs: - get @@ -254,7 +262,7 @@ spec: readOnly: true containers: - name: vllm - image: ghcr.io/llm-d/llm-d:v0.2.0 + image: ghcr.io/llm-d/llm-d-cuda:v0.3.0 command: ["vllm", "serve"] args: @@ -277,7 +285,7 @@ spec: fieldRef: fieldPath: status.podIP - name: VLLM_NIXL_SIDE_CHANNEL_PORT - value: "5557" + value: "5600" - name: VLLM_LOGGING_LEVEL value: DEBUG - name: DP_SIZE @@ -292,14 +300,16 @@ spec: protocol: TCP - containerPort: 5557 protocol: TCP + - containerPort: 5600 + protocol: TCP resources: limits: - cpu: "16" + cpu: "6" memory: 16Gi nvidia.com/gpu: "1" requests: - cpu: "16" + cpu: "6" memory: 16Gi nvidia.com/gpu: "1" @@ -328,22 +338,24 @@ spec: containers: - name: epp imagePullPolicy: Always - image: ghcr.io/llm-d/llm-d-inference-scheduler:v0.2.1 + image: ghcr.io/llm-d/llm-d-inference-scheduler:v0.3.1 args: - - --poolName + - --pool-name - pvc-hf-llm-d-modelservice - - --poolNamespace + - --pool-namespace - default - - -v + - "--pool-group" + - "inference.networking.x-k8s.io" + - --v - "4" - --zap-encoder - - json - - --grpcPort + - "json" + - --grpc-port - "9002" - - --grpcHealthPort + - --grpc-health-port - "9003" - - "-configFile" - - "config/default-config.yaml" + - "--config-file" + - "config/default-pd-config.yaml" ports: - containerPort: 9002 name: grpc @@ -417,7 +429,7 @@ spec: readOnly: true containers: - name: vllm - image: ghcr.io/llm-d/llm-d:v0.2.0 + image: ghcr.io/llm-d/llm-d-cuda:v0.3.0 command: ["vllm", "serve"] args: @@ -436,7 +448,7 @@ spec: - name: UCX_TLS value: cuda_ipc,cuda_copy,tcp - name: VLLM_NIXL_SIDE_CHANNEL_PORT - value: "5557" + value: "5600" - name: VLLM_NIXL_SIDE_CHANNEL_HOST valueFrom: fieldRef: @@ -455,14 +467,16 @@ spec: protocol: TCP - containerPort: 5557 protocol: TCP + - containerPort: 5600 + protocol: TCP resources: limits: - cpu: "16" + cpu: "6" memory: 16Gi nvidia.com/gpu: "1" requests: - cpu: "16" + cpu: "6" memory: 16Gi nvidia.com/gpu: "1" @@ -470,20 +484,6 @@ spec: - name: model-storage mountPath: /model-cache --- -# Source: llm-d-modelservice/templates/inferencemodel.yaml -apiVersion: inference.networking.x-k8s.io/v1alpha2 -kind: InferenceModel -metadata: - labels: - llm-d.ai/inferenceServing: "true" - llm-d.ai/model: pvc-hf-llm-d-modelservice - name: pvc-hf-llm-d-modelservice -spec: - criticality: Critical - modelName: facebook/opt-125m - poolRef: - name: pvc-hf-llm-d-modelservice ---- # Source: llm-d-modelservice/templates/inferencepool.yaml apiVersion: inference.networking.x-k8s.io/v1alpha2 kind: InferencePool @@ -524,13 +524,9 @@ spec: - group: inference.networking.x-k8s.io kind: InferencePool name: pvc-hf-llm-d-modelservice - port: 8000 weight: 1 - timeouts: - backendRequest: 0s - request: 0s matches: - - headers: - - name: x-model-name - type: Exact - value: 'facebook/opt-125m' + - headers: [] + path: + type: PathPrefix + value: / diff --git a/examples/output-pvc.yaml b/examples/output-pvc.yaml index cc63db83..666ad033 100644 --- a/examples/output-pvc.yaml +++ b/examples/output-pvc.yaml @@ -105,7 +105,15 @@ data: maxPrefixBlocksToMatch: 256 lruCapacityPerServer: 31250 - type: prefill-filter + parameters: + selector: + matchLabels: + llm-d.ai/role: prefill - type: decode-filter + parameters: + selector: + matchLabels: + llm-d.ai/role: decode - type: max-score-picker - type: pd-profile-handler parameters: @@ -134,7 +142,7 @@ rules: - apiGroups: - inference.networking.x-k8s.io resources: - - inferencemodels + - inferenceobjectives - inferencepools verbs: - get @@ -254,7 +262,7 @@ spec: readOnly: true containers: - name: vllm - image: ghcr.io/llm-d/llm-d:v0.2.0 + image: ghcr.io/llm-d/llm-d-cuda:v0.3.0 command: ["vllm", "serve"] args: @@ -277,7 +285,7 @@ spec: fieldRef: fieldPath: status.podIP - name: VLLM_NIXL_SIDE_CHANNEL_PORT - value: "5557" + value: "5600" - name: VLLM_LOGGING_LEVEL value: DEBUG - name: DP_SIZE @@ -290,14 +298,16 @@ spec: protocol: TCP - containerPort: 5557 protocol: TCP + - containerPort: 5600 + protocol: TCP resources: limits: - cpu: "16" + cpu: "6" memory: 16Gi nvidia.com/gpu: "1" requests: - cpu: "16" + cpu: "6" memory: 16Gi nvidia.com/gpu: "1" @@ -326,22 +336,24 @@ spec: containers: - name: epp imagePullPolicy: Always - image: ghcr.io/llm-d/llm-d-inference-scheduler:v0.2.1 + image: ghcr.io/llm-d/llm-d-inference-scheduler:v0.3.1 args: - - --poolName + - --pool-name - pvc-llm-d-modelservice - - --poolNamespace + - --pool-namespace - default - - -v + - "--pool-group" + - "inference.networking.x-k8s.io" + - --v - "4" - --zap-encoder - - json - - --grpcPort + - "json" + - --grpc-port - "9002" - - --grpcHealthPort + - --grpc-health-port - "9003" - - "-configFile" - - "config/default-config.yaml" + - "--config-file" + - "config/default-pd-config.yaml" ports: - containerPort: 9002 name: grpc @@ -415,7 +427,7 @@ spec: readOnly: true containers: - name: vllm - image: ghcr.io/llm-d/llm-d:v0.2.0 + image: ghcr.io/llm-d/llm-d-cuda:v0.3.0 command: ["vllm", "serve"] args: @@ -434,7 +446,7 @@ spec: - name: UCX_TLS value: cuda_ipc,cuda_copy,tcp - name: VLLM_NIXL_SIDE_CHANNEL_PORT - value: "5557" + value: "5600" - name: VLLM_NIXL_SIDE_CHANNEL_HOST valueFrom: fieldRef: @@ -451,14 +463,16 @@ spec: protocol: TCP - containerPort: 5557 protocol: TCP + - containerPort: 5600 + protocol: TCP resources: limits: - cpu: "16" + cpu: "6" memory: 16Gi nvidia.com/gpu: "1" requests: - cpu: "16" + cpu: "6" memory: 16Gi nvidia.com/gpu: "1" @@ -466,20 +480,6 @@ spec: - name: model-storage mountPath: /model-cache --- -# Source: llm-d-modelservice/templates/inferencemodel.yaml -apiVersion: inference.networking.x-k8s.io/v1alpha2 -kind: InferenceModel -metadata: - labels: - llm-d.ai/inferenceServing: "true" - llm-d.ai/model: pvc-llm-d-modelservice - name: pvc-llm-d-modelservice -spec: - criticality: Critical - modelName: facebook/opt-125m - poolRef: - name: pvc-llm-d-modelservice ---- # Source: llm-d-modelservice/templates/inferencepool.yaml apiVersion: inference.networking.x-k8s.io/v1alpha2 kind: InferencePool @@ -520,13 +520,9 @@ spec: - group: inference.networking.x-k8s.io kind: InferencePool name: pvc-llm-d-modelservice - port: 8000 weight: 1 - timeouts: - backendRequest: 0s - request: 0s matches: - - headers: - - name: x-model-name - type: Exact - value: 'facebook/opt-125m' + - headers: [] + path: + type: PathPrefix + value: / diff --git a/examples/values-pd.yaml b/examples/values-pd.yaml index fe0e849f..403880df 100644 --- a/examples/values-pd.yaml +++ b/examples/values-pd.yaml @@ -26,22 +26,25 @@ routing: inferencePool: create: true + apiGroup: "inference.networking.x-k8s.io" # required for certain gateways (e.g. Kgateway) but not others (Istio) # creating this so that it works for all gateways inferenceModel: - create: true + create: false httpRoute: create: true - matches: - - headers: - - name: x-model-name - type: Exact - value: "{{ .Values.modelArtifacts.name }}" + # matches: + # - headers: + # - name: x-model-name + # type: Exact + # value: "{{ .Values.modelArtifacts.name }}" epp: create: true + pluginsConfigFile: default-pd-config.yaml + image: ghcr.io/llm-d/llm-d-inference-scheduler:v0.3.1 # Inherit all from base chart values.yaml # env: @@ -51,7 +54,7 @@ decode: replicas: 1 containers: - name: "vllm" - image: "ghcr.io/llm-d/llm-d:v0.2.0" + image: "ghcr.io/llm-d/llm-d-cuda:v0.3.0" modelCommand: vllmServe args: - "--enforce-eager" @@ -67,21 +70,23 @@ decode: fieldRef: fieldPath: status.podIP - name: VLLM_NIXL_SIDE_CHANNEL_PORT - value: "5557" + value: "5600" - name: VLLM_LOGGING_LEVEL value: DEBUG ports: - containerPort: 8200 # from routing.proxy.targetPort protocol: TCP - - containerPort: 5557 # NIXL side channel + - containerPort: 5557 # NIXL channel + protocol: TCP + - containerPort: 5600 # NIXL side channel protocol: TCP resources: limits: memory: 16Gi - cpu: "16" + cpu: "6" nvidia.com/gpu: "1" requests: - cpu: "16" + cpu: "6" memory: 16Gi nvidia.com/gpu: "1" mountModelVolume: true @@ -92,7 +97,7 @@ prefill: replicas: 1 containers: - name: "vllm" - image: "ghcr.io/llm-d/llm-d:v0.2.0" + image: "ghcr.io/llm-d/llm-d-cuda:v0.3.0" modelCommand: vllmServe args: - "--enforce-eager" @@ -104,7 +109,7 @@ prefill: - name: UCX_TLS value: "cuda_ipc,cuda_copy,tcp" - name: VLLM_NIXL_SIDE_CHANNEL_PORT - value: "5557" + value: "5600" - name: VLLM_NIXL_SIDE_CHANNEL_HOST valueFrom: fieldRef: @@ -114,15 +119,17 @@ prefill: ports: - containerPort: 8000 # from routing.servicePort protocol: TCP - - containerPort: 5557 # NIXL side channel + - containerPort: 5557 # NIXL channel + protocol: TCP + - containerPort: 5600 # NIXL side channel protocol: TCP resources: limits: memory: 16Gi - cpu: "16" + cpu: "6" nvidia.com/gpu: "1" requests: - cpu: "16" + cpu: "6" memory: 16Gi nvidia.com/gpu: "1" mountModelVolume: true