From e10e8f78fb9c0795102fb57548594f55fc344574 Mon Sep 17 00:00:00 2001 From: Michael Kalantar Date: Tue, 21 Oct 2025 10:33:06 -0400 Subject: [PATCH 1/4] remove gaie from chart Signed-off-by: Michael Kalantar --- README.md | 33 +- .../llm-d-modelservice/templates/_helpers.tpl | 51 +-- .../templates/epp-deployment.yaml | 90 ---- .../templates/epp-plugin-configmap.yaml | 108 ----- .../templates/epp-role.yaml | 44 -- .../templates/epp-rolebinding.yaml | 17 - .../llm-d-modelservice/templates/epp-sa.yaml | 8 - .../templates/epp-service.yaml | 21 - .../templates/httproute.yaml | 51 --- .../templates/inferencemodel.yaml | 15 - .../templates/inferencepool.yaml | 20 - charts/llm-d-modelservice/values.schema.json | 414 +----------------- .../values.schema.tmpl.json | 277 +----------- charts/llm-d-modelservice/values.yaml | 140 +----- examples/README.md | 146 +++--- examples/output-cpu.yaml | 308 +------------ examples/output-pd.yaml | 340 +------------- examples/output-pvc-hf.yaml | 340 +------------- examples/output-pvc.yaml | 340 +------------- examples/values-cpu.yaml | 17 +- examples/values-pd.yaml | 28 +- examples/values-xpu-pd.yaml | 26 +- examples/values-xpu.yaml | 3 + 23 files changed, 153 insertions(+), 2684 deletions(-) delete mode 100644 charts/llm-d-modelservice/templates/epp-deployment.yaml delete mode 100644 charts/llm-d-modelservice/templates/epp-plugin-configmap.yaml delete mode 100644 charts/llm-d-modelservice/templates/epp-role.yaml delete mode 100644 charts/llm-d-modelservice/templates/epp-rolebinding.yaml delete mode 100644 charts/llm-d-modelservice/templates/epp-sa.yaml delete mode 100644 charts/llm-d-modelservice/templates/epp-service.yaml delete mode 100644 charts/llm-d-modelservice/templates/httproute.yaml delete mode 100644 charts/llm-d-modelservice/templates/inferencemodel.yaml delete mode 100644 charts/llm-d-modelservice/templates/inferencepool.yaml diff --git a/README.md b/README.md index 01cd7a00..5c8054f8 100644 --- a/README.md +++ b/README.md @@ 
-29,7 +29,20 @@ helm repo update ModelService operates under the assumption that `llm-d-infra` has been installed in a Kubernetes cluster, which installs the required prerequisites and CRDs. Read the [`llm-d` Guides](https://github.com/llm-d/llm-d/blob/main/guides/README.md) for more information. -Note that in order to create HTTPRoute objects last, Helm hooks are used. As a consequence, these objects are not deleted when `helm delete` is executed. They should be manually deleted to avoid unexpected routing problems. +## Routing + +Once a model is deployed, inference requests must be routed to it. To do this, the Kubernetes Gateway API Inference Extension (GAIE) Helm charts can be used. These charts are defined [here](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/config/charts/). For example, to create an InferencePool, use the chart `oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool`. + +### Relationships + +Note that when using the GAIE InferencePool chart together with the ModelService chart, the following relationships must hold: + +- The ModelService field `routing.servicePort` should match the GAIE field `inferencePool.targetPortNumber` or be an entry in the list `inferencePool.targets` (depending on the apiVersion of InferencePool). +- The ModelService field `modelArtifacts.labels` should match the GAIE field `inferencePool.modelServers.matchLabels`. + +### HTTPRoute + +In addition to deploying the GAIE chart, an `HTTPRoute` is typically required to connect the `Gateway` to the `InferencePool`. Creating an HTTPRoute is not part of either chart. Some examples are provided [here](https://github.com/llm-d-incubation/llm-d-modelservice/blob/main/examples/README.md#httproute). ## Examples @@ -56,24 +69,6 @@ Below are the values you can set. | `routing.proxy.targetPort` | The port the vLLM decode container listens on.
If proxy is present, it will forward request to this port. | string | N/A | | `routing.proxy.debugLevel` | Debug level of the routing proxy | int | 5 | | `routing.proxy.parentRefs[*].name` | The name of the inference gateway | string | N/A | -| `routing.inferencePool.create` | If true, creates an InferencePool object | bool | `true` | -| `routing.inferencePool.extensionRef` | Name of of an epp service to use instead of the default one created by this chart. | string | N/A | -| `routing.inferenceModel.create` | If true, creates an InferenceModel object | bool | `false` | -| `routing.httpRoute.create` | If true, creates an HTTPRoute object | bool | `true` | -| `routing.httpRoute.backendRefs` | Override for HTTPRoute.backendRefs | List | [] | -| `routing.httpRoute.matches` | Override for HTTPRoute.backendRefs[*].matches where backendRefs are created by this chart. | Dict | {} | -| `routing.epp.create` | If true, creates EPP objects | bool | `true` | -| `routing.epp.service.permissions` | Role to be bound to the epp service account in place of the default created by this chart. | string | N/A | -| `routing.epp.service.type` | Type of Service created for the Inference Scheduler (Endpoint Picker) deployment | string | ClusterIP | -| `routing.epp.service.port` | The port the Inference Scheduler listens on | int | 9002 | -| `routing.epp.service.targetPort` | The target port the Inference Scheduler listens on | int | 9002 | -| `routing.epp.service.appProtocol` | The app protocol the Inference Scheduler uses | int | 9002 | -| `routing.epp.image` | Image to be used for the epp container | string | ghcr.io/llm-d/llm-d-inference-scheduler:0.0.4` | -| `routing.epp.replicas` | Number of replicas for the Inference Scheduler pod | int | 1 | -| `routing.epp.debugLevel` | Debug level used to start the Inference Scheduler pod | int | 4 | -| `routing.epp.disableReadinessProbe` | Disable readiness probe creation for the Inference Scheduler pod.
Set this to `true` if you want to debug on Kind. | bool | `false` | -| `routing.epp.disableLivenessProbe` | Disable liveness probe creation for the Inference Scheduler pod.
Set this to `true` if you want to debug on Kind. | bool | `false` | -| `routing.epp.env` | List of environment variables | List | [] | | `decode.create` | If true, creates decode Deployment or LeaderWorkerSet | List | `true` | | `decode.annotations` | Annotations that should be added to the Deployment or LeaderWorkerSet | Dict | {} | | `decode.tolerations` | Tolerations that should be added to the Deployment or LeaderWorkerSet | List | [] | diff --git a/charts/llm-d-modelservice/templates/_helpers.tpl b/charts/llm-d-modelservice/templates/_helpers.tpl index 867cccfe..43a44914 100644 --- a/charts/llm-d-modelservice/templates/_helpers.tpl +++ b/charts/llm-d-modelservice/templates/_helpers.tpl @@ -60,8 +60,7 @@ app.kubernetes.io/managed-by: {{ .Release.Service }} {{/* Create common shared by prefill and decode deployment/LWS */}} {{- define "llm-d-modelservice.pdlabels" -}} -llm-d.ai/inferenceServing: "true" -llm-d.ai/model: {{ (include "llm-d-modelservice.fullname" .) -}} +{{ .Values.modelArtifacts.labels | toYaml }} {{- end }} {{/* Create labels for the prefill deployment/LWS */}} @@ -212,54 +211,6 @@ resources: {{ include "llm-d-modelservice.fullname" . }} {{- end }} -{{/* EPP service account name */}} -{{- define "llm-d-modelservice.eppServiceAccountName" -}} -{{ include "llm-d-modelservice.eppName" . }} -{{- end }} - -{{/* EPP service name */}} -{{- define "llm-d-modelservice.eppServiceName" -}} -{{ include "llm-d-modelservice.eppName" . }} -{{- end }} - -{{/* EPP role name */}} -{{- define "llm-d-modelservice.eppRoleName" -}} -{{ include "llm-d-modelservice.eppName" . }} -{{- end }} - -{{/* EPP rolebinding name */}} -{{- define "llm-d-modelservice.eppRoleBindingName" -}} -{{ include "llm-d-modelservice.eppName" . }} -{{- end }} - -{{/* EPP Config name */}} -{{- define "llm-d-modelservice.eppConfigName" -}} -{{ include "llm-d-modelservice.eppName" . 
}} -{{- end }} - -{{/* default inference pool name */}} -{{- define "llm-d-modelservice.inferencePoolName" -}} -{{- if .Values.routing.inferencePool.name -}} -{{- .Values.routing.inferencePool.name }} -{{- else -}} -{{ include "llm-d-modelservice.fullname" . }} -{{- end }} -{{- end }} - -{{/* default inference model name */}} -{{- define "llm-d-modelservice.inferenceModelName" -}} -{{- if .Values.routing.inferenceModel.name -}} -{{- .Values.routing.inferenceModel.name }} -{{- else -}} -{{ include "llm-d-modelservice.fullname" . }} -{{- end -}} -{{- end }} - -{{/* default http route name */}} -{{- define "llm-d-modelservice.httpRouteName" -}} -{{ include "llm-d-modelservice.fullname" . }} -{{- end }} - {{/* Volumes for PD containers based on model artifact prefix Context is .Values.modelArtifacts diff --git a/charts/llm-d-modelservice/templates/epp-deployment.yaml b/charts/llm-d-modelservice/templates/epp-deployment.yaml deleted file mode 100644 index f7c50114..00000000 --- a/charts/llm-d-modelservice/templates/epp-deployment.yaml +++ /dev/null @@ -1,90 +0,0 @@ -{{- if .Values.routing.epp.create -}} -apiVersion: apps/v1 -kind: Deployment -metadata: - name: {{ include "llm-d-modelservice.eppName" . }} - labels: - llm-d.ai/epp: {{ include "llm-d-modelservice.eppName" . }} - namespace: {{ .Release.Namespace }} -spec: - replicas: {{ default 1 .Values.routing.epp.replicas }} - selector: - matchLabels: - llm-d.ai/epp: {{ include "llm-d-modelservice.eppName" . }} - template: - metadata: - labels: - llm-d.ai/epp: {{ include "llm-d-modelservice.eppName" . }} - spec: - containers: - - name: epp - imagePullPolicy: Always - image: {{ required "routing.epp.image must be specified" .Values.routing.epp.image }} - args: - - --poolName - - {{ include "llm-d-modelservice.inferencePoolName" . 
}} - - --poolNamespace - - {{ .Release.Namespace }} - - -v - - "{{ default 4 .Values.routing.epp.debugLevel }}" - - --zap-encoder - - json - - --grpcPort - - "9002" - - --grpcHealthPort - - "9003" - {{- if .Values.routing.epp.pluginsConfigFile }} - - "-configFile" - - "config/{{ .Values.routing.epp.pluginsConfigFile }}" - {{- end}} - {{- with .Values.routing.epp.env }} - env: - {{- toYaml . | nindent 8 }} - {{- end }} - ports: - - containerPort: 9002 - name: grpc - protocol: TCP - - containerPort: 9003 - name: grpc-health - protocol: TCP - - containerPort: 9090 - name: metrics - protocol: TCP - {{- with .Values.routing.epp.extraContainerPorts }} - {{- toYaml . | nindent 8 }} - {{- end }} - {{- if (not .Values.routing.epp.disableReadinessProbe) }} - readinessProbe: - grpc: - port: 9003 - service: envoy.service.ext_proc.v3.ExternalProcessor - initialDelaySeconds: 5 - timeoutSeconds: 1 - periodSeconds: 10 - successThreshold: 1 - failureThreshold: 3 - {{- end }} - {{- if (not .Values.routing.epp.disableLivenessProbe) }} - livenessProbe: - grpc: - port: 9003 - service: envoy.service.ext_proc.v3.ExternalProcessor - initialDelaySeconds: 5 - timeoutSeconds: 1 - periodSeconds: 10 - successThreshold: 1 - failureThreshold: 3 - {{- end }} - {{- if .Values.routing.epp.pluginsConfigFile }} - volumeMounts: - - name: plugins-config-volume - mountPath: "/config" - volumes: - - name: plugins-config-volume - configMap: - name: {{ include "llm-d-modelservice.eppConfigName" . }} - {{- end }} - serviceAccount: {{ include "llm-d-modelservice.eppServiceAccountName" . }} - serviceAccountName: {{ include "llm-d-modelservice.eppServiceAccountName" . 
}} -{{- end }} diff --git a/charts/llm-d-modelservice/templates/epp-plugin-configmap.yaml b/charts/llm-d-modelservice/templates/epp-plugin-configmap.yaml deleted file mode 100644 index f8cfb260..00000000 --- a/charts/llm-d-modelservice/templates/epp-plugin-configmap.yaml +++ /dev/null @@ -1,108 +0,0 @@ -{{- if and .Values.routing.epp.create .Values.routing.epp.pluginsConfigFile -}} -apiVersion: v1 -kind: ConfigMap -metadata: - name: {{ include "llm-d-modelservice.eppConfigName" . }} - namespace: {{ .Release.Namespace }} -data: - default-config.yaml: | - apiVersion: inference.networking.x-k8s.io/v1alpha1 - kind: EndpointPickerConfig - plugins: - - type: prefix-cache-scorer - parameters: - hashBlockSize: 5 - maxPrefixBlocksToMatch: 256 - lruCapacityPerServer: 31250 - - type: decode-filter - - type: max-score-picker - - type: single-profile-handler - schedulingProfiles: - - name: default - plugins: - - pluginRef: decode-filter - - pluginRef: max-score-picker - - pluginRef: prefix-cache-scorer - weight: 50 - prefix-cache-tracking-config.yaml: | - apiVersion: inference.networking.x-k8s.io/v1alpha1 - kind: EndpointPickerConfig - plugins: - - type: single-profile-handler - - type: decode-filter - - type: prefix-cache-scorer - parameters: - mode: cache_tracking - indexerConfig: - tokenProcessorConfig: - blockSize: 64 # must match vLLM block size if not default (16) - hashSeed: "42" # must match PYTHONHASHSEED in vLLM pods - kvBlockIndexConfig: - enableMetrics: true # enable kv-block index metrics (prometheus) - metricsLoggingInterval: 60000000000 # log kv-block metrics as well (1m in nanoseconds) - - type: kv-cache-scorer # kv-cache-utilization - - type: queue-scorer - - type: max-score-picker - schedulingProfiles: - - name: default - plugins: - - pluginRef: decode-filter - - pluginRef: prefix-cache-scorer - weight: 3.0 - - pluginRef: kv-cache-scorer - weight: 1.0 - - pluginRef: queue-scorer - weight: 1.0 - - pluginRef: max-score-picker - prefix-estimate-config.yaml: | - 
apiVersion: inference.networking.x-k8s.io/v1alpha1 - kind: EndpointPickerConfig - plugins: - - type: single-profile-handler - - type: decode-filter - - type: prefix-cache-scorer - - type: load-aware-scorer - - type: max-score-picker - schedulingProfiles: - - name: default - plugins: - - pluginRef: decode-filter - - pluginRef: prefix-cache-scorer - weight: 2.0 - - pluginRef: load-aware-scorer - weight: 1.0 - - pluginRef: max-score-picker - default-pd-config.yaml: | - apiVersion: inference.networking.x-k8s.io/v1alpha1 - kind: EndpointPickerConfig - plugins: - - type: prefill-header-handler - - type: prefix-cache-scorer - parameters: - hashBlockSize: 5 - maxPrefixBlocksToMatch: 256 - lruCapacityPerServer: 31250 - - type: prefill-filter - - type: decode-filter - - type: max-score-picker - - type: pd-profile-handler - parameters: - threshold: 10 - hashBlockSize: 5 - schedulingProfiles: - - name: prefill - plugins: - - pluginRef: prefill-filter - - pluginRef: max-score-picker - - pluginRef: prefix-cache-scorer - weight: 50 - - name: decode - plugins: - - pluginRef: decode-filter - - pluginRef: max-score-picker - - pluginRef: prefix-cache-scorer - weight: 50 - {{- if (hasKey .Values.routing.epp "pluginsCustomConfig") }} - {{- .Values.routing.epp.pluginsCustomConfig | toYaml | nindent 2 }} - {{- end }} -{{- end}} diff --git a/charts/llm-d-modelservice/templates/epp-role.yaml b/charts/llm-d-modelservice/templates/epp-role.yaml deleted file mode 100644 index a056cbf7..00000000 --- a/charts/llm-d-modelservice/templates/epp-role.yaml +++ /dev/null @@ -1,44 +0,0 @@ -{{- if .Values.routing.epp.create -}} -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: {{ include "llm-d-modelservice.eppRoleName" . 
}} -rules: -- apiGroups: - - inference.networking.x-k8s.io - resources: - - inferencemodels - - inferencepools - verbs: - - get - - watch - - list -- apiGroups: - - "" - resources: - - pods - verbs: - - get - - watch - - list -- apiGroups: - - discovery.k8s.io - resources: - - endpointslices - verbs: - - get - - watch - - list -- apiGroups: - - authentication.k8s.io - resources: - - tokenreviews - verbs: - - create -- apiGroups: - - authorization.k8s.io - resources: - - subjectaccessreviews - verbs: - - create -{{- end }} diff --git a/charts/llm-d-modelservice/templates/epp-rolebinding.yaml b/charts/llm-d-modelservice/templates/epp-rolebinding.yaml deleted file mode 100644 index 65001a02..00000000 --- a/charts/llm-d-modelservice/templates/epp-rolebinding.yaml +++ /dev/null @@ -1,17 +0,0 @@ -{{- if .Values.routing.epp.create -}} -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: {{ include "llm-d-modelservice.eppRoleBindingName" . }} -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - {{- if .Values.routing.epp.permissions }} - name: {{ .Values.routing.epp.permissions }} - {{- else }} - name: {{ include "llm-d-modelservice.eppRoleName" . }} - {{- end }} -subjects: -- kind: ServiceAccount - name: {{ include "llm-d-modelservice.eppServiceAccountName" . }} -{{- end }} diff --git a/charts/llm-d-modelservice/templates/epp-sa.yaml b/charts/llm-d-modelservice/templates/epp-sa.yaml deleted file mode 100644 index 53c4f18c..00000000 --- a/charts/llm-d-modelservice/templates/epp-sa.yaml +++ /dev/null @@ -1,8 +0,0 @@ -{{- if .Values.routing.epp.create -}} -apiVersion: v1 -kind: ServiceAccount -metadata: - name: {{ include "llm-d-modelservice.eppServiceAccountName" . }} - labels: - {{- include "llm-d-modelservice.labels" . 
| nindent 4 }} -{{- end }} diff --git a/charts/llm-d-modelservice/templates/epp-service.yaml b/charts/llm-d-modelservice/templates/epp-service.yaml deleted file mode 100644 index 95cb85f1..00000000 --- a/charts/llm-d-modelservice/templates/epp-service.yaml +++ /dev/null @@ -1,21 +0,0 @@ -{{- if .Values.routing.epp.create -}} -apiVersion: v1 -kind: Service -metadata: - name: {{ include "llm-d-modelservice.eppServiceName" . }} - labels: - {{- include "llm-d-modelservice.labels" . | nindent 4 }} -spec: - type: {{ .Values.routing.epp.service.type }} - ports: - - name: grpc-ext-proc - port: {{ .Values.routing.epp.service.port }} - targetPort: {{ .Values.routing.epp.service.targetPort }} - protocol: TCP - appProtocol: {{ .Values.routing.epp.service.appProtocol }} - {{- with .Values.routing.epp.service.extraPorts }} - {{- toYaml . | nindent 4 }} - {{- end }} - selector: - llm-d.ai/epp: {{ include "llm-d-modelservice.eppName" . }} -{{- end }} diff --git a/charts/llm-d-modelservice/templates/httproute.yaml b/charts/llm-d-modelservice/templates/httproute.yaml deleted file mode 100644 index 9824bd6d..00000000 --- a/charts/llm-d-modelservice/templates/httproute.yaml +++ /dev/null @@ -1,51 +0,0 @@ - -{{- if .Values.routing.httpRoute.create }} -apiVersion: gateway.networking.k8s.io/v1 -kind: HTTPRoute -metadata: - name: {{ include "llm-d-modelservice.httpRouteName" . }} - namespace: {{ .Release.Namespace }} - labels: - {{- include "llm-d-modelservice.labels" . | nindent 4 }} - annotations: - "helm.sh/hook": post-install,post-upgrade -spec: - {{- with .Values.routing.parentRefs }} - parentRefs: - {{- include "common.tplvalues.render" (dict "value" . "context" $) | nindent 2 }} - {{- end }} - {{- if .Values.routing.httpRoute.rules }} - rules: - {{- with .Values.routing.httpRoute.rules }} - {{- toYaml . 
| nindent 2 }} - {{- end }} - {{- else }} - rules: - - backendRefs: - - group: {{ .Values.routing.inferencePool.apiGroup }} - kind: InferencePool - name: {{ include "llm-d-modelservice.inferencePoolName" . }} - port: {{ .Values.routing.servicePort }} - weight: 1 - {{- if .Values.routing.httpRoute.timeouts}} - timeouts: - {{- if .Values.routing.httpRoute.timeouts.backendRequest }} - backendRequest: {{ .Values.routing.httpRoute.timeouts.backendRequest }} - {{- end }} - {{- if .Values.routing.httpRoute.timeouts.request }} - request: {{ .Values.routing.httpRoute.timeouts.request }} - {{- end }} - {{- end }} - {{- if .Values.routing.httpRoute.matches }} - matches: - {{- with .Values.routing.httpRoute.matches }} - {{- include "common.tplvalues.render" (dict "value" (include "common.tplvalues.render" (dict "value" . "context" $)) "context" $) | nindent 6 }} - {{- end }} - {{- else }} - matches: - - path: - type: PathPrefix - value: / - {{- end -}} - {{- end }} -{{- end }} diff --git a/charts/llm-d-modelservice/templates/inferencemodel.yaml b/charts/llm-d-modelservice/templates/inferencemodel.yaml deleted file mode 100644 index cd30ff18..00000000 --- a/charts/llm-d-modelservice/templates/inferencemodel.yaml +++ /dev/null @@ -1,15 +0,0 @@ -{{- if .Values.routing.inferenceModel.create }} -apiVersion: inference.networking.x-k8s.io/v1alpha2 -kind: InferenceModel -metadata: - labels: - {{- include "llm-d-modelservice.pdlabels" . | nindent 4 }} - name: {{ include "llm-d-modelservice.inferenceModelName" . }} -spec: - {{- if .Values.routing.inferenceModel.criticality }} - criticality: {{ .Values.routing.inferenceModel.criticality }} - {{- end}} - modelName: {{ .Values.modelArtifacts.name }} - poolRef: - name: {{ include "llm-d-modelservice.inferencePoolName" . 
}} -{{- end }} diff --git a/charts/llm-d-modelservice/templates/inferencepool.yaml b/charts/llm-d-modelservice/templates/inferencepool.yaml deleted file mode 100644 index 84906be1..00000000 --- a/charts/llm-d-modelservice/templates/inferencepool.yaml +++ /dev/null @@ -1,20 +0,0 @@ -{{- if .Values.routing.inferencePool.create }} -apiVersion: inference.networking.x-k8s.io/v1alpha2 -kind: InferencePool -metadata: - name: {{ include "llm-d-modelservice.inferencePoolName" . }} - namespace: {{ .Release.Namespace }} -spec: - extensionRef: - failureMode: FailClose - group: "" - kind: Service - {{- if .Values.routing.inferencePool.extensionRef }} - name: {{ .Values.routing.inferencePool.extensionRef }} - {{- else }} - name: {{ include "llm-d-modelservice.eppServiceName" . }} - {{- end }} - selector: - {{- include "llm-d-modelservice.pdlabels" . | nindent 4 }} - targetPortNumber: {{ .Values.routing.servicePort }} -{{- end }} diff --git a/charts/llm-d-modelservice/values.schema.json b/charts/llm-d-modelservice/values.schema.json index 49183535..33a5a3c2 100644 --- a/charts/llm-d-modelservice/values.schema.json +++ b/charts/llm-d-modelservice/values.schema.json @@ -324,6 +324,26 @@ "title": "authSecretName", "type": "string" }, + "labels": { + "additionalProperties": false, + "properties": { + "llm-d.ai/inferenceServing": { + "default": "true", + "required": [], + "title": "llm-d.ai/inferenceServing", + "type": "string" + }, + "llm-d.ai/model": { + "default": "random_model", + "required": [], + "title": "llm-d.ai/model", + "type": "string" + } + }, + "required": [], + "title": "labels", + "type": "object" + }, "mountPath": { "default": "/model-cache", "description": "location where model volume will be mounted (used when mountModelVolume: true)", @@ -513,400 +533,6 @@ "additionalProperties": false, "description": " also describes elements for Gateway API Inference Extension configuration", "properties": { - "epp": { - "additionalProperties": true, - "description": " 
additionalProperties: true @schema Configuration of EPP (endpoint picker) cf. https://github.com/llm-d/llm-d-inference-scheduler", - "properties": { - "create": { - "default": true, - "required": [], - "title": "create", - "type": "boolean" - }, - "debugLevel": { - "default": 4, - "required": [], - "title": "debugLevel", - "type": "integer" - }, - "disableLivenessProbe": { - "default": false, - "required": [], - "title": "disableLivenessProbe", - "type": "boolean" - }, - "disableReadinessProbe": { - "default": false, - "required": [], - "title": "disableReadinessProbe", - "type": "boolean" - }, - "env": { - "description": " pluginsCustomConfig: custom-plugins.yaml: | apiVersion: inference.networking.x-k8s.io/v1alpha1 kind: EndpointPickerConfig plugins: - type: custom-scorer parameters: custom-threshold: 64 - type: max-score-picker - type: single-profile-handler schedulingProfiles: - name: default plugins: - pluginRef: custom-scorer weight: 1 - pluginRef: max-score-picker weight: 1 @schema items: $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.api.core.v1.EnvVar @schema", - "items": { - "description": "EnvVar represents an environment variable present in a Container.", - "properties": { - "name": { - "description": "Name of the environment variable. May consist of any printable ASCII characters except '='.", - "type": "string" - }, - "value": { - "description": "Variable references $(VAR_NAME) are expanded using the previously defined environment variables in the container and any service environment variables. If a variable cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. \"$$(VAR_NAME)\" will produce the string literal \"$(VAR_NAME)\". Escaped references will never be expanded, regardless of whether the variable exists or not. 
Defaults to \"\".", - "type": "string" - }, - "valueFrom": { - "description": "EnvVarSource represents a source for the value of an EnvVar.", - "properties": { - "configMapKeyRef": { - "description": "Selects a key from a ConfigMap.", - "properties": { - "key": { - "description": "The key to select.", - "type": "string" - }, - "name": { - "description": "Name of the referent. This field is effectively required, but due to backwards compatibility is allowed to be empty. Instances of this type with an empty value here are almost certainly wrong. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names", - "type": "string" - }, - "optional": { - "description": "Specify whether the ConfigMap or its key must be defined", - "type": "boolean" - } - }, - "required": [ - "key" - ], - "type": "object", - "x-kubernetes-map-type": "atomic" - }, - "fieldRef": { - "description": "ObjectFieldSelector selects an APIVersioned field of an object.", - "properties": { - "apiVersion": { - "description": "Version of the schema the FieldPath is written in terms of, defaults to \"v1\".", - "type": "string" - }, - "fieldPath": { - "description": "Path of the field to select in the specified API version.", - "type": "string" - } - }, - "required": [ - "fieldPath" - ], - "type": "object", - "x-kubernetes-map-type": "atomic" - }, - "fileKeyRef": { - "description": "FileKeySelector selects a key of the env file.", - "properties": { - "key": { - "description": "The key within the env file. An invalid key will prevent the pod from starting. The keys defined within a source may consist of any printable ASCII characters except '='. During Alpha stage of the EnvFiles feature gate, the key size is limited to 128 characters.", - "type": "string" - }, - "optional": { - "description": "Specify whether the file or its key must be defined. If the file or key does not exist, then the env var is not published. 
If optional is set to true and the specified key does not exist, the environment variable will not be set in the Pod's containers.\n\nIf optional is set to false and the specified key does not exist, an error will be returned during Pod creation.", - "type": "boolean" - }, - "path": { - "description": "The path within the volume from which to select the file. Must be relative and may not contain the '..' path or start with '..'.", - "type": "string" - }, - "volumeName": { - "description": "The name of the volume mount containing the env file.", - "type": "string" - } - }, - "required": [ - "volumeName", - "path", - "key" - ], - "type": "object", - "x-kubernetes-map-type": "atomic" - }, - "resourceFieldRef": { - "description": "ResourceFieldSelector represents container resources (cpu, memory) and their output format", - "properties": { - "containerName": { - "description": "Container name: required for volumes, optional for env vars", - "type": "string" - }, - "divisor": { - "oneOf": [ - { - "type": "string" - }, - { - "type": "number" - } - ] - }, - "resource": { - "description": "Required: resource to select", - "type": "string" - } - }, - "required": [ - "resource" - ], - "type": "object", - "x-kubernetes-map-type": "atomic" - }, - "secretKeyRef": { - "description": "SecretKeySelector selects a key of a Secret.", - "properties": { - "key": { - "description": "The key of the secret to select from. Must be a valid secret key.", - "type": "string" - }, - "name": { - "description": "Name of the referent. This field is effectively required, but due to backwards compatibility is allowed to be empty. Instances of this type with an empty value here are almost certainly wrong. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names", - "type": "string" - }, - "optional": { - "description": "Specify whether the Secret or its key must be defined", - "type": "boolean" - } - }, - "required": [ - "key" - ], - "type": "object", - "x-kubernetes-map-type": "atomic" - } - }, - "type": "object" - } - }, - "required": [ - "name" - ], - "type": "object" - }, - "required": [], - "title": "env" - }, - "image": { - "default": "ghcr.io/llm-d/llm-d-inference-scheduler:v0.2.1", - "required": [], - "title": "image", - "type": "string" - }, - "pluginsConfigFile": { - "default": "default-config.yaml", - "description": " prefix-estimate-config.yaml, default-pd-config.yaml, or you may define a custom config below and select it with the pluginsConfigFile field.", - "required": [], - "title": "pluginsConfigFile" - }, - "replicas": { - "default": 1, - "required": [], - "title": "replicas", - "type": "integer" - }, - "service": { - "additionalProperties": false, - "properties": { - "appProtocol": { - "default": "http2", - "required": [], - "title": "appProtocol", - "type": "string" - }, - "port": { - "default": 9002, - "required": [], - "title": "port", - "type": "integer" - }, - "targetPort": { - "default": 9002, - "required": [], - "title": "targetPort", - "type": "integer" - }, - "type": { - "default": "ClusterIP", - "required": [], - "title": "type", - "type": "string" - } - }, - "required": [], - "title": "service", - "type": "object" - } - }, - "required": [], - "title": "epp" - }, - "httpRoute": { - "additionalProperties": true, - "description": " additionalProperties: true @schema Configuration of HTTPRoute (mapping of requests through gateway to InferencePool) cf. 
https://gateway-api.sigs.k8s.io/api-types/httproute/", - "properties": { - "create": { - "default": true, - "required": [], - "title": "create", - "type": "boolean" - }, - "matches": { - "description": " rules: - backendRefs: - group: inference.networking.x-k8s.io kind: InferencePool name: inference-pool-name port: 8000 weight: 1 matches: - path: type: PathPrefix value: / when specifiying matches and not rules, it will use the default backendRef block but overwrite just the matches section of a single rule", - "items": { - "anyOf": [ - { - "additionalProperties": false, - "properties": { - "headers": { - "description": " @schema items: type: object @schema", - "items": { - "required": [], - "type": "object" - }, - "required": [], - "title": "headers" - }, - "path": { - "additionalProperties": false, - "properties": { - "type": { - "default": "PathPrefix", - "required": [], - "title": "type", - "type": "string" - }, - "value": { - "default": "/", - "required": [], - "title": "value", - "type": "string" - } - }, - "required": [], - "title": "path", - "type": "object" - } - }, - "required": [], - "type": "object" - } - ], - "required": [] - }, - "required": [], - "title": "matches" - }, - "timeouts": { - "additionalProperties": false, - "description": " They are set to 0s which in most situations defaults to the providers max timeout.", - "properties": { - "backendRequest": { - "default": "0s", - "required": [], - "title": "backendRequest", - "type": "string" - }, - "request": { - "default": "0s", - "required": [], - "title": "request", - "type": "string" - } - }, - "required": [], - "title": "timeouts" - } - }, - "required": [], - "title": "httpRoute" - }, - "inferenceModel": { - "additionalProperties": true, - "description": " additionalProperties: true @schema", - "properties": { - "create": { - "default": false, - "required": [], - "title": "create", - "type": "boolean" - }, - "criticality": { - "default": "Critical", - "description": "Criticality options: 
[\"Critical\", \"Standard\", \"Sheddable\"], see: https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml#L70-L84", - "required": [], - "title": "criticality", - "type": "string" - } - }, - "required": [], - "title": "inferenceModel" - }, - "inferencePool": { - "additionalProperties": true, - "description": " additionalProperties: true @schema Configuration of InferencePool cf. https://gateway-api-inference-extension.sigs.k8s.io/reference/spec/#inferencepool @schema additionalProperties: true @schema", - "properties": { - "apiGroup": { - "default": "inference.networking.x-k8s.io", - "description": " @schema default: \"inference.networking.x-k8s.io\" enum: [\"inference.networking.x-k8s.io\",\"inference.networking.k8s.io\"] @schema", - "enum": [ - "inference.networking.x-k8s.io", - "inference.networking.k8s.io" - ], - "required": [], - "title": "apiGroup" - }, - "create": { - "default": true, - "required": [], - "title": "create", - "type": "boolean" - } - }, - "required": [], - "title": "inferencePool" - }, - "parentRefs": { - "description": " cf. https://gateway-api.sigs.k8s.io/api-types/gateway/", - "items": { - "anyOf": [ - { - "additionalProperties": false, - "properties": { - "group": { - "default": "gateway.networking.k8s.io", - "required": [], - "title": "group", - "type": "string" - }, - "kind": { - "default": "Gateway", - "required": [], - "title": "kind", - "type": "string" - }, - "name": { - "default": "inference-gateway", - "required": [], - "title": "name", - "type": "string" - }, - "namespace": { - "default": "{{ .Release.Namespace }}", - "required": [], - "title": "namespace", - "type": "string" - } - }, - "required": [], - "type": "object" - } - ], - "required": [] - }, - "required": [], - "title": "parentRefs" - }, "proxy": { "additionalProperties": true, "description": " additionalProperties: true @schema Configuration of VLLM routing sidecar cf. 
https://github.com/llm-d/llm-d-routing-sidecar/", diff --git a/charts/llm-d-modelservice/values.schema.tmpl.json b/charts/llm-d-modelservice/values.schema.tmpl.json index 6d801016..527c9286 100644 --- a/charts/llm-d-modelservice/values.schema.tmpl.json +++ b/charts/llm-d-modelservice/values.schema.tmpl.json @@ -324,6 +324,26 @@ "title": "authSecretName", "type": "string" }, + "labels": { + "additionalProperties": false, + "properties": { + "llm-d.ai/inferenceServing": { + "default": "true", + "required": [], + "title": "llm-d.ai/inferenceServing", + "type": "string" + }, + "llm-d.ai/model": { + "default": "random_model", + "required": [], + "title": "llm-d.ai/model", + "type": "string" + } + }, + "required": [], + "title": "labels", + "type": "object" + }, "mountPath": { "default": "/model-cache", "description": "location where model volume will be mounted (used when mountModelVolume: true)", @@ -513,263 +533,6 @@ "additionalProperties": false, "description": " also describes elements for Gateway API Inference Extension configuration", "properties": { - "epp": { - "additionalProperties": true, - "description": " additionalProperties: true @schema Configuration of EPP (endpoint picker) cf. 
https://github.com/llm-d/llm-d-inference-scheduler", - "properties": { - "create": { - "default": true, - "required": [], - "title": "create", - "type": "boolean" - }, - "debugLevel": { - "default": 4, - "required": [], - "title": "debugLevel", - "type": "integer" - }, - "disableLivenessProbe": { - "default": false, - "required": [], - "title": "disableLivenessProbe", - "type": "boolean" - }, - "disableReadinessProbe": { - "default": false, - "required": [], - "title": "disableReadinessProbe", - "type": "boolean" - }, - "env": { - "description": " pluginsCustomConfig: custom-plugins.yaml: | apiVersion: inference.networking.x-k8s.io/v1alpha1 kind: EndpointPickerConfig plugins: - type: custom-scorer parameters: custom-threshold: 64 - type: max-score-picker - type: single-profile-handler schedulingProfiles: - name: default plugins: - pluginRef: custom-scorer weight: 1 - pluginRef: max-score-picker weight: 1 @schema items: $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.api.core.v1.EnvVar @schema", - "items": { - "$ref": "https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.api.core.v1.EnvVar", - "required": [] - }, - "required": [], - "title": "env" - }, - "image": { - "default": "ghcr.io/llm-d/llm-d-inference-scheduler:v0.2.1", - "required": [], - "title": "image", - "type": "string" - }, - "pluginsConfigFile": { - "default": "default-config.yaml", - "description": " prefix-estimate-config.yaml, default-pd-config.yaml, or you may define a custom config below and select it with the pluginsConfigFile field.", - "required": [], - "title": "pluginsConfigFile" - }, - "replicas": { - "default": 1, - "required": [], - "title": "replicas", - "type": "integer" - }, - "service": { - "additionalProperties": false, - "properties": { - "appProtocol": { - "default": "http2", - "required": [], - "title": "appProtocol", - "type": "string" - }, - 
"port": { - "default": 9002, - "required": [], - "title": "port", - "type": "integer" - }, - "targetPort": { - "default": 9002, - "required": [], - "title": "targetPort", - "type": "integer" - }, - "type": { - "default": "ClusterIP", - "required": [], - "title": "type", - "type": "string" - } - }, - "required": [], - "title": "service", - "type": "object" - } - }, - "required": [], - "title": "epp" - }, - "httpRoute": { - "additionalProperties": true, - "description": " additionalProperties: true @schema Configuration of HTTPRoute (mapping of requests through gateway to InferencePool) cf. https://gateway-api.sigs.k8s.io/api-types/httproute/", - "properties": { - "create": { - "default": true, - "required": [], - "title": "create", - "type": "boolean" - }, - "matches": { - "description": " rules: - backendRefs: - group: inference.networking.x-k8s.io kind: InferencePool name: inference-pool-name port: 8000 weight: 1 matches: - path: type: PathPrefix value: / when specifiying matches and not rules, it will use the default backendRef block but overwrite just the matches section of a single rule", - "items": { - "anyOf": [ - { - "additionalProperties": false, - "properties": { - "headers": { - "description": " @schema items: type: object @schema", - "items": { - "required": [], - "type": "object" - }, - "required": [], - "title": "headers" - }, - "path": { - "additionalProperties": false, - "properties": { - "type": { - "default": "PathPrefix", - "required": [], - "title": "type", - "type": "string" - }, - "value": { - "default": "/", - "required": [], - "title": "value", - "type": "string" - } - }, - "required": [], - "title": "path", - "type": "object" - } - }, - "required": [], - "type": "object" - } - ], - "required": [] - }, - "required": [], - "title": "matches" - }, - "timeouts": { - "additionalProperties": false, - "description": " They are set to 0s which in most situations defaults to the providers max timeout.", - "properties": { - "backendRequest": { - 
"default": "0s", - "required": [], - "title": "backendRequest", - "type": "string" - }, - "request": { - "default": "0s", - "required": [], - "title": "request", - "type": "string" - } - }, - "required": [], - "title": "timeouts" - } - }, - "required": [], - "title": "httpRoute" - }, - "inferenceModel": { - "additionalProperties": true, - "description": " additionalProperties: true @schema", - "properties": { - "create": { - "default": false, - "required": [], - "title": "create", - "type": "boolean" - }, - "criticality": { - "default": "Critical", - "description": "Criticality options: [\"Critical\", \"Standard\", \"Sheddable\"], see: https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml#L70-L84", - "required": [], - "title": "criticality", - "type": "string" - } - }, - "required": [], - "title": "inferenceModel" - }, - "inferencePool": { - "additionalProperties": true, - "description": " additionalProperties: true @schema Configuration of InferencePool cf. https://gateway-api-inference-extension.sigs.k8s.io/reference/spec/#inferencepool @schema additionalProperties: true @schema", - "properties": { - "apiGroup": { - "default": "inference.networking.x-k8s.io", - "description": " @schema default: \"inference.networking.x-k8s.io\" enum: [\"inference.networking.x-k8s.io\",\"inference.networking.k8s.io\"] @schema", - "enum": [ - "inference.networking.x-k8s.io", - "inference.networking.k8s.io" - ], - "required": [], - "title": "apiGroup" - }, - "create": { - "default": true, - "required": [], - "title": "create", - "type": "boolean" - } - }, - "required": [], - "title": "inferencePool" - }, - "parentRefs": { - "description": " cf. 
https://gateway-api.sigs.k8s.io/api-types/gateway/", - "items": { - "anyOf": [ - { - "additionalProperties": false, - "properties": { - "group": { - "default": "gateway.networking.k8s.io", - "required": [], - "title": "group", - "type": "string" - }, - "kind": { - "default": "Gateway", - "required": [], - "title": "kind", - "type": "string" - }, - "name": { - "default": "inference-gateway", - "required": [], - "title": "name", - "type": "string" - }, - "namespace": { - "default": "{{ .Release.Namespace }}", - "required": [], - "title": "namespace", - "type": "string" - } - }, - "required": [], - "type": "object" - } - ], - "required": [] - }, - "required": [], - "title": "parentRefs" - }, "proxy": { "additionalProperties": true, "description": " additionalProperties: true @schema Configuration of VLLM routing sidecar cf. https://github.com/llm-d/llm-d-routing-sidecar/", diff --git a/charts/llm-d-modelservice/values.yaml b/charts/llm-d-modelservice/values.yaml index 321fe909..1570e27c 100644 --- a/charts/llm-d-modelservice/values.yaml +++ b/charts/llm-d-modelservice/values.yaml @@ -20,6 +20,10 @@ modelArtifacts: # name is the value of the model parameter in OpenAI requests # Required name: random/model + labels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: random_model + # model URI. One of: # hf://model/name - model as defined on Hugging Face # pvc://pvc_name/path/to/model - model on existing persistant storage volume @@ -92,142 +96,6 @@ routing: # Overwrite the verbosity of logging in the sidecar (defaults to 5) # debugLevel: 5 - # Reference to parent gateway - # cf. https://gateway-api.sigs.k8s.io/api-types/gateway/ - parentRefs: - - group: gateway.networking.k8s.io - kind: Gateway - name: inference-gateway - namespace: "{{ .Release.Namespace }}" - - # @schema - # additionalProperties: true - # @schema - # Configuration of InferencePool - # cf. 
https://gateway-api-inference-extension.sigs.k8s.io/reference/spec/#inferencepool - - - # @schema - # additionalProperties: true - # @schema - inferencePool: - create: true - # which API group to use for the inference pool, options: ["inference.networking.x-k8s.io", "inference.networking.k8s.io"]. Defaults to: "inference.networking.x-k8s.io" until gateway provider support for new API - # @schema - # default: "inference.networking.x-k8s.io" - # enum: ["inference.networking.x-k8s.io","inference.networking.k8s.io"] - # @schema - apiGroup: "inference.networking.x-k8s.io" - # name: OVERRIDE_NAME - # to use a different epp service than the one created when routing.epp.create: true - # extensionRef: - - # @schema - # additionalProperties: true - # @schema - inferenceModel: - create: false - # Criticality options: ["Critical", "Standard", "Sheddable"], see: https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml#L70-L84 - criticality: Critical - # Name override for the inference - # name: "some-override" - - # @schema - # additionalProperties: true - # @schema - # Configuration of HTTPRoute (mapping of requests through gateway to InferencePool) - # cf. 
https://gateway-api.sigs.k8s.io/api-types/httproute/ - httpRoute: - create: true - # when specifiying rules it will overwrite the entire rules block (matches included) - # rules: - # - backendRefs: - # - group: inference.networking.x-k8s.io - # kind: InferencePool - # name: inference-pool-name - # port: 8000 - # weight: 1 - # matches: - # - path: - # type: PathPrefix - # value: / - # when specifiying matches and not rules, it will use the default backendRef block but overwrite just the matches section of a single rule - matches: - - path: - type: PathPrefix - value: / - # example over-riding matches - # @schema - # items: - # type: object - # @schema - headers: [] - # - name: x-model-name - # type: Exact - # value: facebook/opt-125m - - # Allow people to opt out of timeouts by unsetting the default value. - # They are set to 0s which in most situations defaults to the providers max timeout. - timeouts: - backendRequest: 0s - request: 0s - - # @schema - # additionalProperties: true - # @schema - # Configuration of EPP (endpoint picker) - # cf. https://github.com/llm-d/llm-d-inference-scheduler - epp: - create: true - service: - type: ClusterIP - port: 9002 - targetPort: 9002 - appProtocol: http2 - image: ghcr.io/llm-d/llm-d-inference-scheduler:v0.2.1 - replicas: 1 - debugLevel: 4 - disableReadinessProbe: false - disableLivenessProbe: false - # To override the name of the inferencepool - # inferencePool: - # -- Default environment variables for endpoint picker, use `defaultEnvVarsOverride` to override default behavior by defining the same variable again. - # Ref: https://github.com/llm-d/llm-d-inference-scheduler/blob/main/docs/architecture.md#scorers--configuration - - # The name of the plugin file to use. Some default files are provided to you: default-config.yaml, prefix-cache-tracking-config.yaml, - # prefix-estimate-config.yaml, default-pd-config.yaml, or you may define a custom config below and select it with the pluginsConfigFile field. 
- pluginsConfigFile: "default-config.yaml" - - # Adding a custom plugin config via the pluginsCustomConfig field. Inside there should be an entry to a confimap of file containing the `EndpointPickerConfig` - # pluginsCustomConfig: - # custom-plugins.yaml: | - # apiVersion: inference.networking.x-k8s.io/v1alpha1 - # kind: EndpointPickerConfig - # plugins: - # - type: custom-scorer - # parameters: - # custom-threshold: 64 - # - type: max-score-picker - # - type: single-profile-handler - # schedulingProfiles: - # - name: default - # plugins: - # - pluginRef: custom-scorer - # weight: 1 - # - pluginRef: max-score-picker - # weight: 1 - - - # @schema - # items: - # $ref: https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/master/_definitions.json#/definitions/io.k8s.api.core.v1.EnvVar - # @schema - env: [] - # Include any Environment variables to epp here, or configure scorers for a legacy epp image, ex: - # - name: ENABLE_KVCACHE_AWARE_SCORER - # value: "false" - - # @schema # additionalProperties: true # @schema diff --git a/examples/README.md b/examples/README.md index a27fc5de..da125695 100644 --- a/examples/README.md +++ b/examples/README.md @@ -21,6 +21,8 @@ Note: `alias k=kubectl` | [`values-xpu.yaml`](#5-intel-xpu-examples) | Intel XPU single-node example | Intel Data Center GPU Max | | [`pvc/`](#4-loading-a-model-from-a-pvc) | Persistent volume examples | Shows different storage options | +All the examples assume a `Gateway` and GAIE configuration have been deployed. See the [llm-d guides](https://github.com/llm-d/llm-d/tree/main/guides) for examples. Further, an `HTTPRoute` must be deployed. Some examples of `HTTPRoute` are provided [below](https://github.com/llm-d-incubation/llm-d-modelservice/blob/main/examples/README.md#httproute-examples). + ## Usage Examples ### 1. 
CPU-only @@ -39,30 +41,6 @@ To install, use `helm install` instead of `helm template`: helm install cpu-sim llm-d-modelservice/llm-d-modelservice -f https://raw.githubusercontent.com/llm-d-incubation/llm-d-modelservice/refs/heads/main/examples/values-cpu.yaml ``` -Port forward the inference gateway service. - -``` -k port-forward svc/llm-d-inference-gateway-istio 8000:80 -``` - -Send a request. - -``` -curl http://localhost:8000/v1/completions -vvv \ - -H "Content-Type: application/json" \ - -H "x-model-name: random/model" \ - -d '{ - "model": "random/model", - "prompt": "Hello, " -}' -``` - -Expect to see a response like the following. - -``` -{"id":"chatcmpl-05cfe79c-234d-4898-b781-3fa59ba7be49","created":1750969231,"model":"random","choices":[{"index":0,"finish_reason":"stop","text":"Alas, poor Yorick! I knew him, Horatio: A fellow of infinite jest"}]} -``` - ### 2. P/D disaggregation Dry-run: @@ -77,39 +55,11 @@ or install in a cluster helm install pd llm-d-modelservice/llm-d-modelservice -f https://raw.githubusercontent.com/llm-d-incubation/llm-d-modelservice/refs/heads/main/examples/values-pd.yaml ``` -Port forward the inference gateway service. - -``` -k port-forward svc/llm-d-inference-gateway-istio 8000:80 -``` - -Send a request, - -``` -curl http://localhost:8000/v1/completions -vvv \ - -H "Content-Type: application/json" \ - -H "x-model-name: facebook/opt-125m" \ - -d '{ - "model": "facebook/opt-125m", - "prompt": "Hello, " -}' -``` - -and expect the following response - -``` -{"choices":[{"finish_reason":"length","index":0,"logprobs":null,"prompt_logprobs":null,"stop_reason":null,"text":" That is my dad. He was a wautdig with a shooting blade on"}],"created":1751031325,"id":"cmpl-aca48bc2-fe95-4c3b-843d-1dbcf94c40c7","kv_transfer_params":null,"model":"facebook/opt-125m","object":"text_completion","usage":{"completion_tokens":16,"prompt_tokens":4,"prompt_tokens_details":null,"total_tokens":20}} -``` - -### 3. 
Wide Expert Parallelism (EP/DP) with LeaderWorkerSet - -See https://github.com/llm-d/llm-d/blob/main/guides/wide-ep-lws/README.md - -### 4. Loading a model from a PVC +### 3. Loading a model from a PVC See [this README](./pvc/README.md). -### 5. Intel XPU Examples +### 4. Intel XPU Examples For Intel XPU (Data Center GPU Max) deployments: @@ -132,32 +82,68 @@ Get the name of decode pod. kubectl get pods -n llm-d -l llm-d.ai/role=decode ``` -Port forward the decode pod. - -``` -kubectl port-forward -n llm-d pod/$decode_pod_name 8080:8200 & -``` - -Send a request, - -``` -curl -X POST "http://localhost:8080/v1/chat/completions" \ - -H "Content-Type: application/json" \ - -d '{ - "model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", - "messages": [ - {"role": "user", "content": "Hello!"} - ], - "max_tokens": 50, - "temperature": 0.7 - }' -``` - -and expect the following response - -``` -{"id":"chatcmpl-ebda7f789d434895afec746173e2a4ce","object":"chat.completion","created":1755679402,"model":"deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B","choices":[{"index":0,"message":{"role":"assistant","content":"Alright, the user said \"Hello!\" and I replied \"Hello! How can I assist you today?\" That's a friendly way to start, let them know I'm here to help.\n\nI should ask them how they're doing or what they need","refusal":null,"annotations":null,"audio":null,"function_call":null,"tool_calls":[],"reasoning_content":null},"logprobs":null,"finish_reason":"length","stop_reason":null}],"service_tier":null,"system_fingerprint":null,"usage":{"prompt_tokens":7,"total_tokens":57,"completion_tokens":50,"prompt_tokens_details":null},"prompt_logprobs":null,"kv_transfer_params":null}(base) -``` +## HTTPRoute Examples + +An `HTTPRoute` maps requests through a `Gateway` to an `InferencePool` which is, in turn, tied (via match labels) to a particular set of model servers. Here are two examples. 
+ +#### Example: Route all requests to the same model + +```yaml +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: mymodel-httproute +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: mygateway + rules: + - backendRefs: + - group: inference.networking.x-k8s.io + kind: InferencePool + name: inferencepool-for-mymodel + port: 8000 + weight: 1 + matches: + - path: + type: PathPrefix + value: / +``` + +For example, to call the completions API, use `/v1/completions` + +#### Example: Route requests with modified path + +```yaml +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: myhttproute +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: mygateway + rules: + - backendRefs: + - group: inference.networking.x-k8s.io + kind: InferencePool + name: inferencepool-for-mymodel + port: 8000 + weight: 1 + filters: + - type: URLRewrite + urlRewrite: + path: + replacePrefixMatch: / + type: ReplacePrefixMatch + matches: + - path: + type: PathPrefix + value: /mymodel/ +``` +This route supports requests with prefix `/mymodel/`; for example, to call the completions API, use `/mymodel/v1/completions` ## Troubleshooting: diff --git a/examples/output-cpu.yaml b/examples/output-cpu.yaml index 5addc085..35eacd06 100644 --- a/examples/output-cpu.yaml +++ b/examples/output-cpu.yaml @@ -1,15 +1,5 @@ # generated by generate-example-output.sh --- -# Source: llm-d-modelservice/templates/epp-sa.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: cpu-sim-llm-d-modelservice-epp - labels: - helm.sh/chart: llm-d-modelservice-v0.2.15 - app.kubernetes.io/version: "v0.2.0" - app.kubernetes.io/managed-by: Helm ---- # Source: llm-d-modelservice/templates/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount @@ -20,188 +10,6 @@ metadata: app.kubernetes.io/version: "v0.2.0" app.kubernetes.io/managed-by: Helm --- -# Source: llm-d-modelservice/templates/epp-plugin-configmap.yaml 
-apiVersion: v1 -kind: ConfigMap -metadata: - name: cpu-sim-llm-d-modelservice-epp - namespace: default -data: - default-config.yaml: | - apiVersion: inference.networking.x-k8s.io/v1alpha1 - kind: EndpointPickerConfig - plugins: - - type: prefix-cache-scorer - parameters: - hashBlockSize: 5 - maxPrefixBlocksToMatch: 256 - lruCapacityPerServer: 31250 - - type: decode-filter - - type: max-score-picker - - type: single-profile-handler - schedulingProfiles: - - name: default - plugins: - - pluginRef: decode-filter - - pluginRef: max-score-picker - - pluginRef: prefix-cache-scorer - weight: 50 - prefix-cache-tracking-config.yaml: | - apiVersion: inference.networking.x-k8s.io/v1alpha1 - kind: EndpointPickerConfig - plugins: - - type: single-profile-handler - - type: decode-filter - - type: prefix-cache-scorer - parameters: - mode: cache_tracking - indexerConfig: - tokenProcessorConfig: - blockSize: 64 # must match vLLM block size if not default (16) - hashSeed: "42" # must match PYTHONHASHSEED in vLLM pods - kvBlockIndexConfig: - enableMetrics: true # enable kv-block index metrics (prometheus) - metricsLoggingInterval: 60000000000 # log kv-block metrics as well (1m in nanoseconds) - - type: kv-cache-scorer # kv-cache-utilization - - type: queue-scorer - - type: max-score-picker - schedulingProfiles: - - name: default - plugins: - - pluginRef: decode-filter - - pluginRef: prefix-cache-scorer - weight: 3.0 - - pluginRef: kv-cache-scorer - weight: 1.0 - - pluginRef: queue-scorer - weight: 1.0 - - pluginRef: max-score-picker - prefix-estimate-config.yaml: | - apiVersion: inference.networking.x-k8s.io/v1alpha1 - kind: EndpointPickerConfig - plugins: - - type: single-profile-handler - - type: decode-filter - - type: prefix-cache-scorer - - type: load-aware-scorer - - type: max-score-picker - schedulingProfiles: - - name: default - plugins: - - pluginRef: decode-filter - - pluginRef: prefix-cache-scorer - weight: 2.0 - - pluginRef: load-aware-scorer - weight: 1.0 - - pluginRef: 
max-score-picker - default-pd-config.yaml: | - apiVersion: inference.networking.x-k8s.io/v1alpha1 - kind: EndpointPickerConfig - plugins: - - type: prefill-header-handler - - type: prefix-cache-scorer - parameters: - hashBlockSize: 5 - maxPrefixBlocksToMatch: 256 - lruCapacityPerServer: 31250 - - type: prefill-filter - - type: decode-filter - - type: max-score-picker - - type: pd-profile-handler - parameters: - threshold: 10 - hashBlockSize: 5 - schedulingProfiles: - - name: prefill - plugins: - - pluginRef: prefill-filter - - pluginRef: max-score-picker - - pluginRef: prefix-cache-scorer - weight: 50 - - name: decode - plugins: - - pluginRef: decode-filter - - pluginRef: max-score-picker - - pluginRef: prefix-cache-scorer - weight: 50 ---- -# Source: llm-d-modelservice/templates/epp-role.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: cpu-sim-llm-d-modelservice-epp -rules: -- apiGroups: - - inference.networking.x-k8s.io - resources: - - inferencemodels - - inferencepools - verbs: - - get - - watch - - list -- apiGroups: - - "" - resources: - - pods - verbs: - - get - - watch - - list -- apiGroups: - - discovery.k8s.io - resources: - - endpointslices - verbs: - - get - - watch - - list -- apiGroups: - - authentication.k8s.io - resources: - - tokenreviews - verbs: - - create -- apiGroups: - - authorization.k8s.io - resources: - - subjectaccessreviews - verbs: - - create ---- -# Source: llm-d-modelservice/templates/epp-rolebinding.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: cpu-sim-llm-d-modelservice-epp -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: cpu-sim-llm-d-modelservice-epp -subjects: -- kind: ServiceAccount - name: cpu-sim-llm-d-modelservice-epp ---- -# Source: llm-d-modelservice/templates/epp-service.yaml -apiVersion: v1 -kind: Service -metadata: - name: cpu-sim-llm-d-modelservice-epp - labels: - helm.sh/chart: llm-d-modelservice-v0.2.15 - app.kubernetes.io/version: 
"v0.2.0" - app.kubernetes.io/managed-by: Helm -spec: - type: ClusterIP - ports: - - name: grpc-ext-proc - port: 9002 - targetPort: 9002 - protocol: TCP - appProtocol: http2 - selector: - llm-d.ai/epp: cpu-sim-llm-d-modelservice-epp ---- # Source: llm-d-modelservice/templates/decode-deployment.yaml apiVersion: apps/v1 kind: Deployment @@ -216,13 +24,13 @@ spec: selector: matchLabels: llm-d.ai/inferenceServing: "true" - llm-d.ai/model: cpu-sim-llm-d-modelservice + llm-d.ai/model: random-model llm-d.ai/role: decode template: metadata: labels: llm-d.ai/inferenceServing: "true" - llm-d.ai/model: cpu-sim-llm-d-modelservice + llm-d.ai/model: random-model llm-d.ai/role: decode spec: initContainers: @@ -285,63 +93,6 @@ spec: - name: model-storage mountPath: /model-cache --- -# Source: llm-d-modelservice/templates/epp-deployment.yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - name: cpu-sim-llm-d-modelservice-epp - labels: - llm-d.ai/epp: cpu-sim-llm-d-modelservice-epp - namespace: default -spec: - replicas: 1 - selector: - matchLabels: - llm-d.ai/epp: cpu-sim-llm-d-modelservice-epp - template: - metadata: - labels: - llm-d.ai/epp: cpu-sim-llm-d-modelservice-epp - spec: - containers: - - name: epp - imagePullPolicy: Always - image: ghcr.io/llm-d/llm-d-inference-scheduler:v0.2.1 - args: - - --poolName - - cpu-sim-llm-d-modelservice - - --poolNamespace - - default - - -v - - "6" - - --zap-encoder - - json - - --grpcPort - - "9002" - - --grpcHealthPort - - "9003" - - "-configFile" - - "config/default-config.yaml" - ports: - - containerPort: 9002 - name: grpc - protocol: TCP - - containerPort: 9003 - name: grpc-health - protocol: TCP - - containerPort: 9090 - name: metrics - protocol: TCP - volumeMounts: - - name: plugins-config-volume - mountPath: "/config" - volumes: - - name: plugins-config-volume - configMap: - name: cpu-sim-llm-d-modelservice-epp - serviceAccount: cpu-sim-llm-d-modelservice-epp - serviceAccountName: cpu-sim-llm-d-modelservice-epp ---- # Source: 
llm-d-modelservice/templates/prefill-deployment.yaml apiVersion: apps/v1 kind: Deployment @@ -356,13 +107,13 @@ spec: selector: matchLabels: llm-d.ai/inferenceServing: "true" - llm-d.ai/model: cpu-sim-llm-d-modelservice + llm-d.ai/model: random-model llm-d.ai/role: prefill template: metadata: labels: llm-d.ai/inferenceServing: "true" - llm-d.ai/model: cpu-sim-llm-d-modelservice + llm-d.ai/model: random-model llm-d.ai/role: prefill spec: @@ -407,54 +158,3 @@ spec: volumeMounts: - name: model-storage mountPath: /model-cache ---- -# Source: llm-d-modelservice/templates/inferencepool.yaml -apiVersion: inference.networking.x-k8s.io/v1alpha2 -kind: InferencePool -metadata: - name: cpu-sim-llm-d-modelservice - namespace: default -spec: - extensionRef: - failureMode: FailClose - group: "" - kind: Service - name: cpu-sim-llm-d-modelservice-epp - selector: - llm-d.ai/inferenceServing: "true" - llm-d.ai/model: cpu-sim-llm-d-modelservice - targetPortNumber: 8000 ---- -# Source: llm-d-modelservice/templates/httproute.yaml -apiVersion: gateway.networking.k8s.io/v1 -kind: HTTPRoute -metadata: - name: cpu-sim-llm-d-modelservice - namespace: default - labels: - helm.sh/chart: llm-d-modelservice-v0.2.15 - app.kubernetes.io/version: "v0.2.0" - app.kubernetes.io/managed-by: Helm - annotations: - "helm.sh/hook": post-install,post-upgrade -spec: - parentRefs: - - group: gateway.networking.k8s.io - kind: Gateway - name: inference-gateway - namespace: 'default' - rules: - - backendRefs: - - group: inference.networking.x-k8s.io - kind: InferencePool - name: cpu-sim-llm-d-modelservice - port: 8000 - weight: 1 - timeouts: - backendRequest: 0s - request: 0s - matches: - - headers: - - name: x-model-name - type: Exact - value: 'random/model' diff --git a/examples/output-pd.yaml b/examples/output-pd.yaml index 28a53830..b77d7d30 100644 --- a/examples/output-pd.yaml +++ b/examples/output-pd.yaml @@ -1,15 +1,5 @@ # generated by generate-example-output.sh --- -# Source: 
llm-d-modelservice/templates/epp-sa.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: pd-llm-d-modelservice-epp - labels: - helm.sh/chart: llm-d-modelservice-v0.2.15 - app.kubernetes.io/version: "v0.2.0" - app.kubernetes.io/managed-by: Helm ---- # Source: llm-d-modelservice/templates/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount @@ -20,188 +10,6 @@ metadata: app.kubernetes.io/version: "v0.2.0" app.kubernetes.io/managed-by: Helm --- -# Source: llm-d-modelservice/templates/epp-plugin-configmap.yaml -apiVersion: v1 -kind: ConfigMap -metadata: - name: pd-llm-d-modelservice-epp - namespace: default -data: - default-config.yaml: | - apiVersion: inference.networking.x-k8s.io/v1alpha1 - kind: EndpointPickerConfig - plugins: - - type: prefix-cache-scorer - parameters: - hashBlockSize: 5 - maxPrefixBlocksToMatch: 256 - lruCapacityPerServer: 31250 - - type: decode-filter - - type: max-score-picker - - type: single-profile-handler - schedulingProfiles: - - name: default - plugins: - - pluginRef: decode-filter - - pluginRef: max-score-picker - - pluginRef: prefix-cache-scorer - weight: 50 - prefix-cache-tracking-config.yaml: | - apiVersion: inference.networking.x-k8s.io/v1alpha1 - kind: EndpointPickerConfig - plugins: - - type: single-profile-handler - - type: decode-filter - - type: prefix-cache-scorer - parameters: - mode: cache_tracking - indexerConfig: - tokenProcessorConfig: - blockSize: 64 # must match vLLM block size if not default (16) - hashSeed: "42" # must match PYTHONHASHSEED in vLLM pods - kvBlockIndexConfig: - enableMetrics: true # enable kv-block index metrics (prometheus) - metricsLoggingInterval: 60000000000 # log kv-block metrics as well (1m in nanoseconds) - - type: kv-cache-scorer # kv-cache-utilization - - type: queue-scorer - - type: max-score-picker - schedulingProfiles: - - name: default - plugins: - - pluginRef: decode-filter - - pluginRef: prefix-cache-scorer - weight: 3.0 - - pluginRef: kv-cache-scorer - weight: 1.0 - - 
pluginRef: queue-scorer - weight: 1.0 - - pluginRef: max-score-picker - prefix-estimate-config.yaml: | - apiVersion: inference.networking.x-k8s.io/v1alpha1 - kind: EndpointPickerConfig - plugins: - - type: single-profile-handler - - type: decode-filter - - type: prefix-cache-scorer - - type: load-aware-scorer - - type: max-score-picker - schedulingProfiles: - - name: default - plugins: - - pluginRef: decode-filter - - pluginRef: prefix-cache-scorer - weight: 2.0 - - pluginRef: load-aware-scorer - weight: 1.0 - - pluginRef: max-score-picker - default-pd-config.yaml: | - apiVersion: inference.networking.x-k8s.io/v1alpha1 - kind: EndpointPickerConfig - plugins: - - type: prefill-header-handler - - type: prefix-cache-scorer - parameters: - hashBlockSize: 5 - maxPrefixBlocksToMatch: 256 - lruCapacityPerServer: 31250 - - type: prefill-filter - - type: decode-filter - - type: max-score-picker - - type: pd-profile-handler - parameters: - threshold: 10 - hashBlockSize: 5 - schedulingProfiles: - - name: prefill - plugins: - - pluginRef: prefill-filter - - pluginRef: max-score-picker - - pluginRef: prefix-cache-scorer - weight: 50 - - name: decode - plugins: - - pluginRef: decode-filter - - pluginRef: max-score-picker - - pluginRef: prefix-cache-scorer - weight: 50 ---- -# Source: llm-d-modelservice/templates/epp-role.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: pd-llm-d-modelservice-epp -rules: -- apiGroups: - - inference.networking.x-k8s.io - resources: - - inferencemodels - - inferencepools - verbs: - - get - - watch - - list -- apiGroups: - - "" - resources: - - pods - verbs: - - get - - watch - - list -- apiGroups: - - discovery.k8s.io - resources: - - endpointslices - verbs: - - get - - watch - - list -- apiGroups: - - authentication.k8s.io - resources: - - tokenreviews - verbs: - - create -- apiGroups: - - authorization.k8s.io - resources: - - subjectaccessreviews - verbs: - - create ---- -# Source: 
llm-d-modelservice/templates/epp-rolebinding.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: pd-llm-d-modelservice-epp -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: pd-llm-d-modelservice-epp -subjects: -- kind: ServiceAccount - name: pd-llm-d-modelservice-epp ---- -# Source: llm-d-modelservice/templates/epp-service.yaml -apiVersion: v1 -kind: Service -metadata: - name: pd-llm-d-modelservice-epp - labels: - helm.sh/chart: llm-d-modelservice-v0.2.15 - app.kubernetes.io/version: "v0.2.0" - app.kubernetes.io/managed-by: Helm -spec: - type: ClusterIP - ports: - - name: grpc-ext-proc - port: 9002 - targetPort: 9002 - protocol: TCP - appProtocol: http2 - selector: - llm-d.ai/epp: pd-llm-d-modelservice-epp ---- # Source: llm-d-modelservice/templates/decode-deployment.yaml apiVersion: apps/v1 kind: Deployment @@ -216,13 +24,13 @@ spec: selector: matchLabels: llm-d.ai/inferenceServing: "true" - llm-d.ai/model: pd-llm-d-modelservice + llm-d.ai/model: facebook-opt-125m llm-d.ai/role: decode template: metadata: labels: llm-d.ai/inferenceServing: "true" - llm-d.ai/model: pd-llm-d-modelservice + llm-d.ai/model: facebook-opt-125m llm-d.ai/role: decode spec: initContainers: @@ -307,81 +115,6 @@ spec: - name: model-storage mountPath: /model-cache --- -# Source: llm-d-modelservice/templates/epp-deployment.yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - name: pd-llm-d-modelservice-epp - labels: - llm-d.ai/epp: pd-llm-d-modelservice-epp - namespace: default -spec: - replicas: 1 - selector: - matchLabels: - llm-d.ai/epp: pd-llm-d-modelservice-epp - template: - metadata: - labels: - llm-d.ai/epp: pd-llm-d-modelservice-epp - spec: - containers: - - name: epp - imagePullPolicy: Always - image: ghcr.io/llm-d/llm-d-inference-scheduler:v0.2.1 - args: - - --poolName - - pd-llm-d-modelservice - - --poolNamespace - - default - - -v - - "4" - - --zap-encoder - - json - - --grpcPort - - "9002" - - --grpcHealthPort - - "9003" - - 
"-configFile" - - "config/default-config.yaml" - ports: - - containerPort: 9002 - name: grpc - protocol: TCP - - containerPort: 9003 - name: grpc-health - protocol: TCP - - containerPort: 9090 - name: metrics - protocol: TCP - readinessProbe: - grpc: - port: 9003 - service: envoy.service.ext_proc.v3.ExternalProcessor - initialDelaySeconds: 5 - timeoutSeconds: 1 - periodSeconds: 10 - successThreshold: 1 - failureThreshold: 3 - livenessProbe: - grpc: - port: 9003 - service: envoy.service.ext_proc.v3.ExternalProcessor - initialDelaySeconds: 5 - timeoutSeconds: 1 - periodSeconds: 10 - successThreshold: 1 - failureThreshold: 3 - volumeMounts: - - name: plugins-config-volume - mountPath: "/config" - volumes: - - name: plugins-config-volume - configMap: - name: pd-llm-d-modelservice-epp - serviceAccount: pd-llm-d-modelservice-epp - serviceAccountName: pd-llm-d-modelservice-epp ---- # Source: llm-d-modelservice/templates/prefill-deployment.yaml apiVersion: apps/v1 kind: Deployment @@ -396,13 +129,13 @@ spec: selector: matchLabels: llm-d.ai/inferenceServing: "true" - llm-d.ai/model: pd-llm-d-modelservice + llm-d.ai/model: facebook-opt-125m llm-d.ai/role: prefill template: metadata: labels: llm-d.ai/inferenceServing: "true" - llm-d.ai/model: pd-llm-d-modelservice + llm-d.ai/model: facebook-opt-125m llm-d.ai/role: prefill spec: @@ -469,68 +202,3 @@ spec: volumeMounts: - name: model-storage mountPath: /model-cache ---- -# Source: llm-d-modelservice/templates/inferencemodel.yaml -apiVersion: inference.networking.x-k8s.io/v1alpha2 -kind: InferenceModel -metadata: - labels: - llm-d.ai/inferenceServing: "true" - llm-d.ai/model: pd-llm-d-modelservice - name: pd-llm-d-modelservice -spec: - criticality: Critical - modelName: facebook/opt-125m - poolRef: - name: pd-llm-d-modelservice ---- -# Source: llm-d-modelservice/templates/inferencepool.yaml -apiVersion: inference.networking.x-k8s.io/v1alpha2 -kind: InferencePool -metadata: - name: pd-llm-d-modelservice - namespace: default 
-spec: - extensionRef: - failureMode: FailClose - group: "" - kind: Service - name: pd-llm-d-modelservice-epp - selector: - llm-d.ai/inferenceServing: "true" - llm-d.ai/model: pd-llm-d-modelservice - targetPortNumber: 8000 ---- -# Source: llm-d-modelservice/templates/httproute.yaml -apiVersion: gateway.networking.k8s.io/v1 -kind: HTTPRoute -metadata: - name: pd-llm-d-modelservice - namespace: default - labels: - helm.sh/chart: llm-d-modelservice-v0.2.15 - app.kubernetes.io/version: "v0.2.0" - app.kubernetes.io/managed-by: Helm - annotations: - "helm.sh/hook": post-install,post-upgrade -spec: - parentRefs: - - group: gateway.networking.k8s.io - kind: Gateway - name: inference-gateway - namespace: 'default' - rules: - - backendRefs: - - group: inference.networking.x-k8s.io - kind: InferencePool - name: pd-llm-d-modelservice - port: 8000 - weight: 1 - timeouts: - backendRequest: 0s - request: 0s - matches: - - headers: - - name: x-model-name - type: Exact - value: 'facebook/opt-125m' diff --git a/examples/output-pvc-hf.yaml b/examples/output-pvc-hf.yaml index dcc60347..f3b07cef 100644 --- a/examples/output-pvc-hf.yaml +++ b/examples/output-pvc-hf.yaml @@ -1,15 +1,5 @@ # generated by generate-example-output.sh --- -# Source: llm-d-modelservice/templates/epp-sa.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: pvc-hf-llm-d-modelservice-epp - labels: - helm.sh/chart: llm-d-modelservice-v0.2.15 - app.kubernetes.io/version: "v0.2.0" - app.kubernetes.io/managed-by: Helm ---- # Source: llm-d-modelservice/templates/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount @@ -20,188 +10,6 @@ metadata: app.kubernetes.io/version: "v0.2.0" app.kubernetes.io/managed-by: Helm --- -# Source: llm-d-modelservice/templates/epp-plugin-configmap.yaml -apiVersion: v1 -kind: ConfigMap -metadata: - name: pvc-hf-llm-d-modelservice-epp - namespace: default -data: - default-config.yaml: | - apiVersion: inference.networking.x-k8s.io/v1alpha1 - kind: EndpointPickerConfig - plugins: - 
- type: prefix-cache-scorer - parameters: - hashBlockSize: 5 - maxPrefixBlocksToMatch: 256 - lruCapacityPerServer: 31250 - - type: decode-filter - - type: max-score-picker - - type: single-profile-handler - schedulingProfiles: - - name: default - plugins: - - pluginRef: decode-filter - - pluginRef: max-score-picker - - pluginRef: prefix-cache-scorer - weight: 50 - prefix-cache-tracking-config.yaml: | - apiVersion: inference.networking.x-k8s.io/v1alpha1 - kind: EndpointPickerConfig - plugins: - - type: single-profile-handler - - type: decode-filter - - type: prefix-cache-scorer - parameters: - mode: cache_tracking - indexerConfig: - tokenProcessorConfig: - blockSize: 64 # must match vLLM block size if not default (16) - hashSeed: "42" # must match PYTHONHASHSEED in vLLM pods - kvBlockIndexConfig: - enableMetrics: true # enable kv-block index metrics (prometheus) - metricsLoggingInterval: 60000000000 # log kv-block metrics as well (1m in nanoseconds) - - type: kv-cache-scorer # kv-cache-utilization - - type: queue-scorer - - type: max-score-picker - schedulingProfiles: - - name: default - plugins: - - pluginRef: decode-filter - - pluginRef: prefix-cache-scorer - weight: 3.0 - - pluginRef: kv-cache-scorer - weight: 1.0 - - pluginRef: queue-scorer - weight: 1.0 - - pluginRef: max-score-picker - prefix-estimate-config.yaml: | - apiVersion: inference.networking.x-k8s.io/v1alpha1 - kind: EndpointPickerConfig - plugins: - - type: single-profile-handler - - type: decode-filter - - type: prefix-cache-scorer - - type: load-aware-scorer - - type: max-score-picker - schedulingProfiles: - - name: default - plugins: - - pluginRef: decode-filter - - pluginRef: prefix-cache-scorer - weight: 2.0 - - pluginRef: load-aware-scorer - weight: 1.0 - - pluginRef: max-score-picker - default-pd-config.yaml: | - apiVersion: inference.networking.x-k8s.io/v1alpha1 - kind: EndpointPickerConfig - plugins: - - type: prefill-header-handler - - type: prefix-cache-scorer - parameters: - 
hashBlockSize: 5 - maxPrefixBlocksToMatch: 256 - lruCapacityPerServer: 31250 - - type: prefill-filter - - type: decode-filter - - type: max-score-picker - - type: pd-profile-handler - parameters: - threshold: 10 - hashBlockSize: 5 - schedulingProfiles: - - name: prefill - plugins: - - pluginRef: prefill-filter - - pluginRef: max-score-picker - - pluginRef: prefix-cache-scorer - weight: 50 - - name: decode - plugins: - - pluginRef: decode-filter - - pluginRef: max-score-picker - - pluginRef: prefix-cache-scorer - weight: 50 ---- -# Source: llm-d-modelservice/templates/epp-role.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: pvc-hf-llm-d-modelservice-epp -rules: -- apiGroups: - - inference.networking.x-k8s.io - resources: - - inferencemodels - - inferencepools - verbs: - - get - - watch - - list -- apiGroups: - - "" - resources: - - pods - verbs: - - get - - watch - - list -- apiGroups: - - discovery.k8s.io - resources: - - endpointslices - verbs: - - get - - watch - - list -- apiGroups: - - authentication.k8s.io - resources: - - tokenreviews - verbs: - - create -- apiGroups: - - authorization.k8s.io - resources: - - subjectaccessreviews - verbs: - - create ---- -# Source: llm-d-modelservice/templates/epp-rolebinding.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: pvc-hf-llm-d-modelservice-epp -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: pvc-hf-llm-d-modelservice-epp -subjects: -- kind: ServiceAccount - name: pvc-hf-llm-d-modelservice-epp ---- -# Source: llm-d-modelservice/templates/epp-service.yaml -apiVersion: v1 -kind: Service -metadata: - name: pvc-hf-llm-d-modelservice-epp - labels: - helm.sh/chart: llm-d-modelservice-v0.2.15 - app.kubernetes.io/version: "v0.2.0" - app.kubernetes.io/managed-by: Helm -spec: - type: ClusterIP - ports: - - name: grpc-ext-proc - port: 9002 - targetPort: 9002 - protocol: TCP - appProtocol: http2 - selector: - llm-d.ai/epp: 
pvc-hf-llm-d-modelservice-epp ---- # Source: llm-d-modelservice/templates/decode-deployment.yaml apiVersion: apps/v1 kind: Deployment @@ -216,13 +24,13 @@ spec: selector: matchLabels: llm-d.ai/inferenceServing: "true" - llm-d.ai/model: pvc-hf-llm-d-modelservice + llm-d.ai/model: facebook-opt-125m llm-d.ai/role: decode template: metadata: labels: llm-d.ai/inferenceServing: "true" - llm-d.ai/model: pvc-hf-llm-d-modelservice + llm-d.ai/model: facebook-opt-125m llm-d.ai/role: decode spec: initContainers: @@ -307,81 +115,6 @@ spec: - name: model-storage mountPath: /model-cache --- -# Source: llm-d-modelservice/templates/epp-deployment.yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - name: pvc-hf-llm-d-modelservice-epp - labels: - llm-d.ai/epp: pvc-hf-llm-d-modelservice-epp - namespace: default -spec: - replicas: 1 - selector: - matchLabels: - llm-d.ai/epp: pvc-hf-llm-d-modelservice-epp - template: - metadata: - labels: - llm-d.ai/epp: pvc-hf-llm-d-modelservice-epp - spec: - containers: - - name: epp - imagePullPolicy: Always - image: ghcr.io/llm-d/llm-d-inference-scheduler:v0.2.1 - args: - - --poolName - - pvc-hf-llm-d-modelservice - - --poolNamespace - - default - - -v - - "4" - - --zap-encoder - - json - - --grpcPort - - "9002" - - --grpcHealthPort - - "9003" - - "-configFile" - - "config/default-config.yaml" - ports: - - containerPort: 9002 - name: grpc - protocol: TCP - - containerPort: 9003 - name: grpc-health - protocol: TCP - - containerPort: 9090 - name: metrics - protocol: TCP - readinessProbe: - grpc: - port: 9003 - service: envoy.service.ext_proc.v3.ExternalProcessor - initialDelaySeconds: 5 - timeoutSeconds: 1 - periodSeconds: 10 - successThreshold: 1 - failureThreshold: 3 - livenessProbe: - grpc: - port: 9003 - service: envoy.service.ext_proc.v3.ExternalProcessor - initialDelaySeconds: 5 - timeoutSeconds: 1 - periodSeconds: 10 - successThreshold: 1 - failureThreshold: 3 - volumeMounts: - - name: plugins-config-volume - mountPath: "/config" - 
volumes: - - name: plugins-config-volume - configMap: - name: pvc-hf-llm-d-modelservice-epp - serviceAccount: pvc-hf-llm-d-modelservice-epp - serviceAccountName: pvc-hf-llm-d-modelservice-epp ---- # Source: llm-d-modelservice/templates/prefill-deployment.yaml apiVersion: apps/v1 kind: Deployment @@ -396,13 +129,13 @@ spec: selector: matchLabels: llm-d.ai/inferenceServing: "true" - llm-d.ai/model: pvc-hf-llm-d-modelservice + llm-d.ai/model: facebook-opt-125m llm-d.ai/role: prefill template: metadata: labels: llm-d.ai/inferenceServing: "true" - llm-d.ai/model: pvc-hf-llm-d-modelservice + llm-d.ai/model: facebook-opt-125m llm-d.ai/role: prefill spec: @@ -469,68 +202,3 @@ spec: volumeMounts: - name: model-storage mountPath: /model-cache ---- -# Source: llm-d-modelservice/templates/inferencemodel.yaml -apiVersion: inference.networking.x-k8s.io/v1alpha2 -kind: InferenceModel -metadata: - labels: - llm-d.ai/inferenceServing: "true" - llm-d.ai/model: pvc-hf-llm-d-modelservice - name: pvc-hf-llm-d-modelservice -spec: - criticality: Critical - modelName: facebook/opt-125m - poolRef: - name: pvc-hf-llm-d-modelservice ---- -# Source: llm-d-modelservice/templates/inferencepool.yaml -apiVersion: inference.networking.x-k8s.io/v1alpha2 -kind: InferencePool -metadata: - name: pvc-hf-llm-d-modelservice - namespace: default -spec: - extensionRef: - failureMode: FailClose - group: "" - kind: Service - name: pvc-hf-llm-d-modelservice-epp - selector: - llm-d.ai/inferenceServing: "true" - llm-d.ai/model: pvc-hf-llm-d-modelservice - targetPortNumber: 8000 ---- -# Source: llm-d-modelservice/templates/httproute.yaml -apiVersion: gateway.networking.k8s.io/v1 -kind: HTTPRoute -metadata: - name: pvc-hf-llm-d-modelservice - namespace: default - labels: - helm.sh/chart: llm-d-modelservice-v0.2.15 - app.kubernetes.io/version: "v0.2.0" - app.kubernetes.io/managed-by: Helm - annotations: - "helm.sh/hook": post-install,post-upgrade -spec: - parentRefs: - - group: gateway.networking.k8s.io - kind: 
Gateway - name: inference-gateway - namespace: 'default' - rules: - - backendRefs: - - group: inference.networking.x-k8s.io - kind: InferencePool - name: pvc-hf-llm-d-modelservice - port: 8000 - weight: 1 - timeouts: - backendRequest: 0s - request: 0s - matches: - - headers: - - name: x-model-name - type: Exact - value: 'facebook/opt-125m' diff --git a/examples/output-pvc.yaml b/examples/output-pvc.yaml index 371ec5ac..0a98402f 100644 --- a/examples/output-pvc.yaml +++ b/examples/output-pvc.yaml @@ -1,15 +1,5 @@ # generated by generate-example-output.sh --- -# Source: llm-d-modelservice/templates/epp-sa.yaml -apiVersion: v1 -kind: ServiceAccount -metadata: - name: pvc-llm-d-modelservice-epp - labels: - helm.sh/chart: llm-d-modelservice-v0.2.15 - app.kubernetes.io/version: "v0.2.0" - app.kubernetes.io/managed-by: Helm ---- # Source: llm-d-modelservice/templates/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount @@ -20,188 +10,6 @@ metadata: app.kubernetes.io/version: "v0.2.0" app.kubernetes.io/managed-by: Helm --- -# Source: llm-d-modelservice/templates/epp-plugin-configmap.yaml -apiVersion: v1 -kind: ConfigMap -metadata: - name: pvc-llm-d-modelservice-epp - namespace: default -data: - default-config.yaml: | - apiVersion: inference.networking.x-k8s.io/v1alpha1 - kind: EndpointPickerConfig - plugins: - - type: prefix-cache-scorer - parameters: - hashBlockSize: 5 - maxPrefixBlocksToMatch: 256 - lruCapacityPerServer: 31250 - - type: decode-filter - - type: max-score-picker - - type: single-profile-handler - schedulingProfiles: - - name: default - plugins: - - pluginRef: decode-filter - - pluginRef: max-score-picker - - pluginRef: prefix-cache-scorer - weight: 50 - prefix-cache-tracking-config.yaml: | - apiVersion: inference.networking.x-k8s.io/v1alpha1 - kind: EndpointPickerConfig - plugins: - - type: single-profile-handler - - type: decode-filter - - type: prefix-cache-scorer - parameters: - mode: cache_tracking - indexerConfig: - tokenProcessorConfig: - 
blockSize: 64 # must match vLLM block size if not default (16) - hashSeed: "42" # must match PYTHONHASHSEED in vLLM pods - kvBlockIndexConfig: - enableMetrics: true # enable kv-block index metrics (prometheus) - metricsLoggingInterval: 60000000000 # log kv-block metrics as well (1m in nanoseconds) - - type: kv-cache-scorer # kv-cache-utilization - - type: queue-scorer - - type: max-score-picker - schedulingProfiles: - - name: default - plugins: - - pluginRef: decode-filter - - pluginRef: prefix-cache-scorer - weight: 3.0 - - pluginRef: kv-cache-scorer - weight: 1.0 - - pluginRef: queue-scorer - weight: 1.0 - - pluginRef: max-score-picker - prefix-estimate-config.yaml: | - apiVersion: inference.networking.x-k8s.io/v1alpha1 - kind: EndpointPickerConfig - plugins: - - type: single-profile-handler - - type: decode-filter - - type: prefix-cache-scorer - - type: load-aware-scorer - - type: max-score-picker - schedulingProfiles: - - name: default - plugins: - - pluginRef: decode-filter - - pluginRef: prefix-cache-scorer - weight: 2.0 - - pluginRef: load-aware-scorer - weight: 1.0 - - pluginRef: max-score-picker - default-pd-config.yaml: | - apiVersion: inference.networking.x-k8s.io/v1alpha1 - kind: EndpointPickerConfig - plugins: - - type: prefill-header-handler - - type: prefix-cache-scorer - parameters: - hashBlockSize: 5 - maxPrefixBlocksToMatch: 256 - lruCapacityPerServer: 31250 - - type: prefill-filter - - type: decode-filter - - type: max-score-picker - - type: pd-profile-handler - parameters: - threshold: 10 - hashBlockSize: 5 - schedulingProfiles: - - name: prefill - plugins: - - pluginRef: prefill-filter - - pluginRef: max-score-picker - - pluginRef: prefix-cache-scorer - weight: 50 - - name: decode - plugins: - - pluginRef: decode-filter - - pluginRef: max-score-picker - - pluginRef: prefix-cache-scorer - weight: 50 ---- -# Source: llm-d-modelservice/templates/epp-role.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: 
pvc-llm-d-modelservice-epp -rules: -- apiGroups: - - inference.networking.x-k8s.io - resources: - - inferencemodels - - inferencepools - verbs: - - get - - watch - - list -- apiGroups: - - "" - resources: - - pods - verbs: - - get - - watch - - list -- apiGroups: - - discovery.k8s.io - resources: - - endpointslices - verbs: - - get - - watch - - list -- apiGroups: - - authentication.k8s.io - resources: - - tokenreviews - verbs: - - create -- apiGroups: - - authorization.k8s.io - resources: - - subjectaccessreviews - verbs: - - create ---- -# Source: llm-d-modelservice/templates/epp-rolebinding.yaml -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: pvc-llm-d-modelservice-epp -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: pvc-llm-d-modelservice-epp -subjects: -- kind: ServiceAccount - name: pvc-llm-d-modelservice-epp ---- -# Source: llm-d-modelservice/templates/epp-service.yaml -apiVersion: v1 -kind: Service -metadata: - name: pvc-llm-d-modelservice-epp - labels: - helm.sh/chart: llm-d-modelservice-v0.2.15 - app.kubernetes.io/version: "v0.2.0" - app.kubernetes.io/managed-by: Helm -spec: - type: ClusterIP - ports: - - name: grpc-ext-proc - port: 9002 - targetPort: 9002 - protocol: TCP - appProtocol: http2 - selector: - llm-d.ai/epp: pvc-llm-d-modelservice-epp ---- # Source: llm-d-modelservice/templates/decode-deployment.yaml apiVersion: apps/v1 kind: Deployment @@ -216,13 +24,13 @@ spec: selector: matchLabels: llm-d.ai/inferenceServing: "true" - llm-d.ai/model: pvc-llm-d-modelservice + llm-d.ai/model: facebook-opt-125m llm-d.ai/role: decode template: metadata: labels: llm-d.ai/inferenceServing: "true" - llm-d.ai/model: pvc-llm-d-modelservice + llm-d.ai/model: facebook-opt-125m llm-d.ai/role: decode spec: initContainers: @@ -305,81 +113,6 @@ spec: - name: model-storage mountPath: /model-cache --- -# Source: llm-d-modelservice/templates/epp-deployment.yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - name: 
pvc-llm-d-modelservice-epp - labels: - llm-d.ai/epp: pvc-llm-d-modelservice-epp - namespace: default -spec: - replicas: 1 - selector: - matchLabels: - llm-d.ai/epp: pvc-llm-d-modelservice-epp - template: - metadata: - labels: - llm-d.ai/epp: pvc-llm-d-modelservice-epp - spec: - containers: - - name: epp - imagePullPolicy: Always - image: ghcr.io/llm-d/llm-d-inference-scheduler:v0.2.1 - args: - - --poolName - - pvc-llm-d-modelservice - - --poolNamespace - - default - - -v - - "4" - - --zap-encoder - - json - - --grpcPort - - "9002" - - --grpcHealthPort - - "9003" - - "-configFile" - - "config/default-config.yaml" - ports: - - containerPort: 9002 - name: grpc - protocol: TCP - - containerPort: 9003 - name: grpc-health - protocol: TCP - - containerPort: 9090 - name: metrics - protocol: TCP - readinessProbe: - grpc: - port: 9003 - service: envoy.service.ext_proc.v3.ExternalProcessor - initialDelaySeconds: 5 - timeoutSeconds: 1 - periodSeconds: 10 - successThreshold: 1 - failureThreshold: 3 - livenessProbe: - grpc: - port: 9003 - service: envoy.service.ext_proc.v3.ExternalProcessor - initialDelaySeconds: 5 - timeoutSeconds: 1 - periodSeconds: 10 - successThreshold: 1 - failureThreshold: 3 - volumeMounts: - - name: plugins-config-volume - mountPath: "/config" - volumes: - - name: plugins-config-volume - configMap: - name: pvc-llm-d-modelservice-epp - serviceAccount: pvc-llm-d-modelservice-epp - serviceAccountName: pvc-llm-d-modelservice-epp ---- # Source: llm-d-modelservice/templates/prefill-deployment.yaml apiVersion: apps/v1 kind: Deployment @@ -394,13 +127,13 @@ spec: selector: matchLabels: llm-d.ai/inferenceServing: "true" - llm-d.ai/model: pvc-llm-d-modelservice + llm-d.ai/model: facebook-opt-125m llm-d.ai/role: prefill template: metadata: labels: llm-d.ai/inferenceServing: "true" - llm-d.ai/model: pvc-llm-d-modelservice + llm-d.ai/model: facebook-opt-125m llm-d.ai/role: prefill spec: @@ -465,68 +198,3 @@ spec: volumeMounts: - name: model-storage mountPath: 
/model-cache ---- -# Source: llm-d-modelservice/templates/inferencemodel.yaml -apiVersion: inference.networking.x-k8s.io/v1alpha2 -kind: InferenceModel -metadata: - labels: - llm-d.ai/inferenceServing: "true" - llm-d.ai/model: pvc-llm-d-modelservice - name: pvc-llm-d-modelservice -spec: - criticality: Critical - modelName: facebook/opt-125m - poolRef: - name: pvc-llm-d-modelservice ---- -# Source: llm-d-modelservice/templates/inferencepool.yaml -apiVersion: inference.networking.x-k8s.io/v1alpha2 -kind: InferencePool -metadata: - name: pvc-llm-d-modelservice - namespace: default -spec: - extensionRef: - failureMode: FailClose - group: "" - kind: Service - name: pvc-llm-d-modelservice-epp - selector: - llm-d.ai/inferenceServing: "true" - llm-d.ai/model: pvc-llm-d-modelservice - targetPortNumber: 8000 ---- -# Source: llm-d-modelservice/templates/httproute.yaml -apiVersion: gateway.networking.k8s.io/v1 -kind: HTTPRoute -metadata: - name: pvc-llm-d-modelservice - namespace: default - labels: - helm.sh/chart: llm-d-modelservice-v0.2.15 - app.kubernetes.io/version: "v0.2.0" - app.kubernetes.io/managed-by: Helm - annotations: - "helm.sh/hook": post-install,post-upgrade -spec: - parentRefs: - - group: gateway.networking.k8s.io - kind: Gateway - name: inference-gateway - namespace: 'default' - rules: - - backendRefs: - - group: inference.networking.x-k8s.io - kind: InferencePool - name: pvc-llm-d-modelservice - port: 8000 - weight: 1 - timeouts: - backendRequest: 0s - request: 0s - matches: - - headers: - - name: x-model-name - type: Exact - value: 'facebook/opt-125m' diff --git a/examples/values-cpu.yaml b/examples/values-cpu.yaml index be78d181..c79a7e9f 100644 --- a/examples/values-cpu.yaml +++ b/examples/values-cpu.yaml @@ -8,6 +8,9 @@ multinode: false modelArtifacts: # name is the value of the model parameter in OpenAI requests name: random/model + labels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: random-model uri: "hf://{{ .Values.modelArtifacts.name }}" 
size: 5Mi @@ -17,20 +20,6 @@ routing: proxy: secure: false - httpRoute: - create: true - matches: - - headers: - - name: x-model-name - type: Exact - value: "{{ .Values.modelArtifacts.name }}" - - epp: - create: true - debugLevel: 6 - disableReadinessProbe: true - disableLivenessProbe: true - # Decode pod configuation decode: replicas: 1 diff --git a/examples/values-pd.yaml b/examples/values-pd.yaml index fe0e849f..7c1f8355 100644 --- a/examples/values-pd.yaml +++ b/examples/values-pd.yaml @@ -8,6 +8,9 @@ multinode: false modelArtifacts: # This is the model name used to start vLLM. name: facebook/opt-125m + labels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: facebook-opt-125m uri: hf://"{{ .Values.modelArtifacts.name }}" size: 20Gi @@ -20,31 +23,6 @@ routing: proxy: secure: false - # to change name of inference-gateway - # parentRefs: - # - name: inference-gateway - - inferencePool: - create: true - - # required for certain gateways (e.g. Kgateway) but not others (Istio) - # creating this so that it works for all gateways - inferenceModel: - create: true - - httpRoute: - create: true - matches: - - headers: - - name: x-model-name - type: Exact - value: "{{ .Values.modelArtifacts.name }}" - - epp: - create: true - # Inherit all from base chart values.yaml - # env: - # Decode pod configuation decode: create: true diff --git a/examples/values-xpu-pd.yaml b/examples/values-xpu-pd.yaml index 54889c2a..74e4f8b2 100644 --- a/examples/values-xpu-pd.yaml +++ b/examples/values-xpu-pd.yaml @@ -7,6 +7,9 @@ modelArtifacts: name: microsoft/DialoGPT-large + labels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: microsoft-dialogpt-large uri: "hf://microsoft/DialoGPT-large" size: 10Gi @@ -21,29 +24,6 @@ routing: # Use compatible connector for XPU connector: nixlv2 - parentRefs: - - group: gateway.networking.k8s.io - kind: Gateway - name: inference-gateway - namespace: "{{ .Release.Namespace }}" - - httpRoute: - create: true - matches: - - path: - type: PathPrefix - 
value: / - headers: - - name: x-model-name - type: Exact - value: "{{ .Values.modelArtifacts.name }}" - - epp: - create: true - image: ghcr.io/llm-d/llm-d-inference-scheduler:v0.2.1 - # Use XPU-compatible configuration - pluginsConfigFile: "default-config.yaml" - # Decode pod configuration for Intel XPU decode: create: true diff --git a/examples/values-xpu.yaml b/examples/values-xpu.yaml index c76177e7..82719ebf 100644 --- a/examples/values-xpu.yaml +++ b/examples/values-xpu.yaml @@ -3,6 +3,9 @@ modelArtifacts: name: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B + labels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: deepseek-ai-deepSeek-r1-distill-qwen-1-5B uri: "hf://deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" size: 10Gi From b500f3ccf96f777c10305e4dcb3efb8e635d29e8 Mon Sep 17 00:00:00 2001 From: Michael Kalantar Date: Wed, 22 Oct 2025 09:53:00 -0400 Subject: [PATCH 2/4] update chart version, readme Signed-off-by: Michael Kalantar --- README.md | 2 +- charts/llm-d-modelservice/Chart.yaml | 2 +- examples/README.md | 18 ++++++------------ 3 files changed, 8 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 5c8054f8..fcb75ec0 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,7 @@ Note that when using the GAIE InferencePool chart together with the Modelservice ### HTTPRoute -In addition to deploying the GAIE chart, an `HTTPRoute` is typically required to connect the `Gateway` to the `InferencePool`. Creating an HTTPRoute is not part of either chart. Some examples are provided [here](https://github.com/llm-d-incubation/llm-d-modelservice/blob/main/examples/README.md#httproute) +In addition to deploying the GAIE chart, an `HTTPRoute` is typically required to connect the `Gateway` to the `InferencePool`. Creating an HTTPRoute is not part of either chart. Some examples are provided [here](https://github.com/llm-d-incubation/llm-d-modelservice/blob/main/examples/README.md#httproute). 
## Examples diff --git a/charts/llm-d-modelservice/Chart.yaml b/charts/llm-d-modelservice/Chart.yaml index 5f3e1e5c..1f78ca56 100644 --- a/charts/llm-d-modelservice/Chart.yaml +++ b/charts/llm-d-modelservice/Chart.yaml @@ -15,7 +15,7 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: "v0.2.16" +version: "v0.3.0" # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to diff --git a/examples/README.md b/examples/README.md index da125695..03dd32eb 100644 --- a/examples/README.md +++ b/examples/README.md @@ -8,10 +8,6 @@ helm repo add llm-d-modelservice https://llm-d-incubation.github.io/llm-d-models helm repo update ``` -Note: `alias k=kubectl` - -> If you only want to deploy model instances without routing support, append `--set inferencePool=false --set httpRoute=false` to the example commands. - ## Available Examples | Example | Description | Hardware Requirements | @@ -27,8 +23,6 @@ All the examples assume a `Gateway` and GAIE configuration have been deployed. ### 1. CPU-only -Make sure there is a gateway (Kgateway or Istio) deployed in the cluster. Follow [these instructions](https://gateway-api-inference-extension.sigs.k8s.io/guides/#__tabbed_3_2) on how to set up a gateway. Once done, update `routing.parentRefs[*].name` in this [values file](values-cpu.yaml#L18) to use the name for the Gateway (`llm-d-inference-gateway-istio`) in the cluster or override with the `--set "routing.parentRefs[0].name=MYGATEWAY"` flag. 
- Dry run: ``` @@ -97,12 +91,12 @@ spec: parentRefs: - group: gateway.networking.k8s.io kind: Gateway - name: mygateway + name: INSERT_GATEWAY_NAME rules: - backendRefs: - group: inference.networking.x-k8s.io kind: InferencePool - name: inferencepool-for-mymodel + name: INSERT_INFERENCEPOOL_NAME port: 8000 weight: 1 matches: @@ -111,7 +105,7 @@ spec: value: / ``` -For example, to call the completions API, use `mymodel/v1/completions` +For example, to call the OpenAI completions API, use `mymodel/v1/completions` #### Example: Route requests with modified path @@ -124,12 +118,12 @@ spec: parentRefs: - group: gateway.networking.k8s.io kind: Gateway - name: mygateway + name: INSERT_GATEWAY_NAME rules: - backendRefs: - group: inference.networking.x-k8s.io kind: InferencePool - name: inferencepool-for-mymodel + name: INSERT_INFERENCEPOOL_NAME port: 8000 weight: 1 filters: @@ -143,7 +137,7 @@ spec: type: PathPrefix value: /mymodel/ ``` -This route supports requests with prefix `mymodel/`; for example, to call the completions API, use `mymodel/v1/completions` +This route supports requests with the prefix `mymodel/`; for example, to call the OpenAI completions API, requests would be sent to: `mymodel/v1/completions`. The HTTPRoute rewrites such requests to `v1/completions` for the target model server. 
## Troubleshooting: From 90c1b0bab2c032aa3e1afe6e8470c5bf129fb5d9 Mon Sep 17 00:00:00 2001 From: Michael Kalantar Date: Wed, 22 Oct 2025 09:53:58 -0400 Subject: [PATCH 3/4] update examples Signed-off-by: Michael Kalantar --- examples/output-cpu.yaml | 6 +++--- examples/output-pd.yaml | 6 +++--- examples/output-pvc-hf.yaml | 6 +++--- examples/output-pvc.yaml | 6 +++--- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/examples/output-cpu.yaml b/examples/output-cpu.yaml index 8d4aaf74..913e6478 100644 --- a/examples/output-cpu.yaml +++ b/examples/output-cpu.yaml @@ -6,7 +6,7 @@ kind: ServiceAccount metadata: name: cpu-sim-llm-d-modelservice labels: - helm.sh/chart: llm-d-modelservice-v0.2.16 + helm.sh/chart: llm-d-modelservice-v0.3.0 app.kubernetes.io/version: "v0.2.0" app.kubernetes.io/managed-by: Helm --- @@ -16,7 +16,7 @@ kind: Deployment metadata: name: cpu-sim-llm-d-modelservice-decode labels: - helm.sh/chart: llm-d-modelservice-v0.2.16 + helm.sh/chart: llm-d-modelservice-v0.3.0 app.kubernetes.io/version: "v0.2.0" app.kubernetes.io/managed-by: Helm spec: @@ -99,7 +99,7 @@ kind: Deployment metadata: name: cpu-sim-llm-d-modelservice-prefill labels: - helm.sh/chart: llm-d-modelservice-v0.2.16 + helm.sh/chart: llm-d-modelservice-v0.3.0 app.kubernetes.io/version: "v0.2.0" app.kubernetes.io/managed-by: Helm spec: diff --git a/examples/output-pd.yaml b/examples/output-pd.yaml index 707ce0b8..72ab5f61 100644 --- a/examples/output-pd.yaml +++ b/examples/output-pd.yaml @@ -6,7 +6,7 @@ kind: ServiceAccount metadata: name: pd-llm-d-modelservice labels: - helm.sh/chart: llm-d-modelservice-v0.2.16 + helm.sh/chart: llm-d-modelservice-v0.3.0 app.kubernetes.io/version: "v0.2.0" app.kubernetes.io/managed-by: Helm --- @@ -16,7 +16,7 @@ kind: Deployment metadata: name: pd-llm-d-modelservice-decode labels: - helm.sh/chart: llm-d-modelservice-v0.2.16 + helm.sh/chart: llm-d-modelservice-v0.3.0 app.kubernetes.io/version: "v0.2.0" app.kubernetes.io/managed-by: Helm spec: 
@@ -121,7 +121,7 @@ kind: Deployment metadata: name: pd-llm-d-modelservice-prefill labels: - helm.sh/chart: llm-d-modelservice-v0.2.16 + helm.sh/chart: llm-d-modelservice-v0.3.0 app.kubernetes.io/version: "v0.2.0" app.kubernetes.io/managed-by: Helm spec: diff --git a/examples/output-pvc-hf.yaml b/examples/output-pvc-hf.yaml index 2f96a515..192bfab9 100644 --- a/examples/output-pvc-hf.yaml +++ b/examples/output-pvc-hf.yaml @@ -6,7 +6,7 @@ kind: ServiceAccount metadata: name: pvc-hf-llm-d-modelservice labels: - helm.sh/chart: llm-d-modelservice-v0.2.16 + helm.sh/chart: llm-d-modelservice-v0.3.0 app.kubernetes.io/version: "v0.2.0" app.kubernetes.io/managed-by: Helm --- @@ -16,7 +16,7 @@ kind: Deployment metadata: name: pvc-hf-llm-d-modelservice-decode labels: - helm.sh/chart: llm-d-modelservice-v0.2.16 + helm.sh/chart: llm-d-modelservice-v0.3.0 app.kubernetes.io/version: "v0.2.0" app.kubernetes.io/managed-by: Helm spec: @@ -121,7 +121,7 @@ kind: Deployment metadata: name: pvc-hf-llm-d-modelservice-prefill labels: - helm.sh/chart: llm-d-modelservice-v0.2.16 + helm.sh/chart: llm-d-modelservice-v0.3.0 app.kubernetes.io/version: "v0.2.0" app.kubernetes.io/managed-by: Helm spec: diff --git a/examples/output-pvc.yaml b/examples/output-pvc.yaml index 27e7543a..6c1ebf13 100644 --- a/examples/output-pvc.yaml +++ b/examples/output-pvc.yaml @@ -6,7 +6,7 @@ kind: ServiceAccount metadata: name: pvc-llm-d-modelservice labels: - helm.sh/chart: llm-d-modelservice-v0.2.16 + helm.sh/chart: llm-d-modelservice-v0.3.0 app.kubernetes.io/version: "v0.2.0" app.kubernetes.io/managed-by: Helm --- @@ -16,7 +16,7 @@ kind: Deployment metadata: name: pvc-llm-d-modelservice-decode labels: - helm.sh/chart: llm-d-modelservice-v0.2.16 + helm.sh/chart: llm-d-modelservice-v0.3.0 app.kubernetes.io/version: "v0.2.0" app.kubernetes.io/managed-by: Helm spec: @@ -119,7 +119,7 @@ kind: Deployment metadata: name: pvc-llm-d-modelservice-prefill labels: - helm.sh/chart: llm-d-modelservice-v0.2.16 + 
helm.sh/chart: llm-d-modelservice-v0.3.0 app.kubernetes.io/version: "v0.2.0" app.kubernetes.io/managed-by: Helm spec: From 04777a25f0dc38bd6e017dde36ea604336dba2fb Mon Sep 17 00:00:00 2001 From: Michael Kalantar Date: Fri, 24 Oct 2025 08:50:26 -0400 Subject: [PATCH 4/4] document labels better --- README.md | 3 ++- charts/llm-d-modelservice/values.schema.json | 4 ++-- charts/llm-d-modelservice/values.schema.tmpl.json | 4 ++-- charts/llm-d-modelservice/values.yaml | 4 ++++ 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index fcb75ec0..c30c4e16 100644 --- a/README.md +++ b/README.md @@ -35,10 +35,11 @@ Once a model is deployed, inference requests must be routed to it. To do this, t ### Relationships -Note that when using the GAIE InferencePool chart together with the Modelservice chart the following relationships will exist: +Note that when using the GAIE [inferencepool chart](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/config/charts/inferencepool) together with the modelservice chart the following relationships will exist: - The modelservice field `modelArtifact.routing.servicePort` should match the GAIE field `inferencePool.targetPortNumber` or be an entry in the list `inferencePool.targets` (depending on the apiVersion of InferencePool). - The modelservice field `modelArtifact.labels` should match the GAIE field, `inferencePool.modelServers.matchLabels`. +Note that the label `llm-d.ai/role` will be added in addition to the labels specified in the `modelArtifacts.labels` field. 
### HTTPRoute diff --git a/charts/llm-d-modelservice/values.schema.json b/charts/llm-d-modelservice/values.schema.json index ac8c4d37..9b4c2bef 100644 --- a/charts/llm-d-modelservice/values.schema.json +++ b/charts/llm-d-modelservice/values.schema.json @@ -332,6 +332,7 @@ }, "labels": { "additionalProperties": false, + "description": " These should match the labels of any associated InferencePool In addition, the label llm-d.ai/role will be added with the value 'prefill' or 'decode' depending on the role of the pod.", "properties": { "llm-d.ai/inferenceServing": { "default": "true", @@ -347,8 +348,7 @@ } }, "required": [], - "title": "labels", - "type": "object" + "title": "labels" }, "mountPath": { "default": "/model-cache", diff --git a/charts/llm-d-modelservice/values.schema.tmpl.json b/charts/llm-d-modelservice/values.schema.tmpl.json index 909a646b..cb507ecb 100644 --- a/charts/llm-d-modelservice/values.schema.tmpl.json +++ b/charts/llm-d-modelservice/values.schema.tmpl.json @@ -332,6 +332,7 @@ }, "labels": { "additionalProperties": false, + "description": " These should match the labels of any associated InferencePool In addition, the label llm-d.ai/role will be added with the value 'prefill' or 'decode' depending on the role of the pod.", "properties": { "llm-d.ai/inferenceServing": { "default": "true", @@ -347,8 +348,7 @@ } }, "required": [], - "title": "labels", - "type": "object" + "title": "labels" }, "mountPath": { "default": "/model-cache", diff --git a/charts/llm-d-modelservice/values.yaml b/charts/llm-d-modelservice/values.yaml index 8c10e6e7..a06b528b 100644 --- a/charts/llm-d-modelservice/values.yaml +++ b/charts/llm-d-modelservice/values.yaml @@ -20,6 +20,10 @@ modelArtifacts: # name is the value of the model parameter in OpenAI requests # Required name: random/model + # Labels that will be added to the pods serving the model. 
+ # These should match the labels of any associated InferencePool + # In addition, the label llm-d.ai/role will be added with the value 'prefill' + # or 'decode' depending on the role of the pod. labels: llm-d.ai/inferenceServing: "true" llm-d.ai/model: random_model