Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 10 additions & 8 deletions charts/llm-d-modelservice/templates/epp-deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,20 +21,22 @@ spec:
imagePullPolicy: Always
image: {{ required "routing.epp.image must be specified" .Values.routing.epp.image }}
args:
- --poolName
- --pool-name
- {{ include "llm-d-modelservice.inferencePoolName" . }}
- --poolNamespace
- --pool-namespace
- {{ .Release.Namespace }}
- -v
- "{{ default 4 .Values.routing.epp.debugLevel }}"
- "--pool-group"
- "inference.networking.x-k8s.io"
- --v
- "4"
- --zap-encoder
- json
- --grpcPort
- "json"
- --grpc-port
- "9002"
- --grpcHealthPort
- --grpc-health-port
- "9003"
{{- if .Values.routing.epp.pluginsConfigFile }}
- "-configFile"
- "--config-file"
- "config/{{ .Values.routing.epp.pluginsConfigFile }}"
{{- end}}
{{- with .Values.routing.epp.env }}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,15 @@ data:
maxPrefixBlocksToMatch: 256
lruCapacityPerServer: 31250
- type: prefill-filter
parameters:
selector:
matchLabels:
llm-d.ai/role: prefill
- type: decode-filter
parameters:
selector:
matchLabels:
llm-d.ai/role: decode
- type: max-score-picker
- type: pd-profile-handler
parameters:
Expand Down
2 changes: 1 addition & 1 deletion charts/llm-d-modelservice/templates/epp-role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ rules:
- apiGroups:
- inference.networking.x-k8s.io
resources:
- inferencemodels
- inferenceobjectives
- inferencepools
verbs:
- get
Expand Down
1 change: 0 additions & 1 deletion charts/llm-d-modelservice/templates/httproute.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ spec:
- group: {{ .Values.routing.inferencePool.apiGroup }}
kind: InferencePool
name: {{ include "llm-d-modelservice.inferencePoolName" . }}
port: {{ .Values.routing.servicePort }}
weight: 1
{{- if .Values.routing.httpRoute.timeouts}}
timeouts:
Expand Down
6 changes: 3 additions & 3 deletions charts/llm-d-modelservice/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -169,9 +169,9 @@ routing:

# Allow people to opt out of timeouts by unsetting the default value.
# They are set to 0s which in most situations defaults to the providers max timeout.
timeouts:
backendRequest: 0s
request: 0s
# timeouts:
# backendRequest: 0s
# request: 0s

# @schema
# additionalProperties: true
Expand Down
32 changes: 19 additions & 13 deletions examples/output-cpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,15 @@ data:
maxPrefixBlocksToMatch: 256
lruCapacityPerServer: 31250
- type: prefill-filter
parameters:
selector:
matchLabels:
llm-d.ai/role: prefill
- type: decode-filter
parameters:
selector:
matchLabels:
llm-d.ai/role: decode
- type: max-score-picker
- type: pd-profile-handler
parameters:
Expand Down Expand Up @@ -134,7 +142,7 @@ rules:
- apiGroups:
- inference.networking.x-k8s.io
resources:
- inferencemodels
- inferenceobjectives
- inferencepools
verbs:
- get
Expand Down Expand Up @@ -308,19 +316,21 @@ spec:
imagePullPolicy: Always
image: ghcr.io/llm-d/llm-d-inference-scheduler:v0.2.1
args:
- --poolName
- --pool-name
- cpu-sim-llm-d-modelservice
- --poolNamespace
- --pool-namespace
- default
- -v
- "6"
- "--pool-group"
- "inference.networking.x-k8s.io"
- --v
- "4"
- --zap-encoder
- json
- --grpcPort
- "json"
- --grpc-port
- "9002"
- --grpcHealthPort
- --grpc-health-port
- "9003"
- "-configFile"
- "--config-file"
- "config/default-config.yaml"
ports:
- containerPort: 9002
Expand Down Expand Up @@ -448,11 +458,7 @@ spec:
- group: inference.networking.x-k8s.io
kind: InferencePool
name: cpu-sim-llm-d-modelservice
port: 8000
weight: 1
timeouts:
backendRequest: 0s
request: 0s
matches:
- headers:
- name: x-model-name
Expand Down
76 changes: 36 additions & 40 deletions examples/output-pd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,15 @@ data:
maxPrefixBlocksToMatch: 256
lruCapacityPerServer: 31250
- type: prefill-filter
parameters:
selector:
matchLabels:
llm-d.ai/role: prefill
- type: decode-filter
parameters:
selector:
matchLabels:
llm-d.ai/role: decode
- type: max-score-picker
- type: pd-profile-handler
parameters:
Expand Down Expand Up @@ -134,7 +142,7 @@ rules:
- apiGroups:
- inference.networking.x-k8s.io
resources:
- inferencemodels
- inferenceobjectives
- inferencepools
verbs:
- get
Expand Down Expand Up @@ -254,7 +262,7 @@ spec:

containers:
- name: vllm
image: ghcr.io/llm-d/llm-d:v0.2.0
image: ghcr.io/llm-d/llm-d-cuda:v0.3.0

command: ["vllm", "serve"]
args:
Expand All @@ -277,7 +285,7 @@ spec:
fieldRef:
fieldPath: status.podIP
- name: VLLM_NIXL_SIDE_CHANNEL_PORT
value: "5557"
value: "5600"
- name: VLLM_LOGGING_LEVEL
value: DEBUG
- name: DP_SIZE
Expand All @@ -292,14 +300,16 @@ spec:
protocol: TCP
- containerPort: 5557
protocol: TCP
- containerPort: 5600
protocol: TCP

resources:
limits:
cpu: "16"
cpu: "6"
memory: 16Gi
nvidia.com/gpu: "1"
requests:
cpu: "16"
cpu: "6"
memory: 16Gi
nvidia.com/gpu: "1"

Expand Down Expand Up @@ -328,22 +338,24 @@ spec:
containers:
- name: epp
imagePullPolicy: Always
image: ghcr.io/llm-d/llm-d-inference-scheduler:v0.2.1
image: ghcr.io/llm-d/llm-d-inference-scheduler:v0.3.1
args:
- --poolName
- --pool-name
- pd-llm-d-modelservice
- --poolNamespace
- --pool-namespace
- default
- -v
- "--pool-group"
- "inference.networking.x-k8s.io"
- --v
- "4"
- --zap-encoder
- json
- --grpcPort
- "json"
- --grpc-port
- "9002"
- --grpcHealthPort
- --grpc-health-port
- "9003"
- "-configFile"
- "config/default-config.yaml"
- "--config-file"
- "config/default-pd-config.yaml"
ports:
- containerPort: 9002
name: grpc
Expand Down Expand Up @@ -417,7 +429,7 @@ spec:

containers:
- name: vllm
image: ghcr.io/llm-d/llm-d:v0.2.0
image: ghcr.io/llm-d/llm-d-cuda:v0.3.0

command: ["vllm", "serve"]
args:
Expand All @@ -436,7 +448,7 @@ spec:
- name: UCX_TLS
value: cuda_ipc,cuda_copy,tcp
- name: VLLM_NIXL_SIDE_CHANNEL_PORT
value: "5557"
value: "5600"
- name: VLLM_NIXL_SIDE_CHANNEL_HOST
valueFrom:
fieldRef:
Expand All @@ -455,35 +467,23 @@ spec:
protocol: TCP
- containerPort: 5557
protocol: TCP
- containerPort: 5600
protocol: TCP

resources:
limits:
cpu: "16"
cpu: "6"
memory: 16Gi
nvidia.com/gpu: "1"
requests:
cpu: "16"
cpu: "6"
memory: 16Gi
nvidia.com/gpu: "1"

volumeMounts:
- name: model-storage
mountPath: /model-cache
---
# Source: llm-d-modelservice/templates/inferencemodel.yaml
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferenceModel
metadata:
labels:
llm-d.ai/inferenceServing: "true"
llm-d.ai/model: pd-llm-d-modelservice
name: pd-llm-d-modelservice
spec:
criticality: Critical
modelName: facebook/opt-125m
poolRef:
name: pd-llm-d-modelservice
---
# Source: llm-d-modelservice/templates/inferencepool.yaml
apiVersion: inference.networking.x-k8s.io/v1alpha2
kind: InferencePool
Expand Down Expand Up @@ -524,13 +524,9 @@ spec:
- group: inference.networking.x-k8s.io
kind: InferencePool
name: pd-llm-d-modelservice
port: 8000
weight: 1
timeouts:
backendRequest: 0s
request: 0s
matches:
- headers:
- name: x-model-name
type: Exact
value: 'facebook/opt-125m'
- headers: []
path:
type: PathPrefix
value: /
Loading
Loading