Skip to content

Commit b8461b6

Browse files
authored
chore: updated health checks to use new probes (#2124)
1 parent 222245e commit b8461b6

File tree

5 files changed

+182
-120
lines changed

5 files changed

+182
-120
lines changed

components/backends/vllm/deploy/agg.yaml

Lines changed: 23 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -48,24 +48,19 @@ spec:
4848
VllmDecodeWorker:
4949
envFromSecret: hf-token-secret
5050
livenessProbe:
51-
exec:
52-
command:
53-
- /bin/sh
54-
- -c
55-
- "exit 0"
56-
periodSeconds: 60
51+
httpGet:
52+
path: /live
53+
port: 9090
54+
periodSeconds: 5
5755
timeoutSeconds: 30
58-
failureThreshold: 10
56+
failureThreshold: 1
5957
readinessProbe:
60-
exec:
61-
command:
62-
- /bin/sh
63-
- -c
64-
- 'grep "VllmWorker.*has been initialized" /tmp/vllm.log'
65-
initialDelaySeconds: 60
66-
periodSeconds: 60
58+
httpGet:
59+
path: /health
60+
port: 9090
61+
periodSeconds: 10
6762
timeoutSeconds: 30
68-
failureThreshold: 10
63+
failureThreshold: 60
6964
dynamoNamespace: vllm-agg
7065
componentType: worker
7166
replicas: 1
@@ -78,8 +73,21 @@ spec:
7873
cpu: "10"
7974
memory: "20Gi"
8075
gpu: "1"
76+
envs:
77+
- name: DYN_SYSTEM_ENABLED
78+
value: "true"
79+
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
80+
value: "[\"generate\"]"
81+
- name: DYN_SYSTEM_PORT
82+
value: "9090"
8183
extraPodSpec:
8284
mainContainer:
85+
startupProbe:
86+
httpGet:
87+
path: /health
88+
port: 9090
89+
periodSeconds: 10
90+
failureThreshold: 60
8391
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17
8492
workingDir: /workspace/components/backends/vllm
8593
command:

components/backends/vllm/deploy/agg_router.yaml

Lines changed: 23 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -48,24 +48,19 @@ spec:
4848
VllmDecodeWorker:
4949
envFromSecret: hf-token-secret
5050
livenessProbe:
51-
exec:
52-
command:
53-
- /bin/sh
54-
- -c
55-
- "exit 0"
56-
periodSeconds: 60
51+
httpGet:
52+
path: /live
53+
port: 9090
54+
periodSeconds: 5
5755
timeoutSeconds: 30
58-
failureThreshold: 10
56+
failureThreshold: 1
5957
readinessProbe:
60-
exec:
61-
command:
62-
- /bin/sh
63-
- -c
64-
- 'grep "VllmWorker.*has been initialized" /tmp/vllm.log'
65-
initialDelaySeconds: 60
66-
periodSeconds: 60
58+
httpGet:
59+
path: /health
60+
port: 9090
61+
periodSeconds: 10
6762
timeoutSeconds: 30
68-
failureThreshold: 10
63+
failureThreshold: 60
6964
dynamoNamespace: vllm-agg-router
7065
componentType: worker
7166
replicas: 2
@@ -78,8 +73,21 @@ spec:
7873
cpu: "10"
7974
memory: "20Gi"
8075
gpu: "1"
76+
envs:
77+
- name: DYN_SYSTEM_ENABLED
78+
value: "true"
79+
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
80+
value: "[\"generate\"]"
81+
- name: DYN_SYSTEM_PORT
82+
value: "9090"
8183
extraPodSpec:
8284
mainContainer:
85+
startupProbe:
86+
httpGet:
87+
path: /health
88+
port: 9090
89+
periodSeconds: 10
90+
failureThreshold: 60
8391
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17
8492
workingDir: /workspace/components/backends/vllm
8593
command:

components/backends/vllm/deploy/disagg.yaml

Lines changed: 46 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -51,24 +51,19 @@ spec:
5151
componentType: worker
5252
replicas: 1
5353
livenessProbe:
54-
exec:
55-
command:
56-
- /bin/sh
57-
- -c
58-
- "exit 0"
59-
periodSeconds: 60
54+
httpGet:
55+
path: /live
56+
port: 9090
57+
periodSeconds: 5
6058
timeoutSeconds: 30
61-
failureThreshold: 10
59+
failureThreshold: 1
6260
readinessProbe:
63-
exec:
64-
command:
65-
- /bin/sh
66-
- -c
67-
- 'grep "VllmWorker.*has been initialized" /tmp/vllm.log'
68-
initialDelaySeconds: 60
69-
periodSeconds: 60
61+
httpGet:
62+
path: /health
63+
port: 9090
64+
periodSeconds: 10
7065
timeoutSeconds: 30
71-
failureThreshold: 10
66+
failureThreshold: 60
7267
resources:
7368
requests:
7469
cpu: "32"
@@ -78,8 +73,21 @@ spec:
7873
cpu: "32"
7974
memory: "40Gi"
8075
gpu: "1"
76+
envs:
77+
- name: DYN_SYSTEM_ENABLED
78+
value: "true"
79+
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
80+
value: "[\"generate\"]"
81+
- name: DYN_SYSTEM_PORT
82+
value: "9090"
8183
extraPodSpec:
8284
mainContainer:
85+
startupProbe:
86+
httpGet:
87+
path: /health
88+
port: 9090
89+
periodSeconds: 10
90+
failureThreshold: 60
8391
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17
8492
workingDir: /workspace/components/backends/vllm
8593
command:
@@ -93,24 +101,19 @@ spec:
93101
componentType: worker
94102
replicas: 1
95103
livenessProbe:
96-
exec:
97-
command:
98-
- /bin/sh
99-
- -c
100-
- "exit 0"
101-
periodSeconds: 60
104+
httpGet:
105+
path: /live
106+
port: 9090
107+
periodSeconds: 5
102108
timeoutSeconds: 30
103-
failureThreshold: 10
109+
failureThreshold: 1
104110
readinessProbe:
105-
exec:
106-
command:
107-
- /bin/sh
108-
- -c
109-
- 'grep "VllmWorker.*has been initialized" /tmp/vllm.log'
110-
initialDelaySeconds: 60
111-
periodSeconds: 60
111+
httpGet:
112+
path: /health
113+
port: 9090
114+
periodSeconds: 10
112115
timeoutSeconds: 30
113-
failureThreshold: 10
116+
failureThreshold: 60
114117
resources:
115118
requests:
116119
cpu: "32"
@@ -120,8 +123,21 @@ spec:
120123
cpu: "32"
121124
memory: "40Gi"
122125
gpu: "1"
126+
envs:
127+
- name: DYN_SYSTEM_ENABLED
128+
value: "true"
129+
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
130+
value: "[\"generate\"]"
131+
- name: DYN_SYSTEM_PORT
132+
value: "9090"
123133
extraPodSpec:
124134
mainContainer:
135+
startupProbe:
136+
httpGet:
137+
path: /health
138+
port: 9090
139+
periodSeconds: 10
140+
failureThreshold: 60
125141
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17
126142
workingDir: /workspace/components/backends/vllm
127143
command:

components/backends/vllm/deploy/disagg_planner.yaml

Lines changed: 46 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -51,24 +51,19 @@ spec:
5151
componentType: worker
5252
replicas: 1
5353
livenessProbe:
54-
exec:
55-
command:
56-
- /bin/sh
57-
- -c
58-
- "exit 0"
59-
periodSeconds: 60
54+
httpGet:
55+
path: /live
56+
port: 9090
57+
periodSeconds: 5
6058
timeoutSeconds: 30
61-
failureThreshold: 10
59+
failureThreshold: 1
6260
readinessProbe:
63-
exec:
64-
command:
65-
- /bin/sh
66-
- -c
67-
- 'grep "VllmWorker.*has been initialized" /tmp/vllm.log'
68-
initialDelaySeconds: 60
69-
periodSeconds: 60
61+
httpGet:
62+
path: /health
63+
port: 9090
64+
periodSeconds: 10
7065
timeoutSeconds: 30
71-
failureThreshold: 10
66+
failureThreshold: 60
7267
resources:
7368
requests:
7469
cpu: "10"
@@ -78,8 +73,21 @@ spec:
7873
cpu: "10"
7974
memory: "20Gi"
8075
gpu: "1"
76+
envs:
77+
- name: DYN_SYSTEM_ENABLED
78+
value: "true"
79+
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
80+
value: "[\"generate\"]"
81+
- name: DYN_SYSTEM_PORT
82+
value: "9090"
8183
extraPodSpec:
8284
mainContainer:
85+
startupProbe:
86+
httpGet:
87+
path: /health
88+
port: 9090
89+
periodSeconds: 10
90+
failureThreshold: 60
8391
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17
8492
workingDir: /workspace/components/backends/vllm
8593
command:
@@ -93,24 +101,19 @@ spec:
93101
componentType: worker
94102
replicas: 1
95103
livenessProbe:
96-
exec:
97-
command:
98-
- /bin/sh
99-
- -c
100-
- "exit 0"
101-
periodSeconds: 60
104+
httpGet:
105+
path: /live
106+
port: 9090
107+
periodSeconds: 5
102108
timeoutSeconds: 30
103-
failureThreshold: 10
109+
failureThreshold: 1
104110
readinessProbe:
105-
exec:
106-
command:
107-
- /bin/sh
108-
- -c
109-
- 'grep "VllmWorker.*has been initialized" /tmp/vllm.log'
110-
initialDelaySeconds: 60
111-
periodSeconds: 60
111+
httpGet:
112+
path: /health
113+
port: 9090
114+
periodSeconds: 10
112115
timeoutSeconds: 30
113-
failureThreshold: 10
116+
failureThreshold: 60
114117
resources:
115118
requests:
116119
cpu: "10"
@@ -120,8 +123,21 @@ spec:
120123
cpu: "10"
121124
memory: "20Gi"
122125
gpu: "1"
126+
envs:
127+
- name: DYN_SYSTEM_ENABLED
128+
value: "true"
129+
- name: DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS
130+
value: "[\"generate\"]"
131+
- name: DYN_SYSTEM_PORT
132+
value: "9090"
123133
extraPodSpec:
124134
mainContainer:
135+
startupProbe:
136+
httpGet:
137+
path: /health
138+
port: 9090
139+
periodSeconds: 10
140+
failureThreshold: 60
125141
image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17
126142
workingDir: /workspace/components/backends/vllm
127143
command:

0 commit comments

Comments
 (0)