
Commit d6b9a1d

modify CI
Signed-off-by: Rui Zhang <[email protected]>
1 parent 1726f57 commit d6b9a1d

6 files changed: +84, -143 lines changed


.github/values-06-session-routing.yaml

Lines changed: 18 additions & 33 deletions
@@ -4,75 +4,60 @@ servingEngineSpec:
   runtimeClassName: ""
   modelSpec:
     # Prefill node configuration
-    - name: "opt125m-prefill"
+    - name: "opt125m-1"
       repository: "lmcache/vllm-openai"
-      tag: "latest"
+      tag: "v0.3.5"
       modelURL: "facebook/opt-125m"
       replicaCount: 1
       requestCPU: 6
       requestMemory: "30Gi"
-      # requestGPU: 1
+      requestGPU: 1
       pvcStorage: "50Gi"
       vllmConfig:
         enablePrefixCaching: true
         maxModelLen: 1024
-        v1: 1
-        gpuMemoryUtilization: 0.6
+        gpuMemoryUtilization: 0.8
       lmcacheConfig:
-        cudaVisibleDevices: "0"
         enabled: true
-        kvRole: "kv_producer"
-        enableNixl: true
-        nixlRole: "sender"
-        nixlPeerHost: "vllm-opt125m-decode-engine-service"
-        nixlPeerPort: "55555"
-        nixlBufferSize: "1073741824" # 1GB
-        nixlBufferDevice: "cuda"
-        nixlEnableGc: true
-        enablePD: true
-        cpuOffloadingBufferSize: 0
+        cpuOffloadingBufferSize: "60"
         enableController: true
+        instanceId: "default1"
         controllerPort: 9000
         workerPort: 8001
         distributedUrl: "localhost:30081"
-      labels:
-        model: "opt125m-prefill"
+      env:
+        - name: LMCACHE_LOG_LEVEL
+          value: "DEBUG"
       chatTemplate: "chat.jinja2"
       chatTemplateConfigMap: |-
         {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
         {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}
     # Decode node configuration
-    - name: "opt125m-decode"
+    - name: "opt125m-2"
       repository: "lmcache/vllm-openai"
-      tag: "latest"
+      tag: "v0.3.5"
       modelURL: "facebook/opt-125m"
       replicaCount: 1
       requestCPU: 6
       requestMemory: "30Gi"
-      # requestGPU: 1
+      requestGPU: 1
       pvcStorage: "50Gi"
       vllmConfig:
         enablePrefixCaching: true
         maxModelLen: 1024
         v1: 1
+        gpuMemoryUtilization: 0.6
       lmcacheConfig:
-        cudaVisibleDevices: "1"
         enabled: true
-        kvRole: "kv_consumer" # Set decode node as consumer
-        enableNixl: true
-        nixlRole: "receiver"
-        nixlPeerHost: "0.0.0.0"
-        nixlPeerPort: "55555"
-        nixlBufferSize: "1073741824" # 1GB
-        nixlBufferDevice: "cuda"
-        nixlEnableGc: true
-        enablePD: true
+        cpuOffloadingBufferSize: "60"
         enableController: true
+        instanceId: "default2"
         controllerPort: 9000
         workerPort: 8002
         distributedUrl: "localhost:30082"
-      labels:
-        model: "opt125m-decode"
+      env:
+        - name: LMCACHE_LOG_LEVEL
+          value: "DEBUG"
       chatTemplate: "chat.jinja2"
       chatTemplateConfigMap: |-
         {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
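
For reference, a sketch of the first modelSpec entry as it reads after this change, reassembled from the diff above (YAML indentation is assumed; the "opt125m-2" entry and the other three routing values files follow the same shape, differing only in instanceId, ports, and GPU memory utilization):

    - name: "opt125m-1"
      repository: "lmcache/vllm-openai"
      tag: "v0.3.5"
      modelURL: "facebook/opt-125m"
      replicaCount: 1
      requestCPU: 6
      requestMemory: "30Gi"
      requestGPU: 1
      pvcStorage: "50Gi"
      vllmConfig:
        enablePrefixCaching: true
        maxModelLen: 1024
        gpuMemoryUtilization: 0.8
      lmcacheConfig:
        enabled: true
        cpuOffloadingBufferSize: "60"   # CPU offloading replaces the NIXL prefill/decode setup
        enableController: true
        instanceId: "default1"
        controllerPort: 9000
        workerPort: 8001
        distributedUrl: "localhost:30081"
      env:
        - name: LMCACHE_LOG_LEVEL
          value: "DEBUG"
      chatTemplate: "chat.jinja2"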

.github/values-07-prefix-routing.yaml

Lines changed: 18 additions & 33 deletions
@@ -4,75 +4,60 @@ servingEngineSpec:
   runtimeClassName: ""
   modelSpec:
     # Prefill node configuration
-    - name: "opt125m-prefill"
+    - name: "opt125m-1"
       repository: "lmcache/vllm-openai"
-      tag: "latest"
+      tag: "v0.3.5"
       modelURL: "facebook/opt-125m"
       replicaCount: 1
       requestCPU: 6
       requestMemory: "30Gi"
-      # requestGPU: 1
+      requestGPU: 1
       pvcStorage: "50Gi"
       vllmConfig:
         enablePrefixCaching: true
         maxModelLen: 1024
-        v1: 1
-        gpuMemoryUtilization: 0.6
+        gpuMemoryUtilization: 0.8
       lmcacheConfig:
-        cudaVisibleDevices: "0"
         enabled: true
-        kvRole: "kv_producer"
-        enableNixl: true
-        nixlRole: "sender"
-        nixlPeerHost: "vllm-opt125m-decode-engine-service"
-        nixlPeerPort: "55555"
-        nixlBufferSize: "1073741824" # 1GB
-        nixlBufferDevice: "cuda"
-        nixlEnableGc: true
-        enablePD: true
-        cpuOffloadingBufferSize: 0
+        cpuOffloadingBufferSize: "60"
         enableController: true
+        instanceId: "default1"
         controllerPort: 9000
         workerPort: 8001
         distributedUrl: "localhost:30081"
-      labels:
-        model: "opt125m-prefill"
+      env:
+        - name: LMCACHE_LOG_LEVEL
+          value: "DEBUG"
       chatTemplate: "chat.jinja2"
       chatTemplateConfigMap: |-
         {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
         {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}
     # Decode node configuration
-    - name: "opt125m-decode"
+    - name: "opt125m-2"
       repository: "lmcache/vllm-openai"
-      tag: "latest"
+      tag: "v0.3.5"
       modelURL: "facebook/opt-125m"
       replicaCount: 1
       requestCPU: 6
       requestMemory: "30Gi"
-      # requestGPU: 1
+      requestGPU: 1
       pvcStorage: "50Gi"
       vllmConfig:
         enablePrefixCaching: true
         maxModelLen: 1024
         v1: 1
+        gpuMemoryUtilization: 0.6
       lmcacheConfig:
-        cudaVisibleDevices: "1"
         enabled: true
-        kvRole: "kv_consumer" # Set decode node as consumer
-        enableNixl: true
-        nixlRole: "receiver"
-        nixlPeerHost: "0.0.0.0"
-        nixlPeerPort: "55555"
-        nixlBufferSize: "1073741824" # 1GB
-        nixlBufferDevice: "cuda"
-        nixlEnableGc: true
-        enablePD: true
+        cpuOffloadingBufferSize: "60"
         enableController: true
+        instanceId: "default2"
         controllerPort: 9000
         workerPort: 8002
         distributedUrl: "localhost:30082"
-      labels:
-        model: "opt125m-decode"
+      env:
+        - name: LMCACHE_LOG_LEVEL
+          value: "DEBUG"
       chatTemplate: "chat.jinja2"
       chatTemplateConfigMap: |-
         {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}

.github/values-08-roundrobin-routing.yaml

Lines changed: 18 additions & 33 deletions
@@ -4,75 +4,60 @@ servingEngineSpec:
   runtimeClassName: ""
   modelSpec:
     # Prefill node configuration
-    - name: "opt125m-prefill"
+    - name: "opt125m-1"
       repository: "lmcache/vllm-openai"
-      tag: "latest"
+      tag: "v0.3.5"
       modelURL: "facebook/opt-125m"
       replicaCount: 1
       requestCPU: 6
       requestMemory: "30Gi"
-      # requestGPU: 1
+      requestGPU: 1
       pvcStorage: "50Gi"
       vllmConfig:
         enablePrefixCaching: true
         maxModelLen: 1024
-        v1: 1
-        gpuMemoryUtilization: 0.6
+        gpuMemoryUtilization: 0.8
       lmcacheConfig:
-        cudaVisibleDevices: "0"
         enabled: true
-        kvRole: "kv_producer"
-        enableNixl: true
-        nixlRole: "sender"
-        nixlPeerHost: "vllm-opt125m-decode-engine-service"
-        nixlPeerPort: "55555"
-        nixlBufferSize: "1073741824" # 1GB
-        nixlBufferDevice: "cuda"
-        nixlEnableGc: true
-        enablePD: true
-        cpuOffloadingBufferSize: 0
+        cpuOffloadingBufferSize: "60"
         enableController: true
+        instanceId: "default1"
         controllerPort: 9000
         workerPort: 8001
         distributedUrl: "localhost:30081"
-      labels:
-        model: "opt125m-prefill"
+      env:
+        - name: LMCACHE_LOG_LEVEL
+          value: "DEBUG"
       chatTemplate: "chat.jinja2"
       chatTemplateConfigMap: |-
         {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
         {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}
     # Decode node configuration
-    - name: "opt125m-decode"
+    - name: "opt125m-2"
       repository: "lmcache/vllm-openai"
-      tag: "latest"
+      tag: "v0.3.5"
       modelURL: "facebook/opt-125m"
       replicaCount: 1
       requestCPU: 6
       requestMemory: "30Gi"
-      # requestGPU: 1
+      requestGPU: 1
       pvcStorage: "50Gi"
       vllmConfig:
         enablePrefixCaching: true
         maxModelLen: 1024
         v1: 1
+        gpuMemoryUtilization: 0.6
       lmcacheConfig:
-        cudaVisibleDevices: "1"
         enabled: true
-        kvRole: "kv_consumer" # Set decode node as consumer
-        enableNixl: true
-        nixlRole: "receiver"
-        nixlPeerHost: "0.0.0.0"
-        nixlPeerPort: "55555"
-        nixlBufferSize: "1073741824" # 1GB
-        nixlBufferDevice: "cuda"
-        nixlEnableGc: true
-        enablePD: true
+        cpuOffloadingBufferSize: "60"
         enableController: true
+        instanceId: "default2"
         controllerPort: 9000
         workerPort: 8002
         distributedUrl: "localhost:30082"
-      labels:
-        model: "opt125m-decode"
+      env:
+        - name: LMCACHE_LOG_LEVEL
+          value: "DEBUG"
       chatTemplate: "chat.jinja2"
       chatTemplateConfigMap: |-
         {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}

.github/values-09-kvaware-routing.yaml

Lines changed: 18 additions & 33 deletions
@@ -4,75 +4,60 @@ servingEngineSpec:
   runtimeClassName: ""
   modelSpec:
     # Prefill node configuration
-    - name: "opt125m-prefill"
+    - name: "opt125m-1"
       repository: "lmcache/vllm-openai"
-      tag: "latest"
+      tag: "v0.3.5"
       modelURL: "facebook/opt-125m"
       replicaCount: 1
       requestCPU: 6
       requestMemory: "30Gi"
-      # requestGPU: 1
+      requestGPU: 1
       pvcStorage: "50Gi"
       vllmConfig:
         enablePrefixCaching: true
         maxModelLen: 1024
-        v1: 1
-        gpuMemoryUtilization: 0.6
+        gpuMemoryUtilization: 0.8
       lmcacheConfig:
-        cudaVisibleDevices: "0"
         enabled: true
-        kvRole: "kv_producer"
-        enableNixl: true
-        nixlRole: "sender"
-        nixlPeerHost: "vllm-opt125m-decode-engine-service"
-        nixlPeerPort: "55555"
-        nixlBufferSize: "1073741824" # 1GB
-        nixlBufferDevice: "cuda"
-        nixlEnableGc: true
-        enablePD: true
-        cpuOffloadingBufferSize: 0
+        cpuOffloadingBufferSize: "60"
         enableController: true
+        instanceId: "default1"
         controllerPort: 9000
         workerPort: 8001
         distributedUrl: "localhost:30081"
-      labels:
-        model: "opt125m-prefill"
+      env:
+        - name: LMCACHE_LOG_LEVEL
+          value: "DEBUG"
       chatTemplate: "chat.jinja2"
       chatTemplateConfigMap: |-
         {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
         {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}
     # Decode node configuration
-    - name: "opt125m-decode"
+    - name: "opt125m-2"
       repository: "lmcache/vllm-openai"
-      tag: "latest"
+      tag: "v0.3.5"
       modelURL: "facebook/opt-125m"
       replicaCount: 1
       requestCPU: 6
       requestMemory: "30Gi"
-      # requestGPU: 1
+      requestGPU: 1
       pvcStorage: "50Gi"
       vllmConfig:
         enablePrefixCaching: true
         maxModelLen: 1024
         v1: 1
+        gpuMemoryUtilization: 0.6
       lmcacheConfig:
-        cudaVisibleDevices: "1"
         enabled: true
-        kvRole: "kv_consumer" # Set decode node as consumer
-        enableNixl: true
-        nixlRole: "receiver"
-        nixlPeerHost: "0.0.0.0"
-        nixlPeerPort: "55555"
-        nixlBufferSize: "1073741824" # 1GB
-        nixlBufferDevice: "cuda"
-        nixlEnableGc: true
-        enablePD: true
+        cpuOffloadingBufferSize: "60"
         enableController: true
+        instanceId: "default2"
         controllerPort: 9000
         workerPort: 8002
         distributedUrl: "localhost:30082"
-      labels:
-        model: "opt125m-decode"
+      env:
+        - name: LMCACHE_LOG_LEVEL
+          value: "DEBUG"
       chatTemplate: "chat.jinja2"
       chatTemplateConfigMap: |-
         {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}

helm/templates/deployment-vllm-multi.yaml

Lines changed: 1 addition & 0 deletions
@@ -109,6 +109,7 @@ spec:
           {{- if or
             (eq $modelSpec.tag "2025-05-27-v1")
            (eq $modelSpec.tag "2025-05-17-v1")
+            (eq $modelSpec.tag "v0.3.5")
             (eq $modelSpec.tag "latest-nightly")
             (hasPrefix "nightly-" $modelSpec.tag)
             (and
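
The values files above now pin the image tag to "v0.3.5", so the template's tag check is extended to recognize it; otherwise the block gated by this condition would not apply to the CI deployments. A minimal, hypothetical sketch of the technique (the settings actually gated by this condition are not shown in the diff):

{{- /* hypothetical illustration: listed image tags opt a modelSpec into the gated settings */}}
{{- if or
      (eq $modelSpec.tag "v0.3.5")
      (eq $modelSpec.tag "latest-nightly")
      (hasPrefix "nightly-" $modelSpec.tag) }}
  # settings applied only for recognized image tags (placeholder)
{{- else }}
  # default settings (placeholder)
{{- end }}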
