60 changes: 29 additions & 31 deletions .github/values-06-session-routing.yaml
@@ -3,13 +3,12 @@ servingEngineSpec:
type: Recreate
runtimeClassName: ""
modelSpec:
# Prefill node configuration
- name: "opt125m-prefill"
- name: "opt125m-1"
repository: "lmcache/vllm-openai"
tag: "2025-05-27-v1"
modelURL: "facebook/opt-125m"
replicaCount: 1
requestCPU: 8
requestCPU: 6
requestMemory: "30Gi"
# requestGPU: 1
pvcStorage: "50Gi"
@@ -18,53 +17,51 @@ servingEngineSpec:
maxModelLen: 1024
v1: 1
gpuMemoryUtilization: 0.6

lmcacheConfig:
cudaVisibleDevices: "0"
enabled: true
kvRole: "kv_producer"
enableNixl: true
nixlRole: "sender"
nixlPeerHost: "vllm-opt125m-decode-engine-service"
nixlPeerPort: "55555"
nixlBufferSize: "1073741824" # 1GB
nixlBufferDevice: "cuda"
nixlEnableGc: true
enablePD: true
cpuOffloadingBufferSize: 0
labels:
model: "opt125m-prefill"
cpuOffloadingBufferSize: "10"
enableController: true
instanceId: "default1"
controllerPort: "9000"
workerPort: "8001"

env:
- name: LMCACHE_LOG_LEVEL
value: "DEBUG"

chatTemplate: "chat.jinja2"
chatTemplateConfigMap: |-
{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}
# Decode node configuration
- name: "opt125m-decode"

- name: "opt125m-2"
repository: "lmcache/vllm-openai"
tag: "2025-05-27-v1"
modelURL: "facebook/opt-125m"
replicaCount: 1
requestCPU: 8
requestCPU: 6
requestMemory: "30Gi"
# requestGPU: 1
pvcStorage: "50Gi"
vllmConfig:
enablePrefixCaching: true
maxModelLen: 1024
v1: 1
gpuMemoryUtilization: 0.6

lmcacheConfig:
cudaVisibleDevices: "1"
enabled: true
kvRole: "kv_consumer" # Set decode node as consumer
enableNixl: true
nixlRole: "receiver"
nixlPeerHost: "0.0.0.0"
nixlPeerPort: "55555"
nixlBufferSize: "1073741824" # 1GB
nixlBufferDevice: "cuda"
nixlEnableGc: true
enablePD: true
labels:
model: "opt125m-decode"
cpuOffloadingBufferSize: "10"
enableController: true
instanceId: "default2"
controllerPort: "9000"
workerPort: "8002"

env:
- name: LMCACHE_LOG_LEVEL
value: "DEBUG"

chatTemplate: "chat.jinja2"
chatTemplateConfigMap: |-
{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
Expand All @@ -82,6 +79,7 @@ routerSpec:
enableRouter: true
routingLogic: "session"
sessionKey: "x-user-id"
lmcacheControllerPort: 9000
extraArgs:
- "--log-level"
- "info"
61 changes: 30 additions & 31 deletions .github/values-07-prefix-routing.yaml
@@ -3,13 +3,12 @@ servingEngineSpec:
type: Recreate
runtimeClassName: ""
modelSpec:
# Prefill node configuration
- name: "opt125m-prefill"
- name: "opt125m-1"
repository: "lmcache/vllm-openai"
tag: "2025-05-27-v1"
modelURL: "facebook/opt-125m"
replicaCount: 1
requestCPU: 8
requestCPU: 6
requestMemory: "30Gi"
# requestGPU: 1
pvcStorage: "50Gi"
@@ -18,53 +17,51 @@ servingEngineSpec:
maxModelLen: 1024
v1: 1
gpuMemoryUtilization: 0.6

lmcacheConfig:
cudaVisibleDevices: "0"
enabled: true
kvRole: "kv_producer"
enableNixl: true
nixlRole: "sender"
nixlPeerHost: "vllm-opt125m-decode-engine-service"
nixlPeerPort: "55555"
nixlBufferSize: "1073741824" # 1GB
nixlBufferDevice: "cuda"
nixlEnableGc: true
enablePD: true
cpuOffloadingBufferSize: 0
labels:
model: "opt125m-prefill"
cpuOffloadingBufferSize: "10"
enableController: true
instanceId: "default1"
controllerPort: "9000"
workerPort: "8001"

env:
- name: LMCACHE_LOG_LEVEL
value: "DEBUG"

chatTemplate: "chat.jinja2"
chatTemplateConfigMap: |-
{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}
# Decode node configuration
- name: "opt125m-decode"

- name: "opt125m-2"
repository: "lmcache/vllm-openai"
tag: "2025-05-27-v1"
modelURL: "facebook/opt-125m"
replicaCount: 1
requestCPU: 8
requestCPU: 6
requestMemory: "30Gi"
# requestGPU: 1
pvcStorage: "50Gi"
vllmConfig:
enablePrefixCaching: true
maxModelLen: 1024
v1: 1
gpuMemoryUtilization: 0.6

lmcacheConfig:
cudaVisibleDevices: "1"
enabled: true
kvRole: "kv_consumer" # Set decode node as consumer
enableNixl: true
nixlRole: "receiver"
nixlPeerHost: "0.0.0.0"
nixlPeerPort: "55555"
nixlBufferSize: "1073741824" # 1GB
nixlBufferDevice: "cuda"
nixlEnableGc: true
enablePD: true
labels:
model: "opt125m-decode"
cpuOffloadingBufferSize: "10"
enableController: true
instanceId: "default2"
controllerPort: "9000"
workerPort: "8002"

env:
- name: LMCACHE_LOG_LEVEL
value: "DEBUG"

chatTemplate: "chat.jinja2"
chatTemplateConfigMap: |-
{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
Expand All @@ -81,6 +78,8 @@ routerSpec:
type: Recreate
enableRouter: true
routingLogic: "prefixaware"
sessionKey: "x-user-id"
lmcacheControllerPort: 9000
extraArgs:
- "--log-level"
- "info"
61 changes: 30 additions & 31 deletions .github/values-08-roundrobin-routing.yaml
@@ -3,13 +3,12 @@ servingEngineSpec:
type: Recreate
runtimeClassName: ""
modelSpec:
# Prefill node configuration
- name: "opt125m-prefill"
- name: "opt125m-1"
repository: "lmcache/vllm-openai"
tag: "2025-05-27-v1"
modelURL: "facebook/opt-125m"
replicaCount: 1
requestCPU: 8
requestCPU: 6
requestMemory: "30Gi"
# requestGPU: 1
pvcStorage: "50Gi"
@@ -18,53 +17,51 @@ servingEngineSpec:
maxModelLen: 1024
v1: 1
gpuMemoryUtilization: 0.6

lmcacheConfig:
cudaVisibleDevices: "0"
enabled: true
kvRole: "kv_producer"
enableNixl: true
nixlRole: "sender"
nixlPeerHost: "vllm-opt125m-decode-engine-service"
nixlPeerPort: "55555"
nixlBufferSize: "1073741824" # 1GB
nixlBufferDevice: "cuda"
nixlEnableGc: true
enablePD: true
cpuOffloadingBufferSize: 0
labels:
model: "opt125m-prefill"
cpuOffloadingBufferSize: "10"
enableController: true
instanceId: "default1"
controllerPort: "9000"
workerPort: "8001"

env:
- name: LMCACHE_LOG_LEVEL
value: "DEBUG"

chatTemplate: "chat.jinja2"
chatTemplateConfigMap: |-
{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}
# Decode node configuration
- name: "opt125m-decode"

- name: "opt125m-2"
repository: "lmcache/vllm-openai"
tag: "2025-05-27-v1"
modelURL: "facebook/opt-125m"
replicaCount: 1
requestCPU: 8
requestCPU: 6
requestMemory: "30Gi"
# requestGPU: 1
pvcStorage: "50Gi"
vllmConfig:
enablePrefixCaching: true
maxModelLen: 1024
v1: 1
gpuMemoryUtilization: 0.6

lmcacheConfig:
cudaVisibleDevices: "1"
enabled: true
kvRole: "kv_consumer" # Set decode node as consumer
enableNixl: true
nixlRole: "receiver"
nixlPeerHost: "0.0.0.0"
nixlPeerPort: "55555"
nixlBufferSize: "1073741824" # 1GB
nixlBufferDevice: "cuda"
nixlEnableGc: true
enablePD: true
labels:
model: "opt125m-decode"
cpuOffloadingBufferSize: "10"
enableController: true
instanceId: "default2"
controllerPort: "9000"
workerPort: "8002"

env:
- name: LMCACHE_LOG_LEVEL
value: "DEBUG"

chatTemplate: "chat.jinja2"
chatTemplateConfigMap: |-
{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
Expand All @@ -81,6 +78,8 @@ routerSpec:
type: Recreate
enableRouter: true
routingLogic: "roundrobin"
sessionKey: "x-user-id"
lmcacheControllerPort: 9000
extraArgs:
- "--log-level"
- "info"