71 changes: 36 additions & 35 deletions .github/values-06-session-routing.yaml
@@ -4,67 +4,60 @@ servingEngineSpec:
  runtimeClassName: ""
  modelSpec:
    # Prefill node configuration
    - name: "opt125m-prefill"
    - name: "opt125m-1"
      repository: "lmcache/vllm-openai"
      tag: "2025-05-27-v1"
      tag: "v0.3.5"
      modelURL: "facebook/opt-125m"
      replicaCount: 1
      requestCPU: 8
      requestCPU: 6
      requestMemory: "30Gi"
      # requestGPU: 1
      requestGPU: 1
      pvcStorage: "50Gi"
      vllmConfig:
        enablePrefixCaching: true
        maxModelLen: 1024
        v1: 1
        gpuMemoryUtilization: 0.6
        gpuMemoryUtilization: 0.8
      lmcacheConfig:
        cudaVisibleDevices: "0"
        enabled: true
        kvRole: "kv_producer"
        enableNixl: true
        nixlRole: "sender"
        nixlPeerHost: "vllm-opt125m-decode-engine-service"
        nixlPeerPort: "55555"
        nixlBufferSize: "1073741824" # 1GB
        nixlBufferDevice: "cuda"
        nixlEnableGc: true
        enablePD: true
        cpuOffloadingBufferSize: 0
      labels:
        model: "opt125m-prefill"
        cpuOffloadingBufferSize: "10"
        enableController: true
        instanceId: "default1"
        controllerPort: 9000
        workerPort: 8001
        distributedUrl: "localhost:30081"
      env:
        - name: LMCACHE_LOG_LEVEL
          value: "DEBUG"
      chatTemplate: "chat.jinja2"
      chatTemplateConfigMap: |-
        {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
        {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}
    # Decode node configuration
    - name: "opt125m-decode"
    - name: "opt125m-2"
      repository: "lmcache/vllm-openai"
      tag: "2025-05-27-v1"
      tag: "v0.3.5"
      modelURL: "facebook/opt-125m"
      replicaCount: 1
      requestCPU: 8
      requestCPU: 6
      requestMemory: "30Gi"
      # requestGPU: 1
      requestGPU: 1
      pvcStorage: "50Gi"
      vllmConfig:
        enablePrefixCaching: true
        maxModelLen: 1024
        v1: 1
        gpuMemoryUtilization: 0.6
      lmcacheConfig:
        cudaVisibleDevices: "1"
        enabled: true
        kvRole: "kv_consumer" # Set decode node as consumer
        enableNixl: true
        nixlRole: "receiver"
        nixlPeerHost: "0.0.0.0"
        nixlPeerPort: "55555"
        nixlBufferSize: "1073741824" # 1GB
        nixlBufferDevice: "cuda"
        nixlEnableGc: true
        enablePD: true
      labels:
        model: "opt125m-decode"
        cpuOffloadingBufferSize: "10"
        enableController: true
        instanceId: "default2"
        controllerPort: 9000
        workerPort: 8002
        distributedUrl: "localhost:30082"
      env:
        - name: LMCACHE_LOG_LEVEL
          value: "DEBUG"
      chatTemplate: "chat.jinja2"
      chatTemplateConfigMap: |-
        {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
@@ -81,6 +74,14 @@ routerSpec:
    type: Recreate
  enableRouter: true
  routingLogic: "session"
  resources:
    requests:
      cpu: "1"
      memory: "2G"
    limits:
      cpu: "1"
      memory: "2G"
  lmcacheControllerPort: 9000
  sessionKey: "x-user-id"
  extraArgs:
    - "--log-level"
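The session-routing values above pin requests to an engine based on the header named by `sessionKey`, which is `x-user-id` in this file. A minimal sketch of how that behaviour could be exercised once the chart is deployed with these values — the router URL, the port-forward, and the use of the `requests` library are assumptions for illustration, not part of this diff:

```python
# Hypothetical smoke test for the session-routing values (not part of this PR).
# Assumes the chart was installed with this file and the router Service is
# port-forwarded to localhost:30080.
import requests

ROUTER_URL = "http://localhost:30080/v1/completions"

def complete(user_id: str, prompt: str) -> dict:
    # The router keys session affinity on the header named by sessionKey,
    # which is "x-user-id" in this values file.
    resp = requests.post(
        ROUTER_URL,
        headers={"x-user-id": user_id},
        json={"model": "facebook/opt-125m", "prompt": prompt, "max_tokens": 16},
        timeout=30,
    )
    resp.raise_for_status()
    return resp.json()

# Two requests with the same x-user-id should land on the same engine pod;
# which pod served them can be checked in the router logs (--log-level info).
for _ in range(2):
    out = complete("alice", "Hello from the session-routing test.")
    print(out["choices"][0]["text"])
```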
71 changes: 36 additions & 35 deletions .github/values-07-prefix-routing.yaml
@@ -4,67 +4,60 @@ servingEngineSpec:
  runtimeClassName: ""
  modelSpec:
    # Prefill node configuration
    - name: "opt125m-prefill"
    - name: "opt125m-1"
      repository: "lmcache/vllm-openai"
      tag: "2025-05-27-v1"
      tag: "v0.3.5"
      modelURL: "facebook/opt-125m"
      replicaCount: 1
      requestCPU: 8
      requestCPU: 6
      requestMemory: "30Gi"
      # requestGPU: 1
      requestGPU: 1
      pvcStorage: "50Gi"
      vllmConfig:
        enablePrefixCaching: true
        maxModelLen: 1024
        v1: 1
        gpuMemoryUtilization: 0.6
        gpuMemoryUtilization: 0.8
      lmcacheConfig:
        cudaVisibleDevices: "0"
        enabled: true
        kvRole: "kv_producer"
        enableNixl: true
        nixlRole: "sender"
        nixlPeerHost: "vllm-opt125m-decode-engine-service"
        nixlPeerPort: "55555"
        nixlBufferSize: "1073741824" # 1GB
        nixlBufferDevice: "cuda"
        nixlEnableGc: true
        enablePD: true
        cpuOffloadingBufferSize: 0
      labels:
        model: "opt125m-prefill"
        cpuOffloadingBufferSize: "10"
        enableController: true
        instanceId: "default1"
        controllerPort: 9000
        workerPort: 8001
        distributedUrl: "localhost:30081"
      env:
        - name: LMCACHE_LOG_LEVEL
          value: "DEBUG"
      chatTemplate: "chat.jinja2"
      chatTemplateConfigMap: |-
        {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
        {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}
    # Decode node configuration
    - name: "opt125m-decode"
    - name: "opt125m-2"
      repository: "lmcache/vllm-openai"
      tag: "2025-05-27-v1"
      tag: "v0.3.5"
      modelURL: "facebook/opt-125m"
      replicaCount: 1
      requestCPU: 8
      requestCPU: 6
      requestMemory: "30Gi"
      # requestGPU: 1
      requestGPU: 1
      pvcStorage: "50Gi"
      vllmConfig:
        enablePrefixCaching: true
        maxModelLen: 1024
        v1: 1
        gpuMemoryUtilization: 0.6
      lmcacheConfig:
        cudaVisibleDevices: "1"
        enabled: true
        kvRole: "kv_consumer" # Set decode node as consumer
        enableNixl: true
        nixlRole: "receiver"
        nixlPeerHost: "0.0.0.0"
        nixlPeerPort: "55555"
        nixlBufferSize: "1073741824" # 1GB
        nixlBufferDevice: "cuda"
        nixlEnableGc: true
        enablePD: true
      labels:
        model: "opt125m-decode"
        cpuOffloadingBufferSize: "10"
        enableController: true
        instanceId: "default2"
        controllerPort: 9000
        workerPort: 8002
        distributedUrl: "localhost:30082"
      env:
        - name: LMCACHE_LOG_LEVEL
          value: "DEBUG"
      chatTemplate: "chat.jinja2"
      chatTemplateConfigMap: |-
        {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
@@ -79,8 +72,16 @@ routerSpec:
  imagePullPolicy: "IfNotPresent"
  strategy:
    type: Recreate
  resources:
    requests:
      cpu: "1"
      memory: "2G"
    limits:
      cpu: "1"
      memory: "2G"
  enableRouter: true
  routingLogic: "prefixaware"
  extraArgs:
    - "--log-level"
    - "info"
  lmcacheControllerPort: 9000
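For the prefix-aware values, a similar hedged sketch — again assuming a port-forwarded router at `localhost:30080`, which is not part of this diff — sends two prompts that share a long common prefix; the `prefixaware` logic should route both to the same engine so its prefix cache is reused:

```python
# Hypothetical check for the prefix-aware routing values (not part of this PR).
# Assumes the same port-forwarded router endpoint as in the session-routing sketch.
import requests

ROUTER_URL = "http://localhost:30080/v1/completions"
SHARED_PREFIX = "You are a helpful assistant. " * 30  # long common prompt prefix

for question in ("What is a KV cache?", "What is prefix caching?"):
    resp = requests.post(
        ROUTER_URL,
        json={
            "model": "facebook/opt-125m",
            "prompt": SHARED_PREFIX + question,
            "max_tokens": 16,
        },
        timeout=30,
    )
    resp.raise_for_status()
    # With routingLogic "prefixaware", both requests should be sent to the
    # engine that already holds the shared prefix in its prefix cache.
    print(resp.json()["choices"][0]["text"])
```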
71 changes: 36 additions & 35 deletions .github/values-08-roundrobin-routing.yaml
@@ -4,67 +4,60 @@ servingEngineSpec:
  runtimeClassName: ""
  modelSpec:
    # Prefill node configuration
    - name: "opt125m-prefill"
    - name: "opt125m-1"
      repository: "lmcache/vllm-openai"
      tag: "2025-05-27-v1"
      tag: "v0.3.5"
      modelURL: "facebook/opt-125m"
      replicaCount: 1
      requestCPU: 8
      requestCPU: 6
      requestMemory: "30Gi"
      # requestGPU: 1
      requestGPU: 1
      pvcStorage: "50Gi"
      vllmConfig:
        enablePrefixCaching: true
        maxModelLen: 1024
        v1: 1
        gpuMemoryUtilization: 0.6
        gpuMemoryUtilization: 0.8
      lmcacheConfig:
        cudaVisibleDevices: "0"
        enabled: true
        kvRole: "kv_producer"
        enableNixl: true
        nixlRole: "sender"
        nixlPeerHost: "vllm-opt125m-decode-engine-service"
        nixlPeerPort: "55555"
        nixlBufferSize: "1073741824" # 1GB
        nixlBufferDevice: "cuda"
        nixlEnableGc: true
        enablePD: true
        cpuOffloadingBufferSize: 0
      labels:
        model: "opt125m-prefill"
        cpuOffloadingBufferSize: "10"
        enableController: true
        instanceId: "default1"
        controllerPort: 9000
        workerPort: 8001
        distributedUrl: "localhost:30081"
      env:
        - name: LMCACHE_LOG_LEVEL
          value: "DEBUG"
      chatTemplate: "chat.jinja2"
      chatTemplateConfigMap: |-
        {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
        {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}
    # Decode node configuration
    - name: "opt125m-decode"
    - name: "opt125m-2"
      repository: "lmcache/vllm-openai"
      tag: "2025-05-27-v1"
      tag: "v0.3.5"
      modelURL: "facebook/opt-125m"
      replicaCount: 1
      requestCPU: 8
      requestCPU: 6
      requestMemory: "30Gi"
      # requestGPU: 1
      requestGPU: 1
      pvcStorage: "50Gi"
      vllmConfig:
        enablePrefixCaching: true
        maxModelLen: 1024
        v1: 1
        gpuMemoryUtilization: 0.6
      lmcacheConfig:
        cudaVisibleDevices: "1"
        enabled: true
        kvRole: "kv_consumer" # Set decode node as consumer
        enableNixl: true
        nixlRole: "receiver"
        nixlPeerHost: "0.0.0.0"
        nixlPeerPort: "55555"
        nixlBufferSize: "1073741824" # 1GB
        nixlBufferDevice: "cuda"
        nixlEnableGc: true
        enablePD: true
      labels:
        model: "opt125m-decode"
        cpuOffloadingBufferSize: "10"
        enableController: true
        instanceId: "default2"
        controllerPort: 9000
        workerPort: 8002
        distributedUrl: "localhost:30082"
      env:
        - name: LMCACHE_LOG_LEVEL
          value: "DEBUG"
      chatTemplate: "chat.jinja2"
      chatTemplateConfigMap: |-
        {% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
@@ -84,3 +77,11 @@ routerSpec:
  extraArgs:
    - "--log-level"
    - "info"
  resources:
    requests:
      cpu: "1"
      memory: "2G"
    limits:
      cpu: "1"
      memory: "2G"
  lmcacheControllerPort: 9000