Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 2 additions & 11 deletions .github/workflows/vllm_ascend_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -136,18 +136,9 @@ jobs:
id: filter_spec_decode
uses: dorny/paths-filter@v3
with:
# speculative decode seems will cause oom issue, disable it now on ci test
filters: |
speculative_tests_changed:
- "tests/singlecard/spec_decode/**"
- "tests/multicard/spec_decode_e2e/**"
- "vllm_ascend/worker/worker.py"
- "vllm_ascend/worker/model_runner.py"
- "vllm_ascend/worker/multi_step_runner.py"
- "vllm_ascend/worker/multi_step_worker.py"
- "vllm_ascend/worker/draft_model_runner.py"
- "vllm_ascend/patch/worker/patch_common/patch_metrics.py"
- "vllm_ascend/patch/worker/patch_common/patch_spec_decode_worker.py"
- "vllm_ascend/patch/worker/patch_common/patch_multi_step_worker.py"
speculative_tests_changed: 'false'

- name: Run vllm-project/vllm-ascend Speculative Decode test
if: steps.filter_spec_decode.outputs.speculative_tests_changed == 'true' || github.event_name == 'schedule'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,22 @@
This file demonstrates the example usage of disaggregated prefilling
We will launch 2 vllm instances (NPU 0,1 for prefill and NPU 2,3 for decode),
and then transfer the KV cache between them.
prompt_device_ips denotes the device IPs of NPU 0,1
decode_device_ips denotes the device IPs of NPU 2,3
The device ips of all NPUs in current server can be found through
examples/disaggregated_prefill/find_device_ips.py
"""
import multiprocessing as mp
import os
import time
from multiprocessing import Event, Process

kv_connector_extra_config = {
Comment thread
whx-sjtu marked this conversation as resolved.
"prompt_device_ips": ["1.2.3.1", "1.2.3.2"],
"decode_device_ips": ["1.2.3.9", "1.2.3.10"],
"llmdatadist_comm_port": 26000,
}


def clean_up():
import gc
Expand All @@ -34,11 +44,10 @@ def run_prefill(prefill_done, process_close):
sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)

ktc = KVTransferConfig.from_cli(
'{"kv_connector":"AscendHcclConnector","kv_buffer_device":"npu","kv_role":"kv_producer", "kv_parallel_size":2}'
'{"kv_connector":"AscendSimpleConnector","kv_buffer_device":"npu","kv_role":"kv_producer", "kv_parallel_size":2}'
)

# Set GPU memory utilization to 0.8 for an A6000 GPU with 40GB
# memory. You may need to adjust the value to fit your GPU.
global kv_connector_extra_config
ktc.kv_connector_extra_config = kv_connector_extra_config
llm = LLM(model="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
Comment thread
whx-sjtu marked this conversation as resolved.
kv_transfer_config=ktc,
max_model_len=2000,
Expand Down Expand Up @@ -69,15 +78,16 @@ def run_decode(prefill_done):
from vllm.config import KVTransferConfig

prompts = [
"Hello, how are you today?", "Hi, what is your name?",
"Tell me a very long story.", "what is your favourite book?"
"Hello, how are you today?",
"Hi, what is your name?",
]
sampling_params = SamplingParams(temperature=0, top_p=0.95)

ktc = KVTransferConfig.from_cli(
'{"kv_connector":"AscendHcclConnector","kv_buffer_device":"npu","kv_role":"kv_consumer","kv_parallel_size":2}'
'{"kv_connector":"AscendSimpleConnector","kv_buffer_device":"npu","kv_role":"kv_consumer","kv_parallel_size":2}'
)

global kv_connector_extra_config
ktc.kv_connector_extra_config = kv_connector_extra_config
llm = LLM(model="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
kv_transfer_config=ktc,
max_model_len=2000,
Expand Down
Loading