From 326d373a59e7f20fcbf220d038feaf47465f8f99 Mon Sep 17 00:00:00 2001 From: Angazenn Date: Wed, 24 Dec 2025 15:56:03 +0800 Subject: [PATCH 1/8] Qwen3-235B perf doc Signed-off-by: Angazenn --- docs/source/tutorials/Qwen3-235B-A22B.md | 266 +++++++++++++++++++++++ 1 file changed, 266 insertions(+) diff --git a/docs/source/tutorials/Qwen3-235B-A22B.md b/docs/source/tutorials/Qwen3-235B-A22B.md index 38b23111cc4..ba985b41c1c 100644 --- a/docs/source/tutorials/Qwen3-235B-A22B.md +++ b/docs/source/tutorials/Qwen3-235B-A22B.md @@ -311,3 +311,269 @@ vllm bench serve --model vllm-ascend/Qwen3-235B-A22B-w8a8 --dataset-name random ``` After about several minutes, you can get the performance evaluation result. + + +## Best Performance Tutorial + +In this section, we provide simple scripts to re-produce our latest performance. + +### Single Node A3 + +On a single Atlas 800 A3(64G*16)server, the recommended parallel setup is `--data-parallel-size 4` && `--tensor-parallel-size 4` && `--enable-expert-parallel `. 
Example server scripts: +```shell +#!/bin/sh +# Load model from ModelScope to speed up download +export VLLM_USE_MODELSCOPE=true +# To reduce memory fragmentation and avoid out of memory +export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True +export HCCL_BUFFSIZE=512 +export HCCL_OP_EXPANSION_MODE="AIV" +export OMP_PROC_BIND=false +export OMP_NUM_THREADS=1 +export VLLM_ASCEND_ENABLE_FLASHCOMM1=1 +export VLLM_ASCEND_ENABLE_FUSED_MC2=1 +export TASK_QUEUE_ENABLE=1 + +vllm serve vllm-ascend/Qwen3-235B-A22B-w8a8 \ +--host 0.0.0.0 \ +--port 8000 \ +--tensor-parallel-size 4 \ +--data-parallel-size 4 \ +--seed 1024 \ +--quantization ascend \ +--served-model-name qwen3 \ +--max-num-seqs 128 \ +--max-model-len 40960 \ +--max-num-batched-tokens 16384 \ +--enable-expert-parallel \ +--trust-remote-code \ +--gpu-memory-utilization 0.9 \ +--compilation-config '{"cudagraph_mode":"FULL_DECODE_ONLY"}' \ +--async-scheduling +``` + +Benchmark scripts: +```shell +vllm bench serve --model qwen \ +--tokenizer vllm-ascend/Qwen3-235B-A22B-w8a8 \ +--ignore-eos \ +--dataset-name random \ +--random-input-len 3584 \ +--random-output-len 1536 \ +--num-prompts 800 \ +--max-concurrency 160 \ +--request-rate inf \ +--host 0.0.0.0 \ +--port 8000 \ +``` + +### Three Node A3 -- PD disaggregation + +On three Atlas 800 A3(64G*16)server, we recommend to use one node as one prefill instance and two nodes as one decode instance. 
Example server scripts: +Prefill Node 1 +```shell +export HCCL_IF_IP=prefill_node_1_ip + +ifname="" + +export GLOO_SOCKET_IFNAME=${ifname} +export TP_SOCKET_IFNAME=${ifname} +export HCCL_SOCKET_IFNAME=${ifname} + +#!/bin/sh +# Load model from ModelScope to speed up download +export VLLM_USE_MODELSCOPE=true +# To reduce memory fragmentation and avoid out of memory +export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True +export HCCL_BUFFSIZE=512 +export HCCL_OP_EXPANSION_MODE="AIV" +export OMP_PROC_BIND=false +export OMP_NUM_THREADS=1 +export VLLM_ASCEND_ENABLE_FLASHCOMM1=1 +export VLLM_ASCEND_ENABLE_FUSED_MC2=2 +export TASK_QUEUE_ENABLE=1 + +vllm serve vllm-ascend/Qwen3-235B-A22B-w8a8 \ +--host 0.0.0.0 \ +--port 8000 \ +--tensor-parallel-size 4 \ +--data-parallel-size 8 \ +--data-parallel-size-local 4 \ +--data-parallel-start-rank 0 \ +--data-parallel-address prefill_node_1_ip \ +--data-parallel-rpc-port prefill_node_dp_port \ +--seed 1024 \ +--quantization ascend \ +--served-model-name qwen3 \ +--max-num-seqs 32 \ +--max-model-len 40960 \ +--max-num-batched-tokens 16384 \ +--enable-expert-parallel \ +--enforce-eager \ +--trust-remote-code \ +--gpu-memory-utilization 0.9 \ +--compilation-config '{"cudagraph_mode":"FULL_DECODE_ONLY"}' \ +--async-scheduling \ +--kv-transfer-config \ +'{"kv_connector": "MooncakeConnector", +"kv_role": "kv_producer", +"kv_port": "30000", +"engine_id": "0", +"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector", +"kv_connector_extra_config": { + "use_ascend_direct": true, + "prefill": { + "dp_size": 2, + "tp_size": 8 + }, + "decode": { + "dp_size": 8, + "tp_size": 4 + } +} +}' +``` +Decode Node 1 +```shell +#!/bin/sh +export HCCL_IF_IP=decode_node_1_ip + +ifname="" + +export GLOO_SOCKET_IFNAME=${ifname} +export TP_SOCKET_IFNAME=${ifname} +export HCCL_SOCKET_IFNAME=${ifname} + +# Load model from ModelScope to speed up download +export VLLM_USE_MODELSCOPE=true +# To reduce memory fragmentation and avoid out of memory 
+export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True +export HCCL_BUFFSIZE=512 +export HCCL_OP_EXPANSION_MODE="AIV" +export OMP_PROC_BIND=false +export OMP_NUM_THREADS=1 +export VLLM_ASCEND_ENABLE_FLASHCOMM1=1 +export VLLM_ASCEND_ENABLE_FUSED_MC2=2 +export TASK_QUEUE_ENABLE=1 + +vllm serve vllm-ascend/Qwen3-235B-A22B-w8a8 \ +--host 0.0.0.0 \ +--port 8000 \ +--tensor-parallel-size 4 \ +--data-parallel-size 8 \ +--data-parallel-size-local 4 \ +--data-parallel-start-rank 0 \ +--data-parallel-address decode_node_1_ip \ +--data-parallel-rpc-port decode_node_dp_port \ +--seed 1024 \ +--quantization ascend \ +--served-model-name qwen3 \ +--max-num-seqs 128 \ +--max-model-len 40960 \ +--max-num-batched-tokens 256 \ +--enable-expert-parallel \ +--trust-remote-code \ +--gpu-memory-utilization 0.9 \ +--compilation-config '{"cudagraph_mode":"FULL_DECODE_ONLY"}' \ +--async-scheduling \ +--kv-transfer-config \ +'{"kv_connector": "MooncakeConnector", +"kv_role": "kv_consumer", +"kv_port": "30100", +"engine_id": "1", +"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector", +"kv_connector_extra_config": { + "use_ascend_direct": true, + "prefill": { + "dp_size": 2, + "tp_size": 8 + }, + "decode": { + "dp_size": 8, + "tp_size": 4 + } +} +}' +``` +Decode Node 2 +```shell +#!/bin/sh +export HCCL_IF_IP=decode_node_2_ip + +ifname="" + +export GLOO_SOCKET_IFNAME=${ifname} +export TP_SOCKET_IFNAME=${ifname} +export HCCL_SOCKET_IFNAME=${ifname} + +# Load model from ModelScope to speed up download +export VLLM_USE_MODELSCOPE=true +# To reduce memory fragmentation and avoid out of memory +export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True +export HCCL_BUFFSIZE=512 +export HCCL_OP_EXPANSION_MODE="AIV" +export OMP_PROC_BIND=false +export OMP_NUM_THREADS=1 +export VLLM_ASCEND_ENABLE_FLASHCOMM1=1 +export VLLM_ASCEND_ENABLE_FUSED_MC2=2 +export TASK_QUEUE_ENABLE=1 + +vllm serve vllm-ascend/Qwen3-235B-A22B-w8a8 \ +--host 0.0.0.0 \ +--port 8000 \ +--headless \ 
+--tensor-parallel-size 4 \ +--data-parallel-size 8 \ +--data-parallel-size-local 4 \ +--data-parallel-start-rank 4 \ +--data-parallel-address decode_node_1_ip \ +--data-parallel-rpc-port decode_node_dp_port \ +--seed 1024 \ +--quantization ascend \ +--served-model-name qwen3 \ +--max-num-seqs 128 \ +--max-model-len 40960 \ +--max-num-batched-tokens 256 \ +--enable-expert-parallel \ +--trust-remote-code \ +--gpu-memory-utilization 0.9 \ +--compilation-config '{"cudagraph_mode":"FULL_DECODE_ONLY"}' \ +--async-scheduling \ +--kv-transfer-config \ +'{"kv_connector": "MooncakeConnector", +"kv_role": "kv_consumer", +"kv_port": "30100", +"engine_id": "1", +"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector", +"kv_connector_extra_config": { + "use_ascend_direct": true, + "prefill": { + "dp_size": 2, + "tp_size": 8 + }, + "decode": { + "dp_size": 8, + "tp_size": 4 + } +} +}' +``` +PD proxy: +``` +python load_balance_proxy_server_example.py --port 12347 --prefiller-hosts prefill_node_1_ip --prefiller-port 8000 --decoder-hosts decode_node_1_ip --decoder-ports 8000 +``` + +Benchmark scripts: +```shell +vllm bench serve --model qwen \ +--tokenizer vllm-ascend/Qwen3-235B-A22B-w8a8 \ +--ignore-eos \ +--dataset-name random \ +--random-input-len 3584 \ +--random-output-len 1536 \ +--num-prompts 800 \ +--max-concurrency 160 \ +--request-rate inf \ +--host 0.0.0.0 \ +--port 12347 \ +``` From a28e91d62f7e0ea08ec996247e903c908d0eec77 Mon Sep 17 00:00:00 2001 From: Angazenn Date: Thu, 25 Dec 2025 16:23:53 +0800 Subject: [PATCH 2/8] modify Signed-off-by: Angazenn --- docs/source/tutorials/Qwen3-235B-A22B.md | 39 ++++++++++++++++++------ 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/docs/source/tutorials/Qwen3-235B-A22B.md b/docs/source/tutorials/Qwen3-235B-A22B.md index ba985b41c1c..6f11d668b4f 100644 --- a/docs/source/tutorials/Qwen3-235B-A22B.md +++ b/docs/source/tutorials/Qwen3-235B-A22B.md @@ -119,14 +119,16 @@ vllm serve 
vllm-ascend/Qwen3-235B-A22B-w8a8 \ --enable-expert-parallel \ --trust-remote-code \ --gpu-memory-utilization 0.95 \ ---rope_scaling '{"rope_type":"yarn","factor":4,"original_max_position_embeddings":32768}' \ +--hf-overrides '{"rope_parameters": {"rope_type":"yarn","rope_theta":1000000,"factor":4,"original_max_position_embeddings":32768}}' \ --compilation-config '{"cudagraph_mode":"FULL_DECODE_ONLY"}' \ --async-scheduling ``` **Notice:** -- for vllm version below `v0.12.0` use parameter: `--rope_scaling '{"rope_type":"yarn","factor":4,"original_max_position_embeddings":32768}' \` -- for vllm version `v0.12.0` use parameter: `--hf-overrides '{"rope_parameters": {"rope_type":"yarn","rope_theta":1000000,"factor":4,"original_max_position_embeddings":32768}}' \` +- [Qwen3-235B-A22B](https://huggingface.co/Qwen/Qwen3-235B-A22B#processing-long-texts) natively supports only a 40960-token context (max_position_embeddings). If you want to use it and its related quantization weights to run long sequences (such as 128k context), you must use the YaRN rope-scaling technique. + - For vLLM versions equal to or newer than `v0.12.0`, use parameter: `--hf-overrides '{"rope_parameters": {"rope_type":"yarn","rope_theta":1000000,"factor":4,"original_max_position_embeddings":32768}}' \`. + - For vLLM versions below `v0.12.0`, use parameter: `--rope_scaling '{"rope_type":"yarn","factor":4,"original_max_position_embeddings":32768}' \`. + If you are using weights like [Qwen3-235B-A22B-Instruct-2507](https://huggingface.co/Qwen/Qwen3-235B-A22B-Instruct-2507), which natively support long contexts, there is no need to add this parameter. The parameters are explained as follows: - `--data-parallel-size` 1 and `--tensor-parallel-size` 8 are common settings for data parallelism (DP) and tensor parallelism (TP) sizes. @@ -313,13 +315,21 @@ vllm bench serve --model vllm-ascend/Qwen3-235B-A22B-w8a8 --dataset-name random After about several minutes, you can get the performance evaluation result. 
-## Best Performance Tutorial +## Reproducing Performance Results -In this section, we provide simple scripts to re-produce our latest performance. +In this section, we provide simple scripts to re-produce our latest performance. It is also recommended to read instructions above to understand basic concepts or options in vLLM && vLLM-Ascend. -### Single Node A3 +### Environment -On a single Atlas 800 A3(64G*16)server, the recommended parallel setup is `--data-parallel-size 4` && `--tensor-parallel-size 4` && `--enable-expert-parallel `. Example server scripts: +- vLLM v0.13.0 +- vLLM-Ascend v0.13.0rc1 +- CANN 8.3.0 RC2 +- torch_npu 2.8.0 +- HDK/driver 25.3.RC1 + +### Single Node A3 (64G*16) + +Example server scripts: ```shell #!/bin/sh # Load model from ModelScope to speed up download @@ -348,6 +358,7 @@ vllm serve vllm-ascend/Qwen3-235B-A22B-w8a8 \ --enable-expert-parallel \ --trust-remote-code \ --gpu-memory-utilization 0.9 \ +--no-enable-prefix-caching \ --compilation-config '{"cudagraph_mode":"FULL_DECODE_ONLY"}' \ --async-scheduling ``` @@ -362,16 +373,27 @@ vllm bench serve --model qwen \ --random-output-len 1536 \ --num-prompts 800 \ --max-concurrency 160 \ ---request-rate inf \ +--request-rate 24 \ --host 0.0.0.0 \ --port 8000 \ ``` +Reference test results: + +| num_requests | concurrency | mean TTFT(ms) | mean TPOT(ms) | output token throughput (tok/s) | +|----- | ----- | ----- | ----- | -----| +| 720 | 144 | 4717.45 | 48.69 | 2761.72 | + +Note: +1. Setting `export VLLM_ASCEND_ENABLE_FUSED_MC2=1` enables MoE fused operators that reduce time consumption of MoE in both prefill and decode. This is an experimental feature which only supports W8A8 quantization now. +2. Here we disable prefix cache because of random datasets. You can enable prefix cache if requests have long common prefix. + ### Three Node A3 -- PD disaggregation On three Atlas 800 A3(64G*16)server, we recommend to use one node as one prefill instance and two nodes as one decode instance. 
Example server scripts: Prefill Node 1 ```shell +#!/bin/sh export HCCL_IF_IP=prefill_node_1_ip ifname="" @@ -380,7 +402,6 @@ export GLOO_SOCKET_IFNAME=${ifname} export TP_SOCKET_IFNAME=${ifname} export HCCL_SOCKET_IFNAME=${ifname} -#!/bin/sh # Load model from ModelScope to speed up download export VLLM_USE_MODELSCOPE=true # To reduce memory fragmentation and avoid out of memory From f59ed828ca8f2a190881a9b552aad3670df83587 Mon Sep 17 00:00:00 2001 From: Angazenn Date: Fri, 26 Dec 2025 16:11:07 +0800 Subject: [PATCH 3/8] update Signed-off-by: Angazenn --- docs/source/tutorials/Qwen3-235B-A22B.md | 41 +++++++++++++++++++----- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/docs/source/tutorials/Qwen3-235B-A22B.md b/docs/source/tutorials/Qwen3-235B-A22B.md index 6f11d668b4f..70d5f0ac469 100644 --- a/docs/source/tutorials/Qwen3-235B-A22B.md +++ b/docs/source/tutorials/Qwen3-235B-A22B.md @@ -314,22 +314,31 @@ vllm bench serve --model vllm-ascend/Qwen3-235B-A22B-w8a8 --dataset-name random After about several minutes, you can get the performance evaluation result. - ## Reproducing Performance Results -In this section, we provide simple scripts to re-produce our latest performance. It is also recommended to read instructions above to understand basic concepts or options in vLLM && vLLM-Ascend. +In this section, we provide simple scripts to re-produce our latest performance. It is also recommended to read instructions above to understand basic concepts or options in vLLM && vLLM-Ascend. ### Environment - vLLM v0.13.0 - vLLM-Ascend v0.13.0rc1 -- CANN 8.3.0 RC2 +- CANN 8.3.RC2 - torch_npu 2.8.0 - HDK/driver 25.3.RC1 +- triton_ascend 3.2.0.dev2025110717 + +**Notice:** +triton_ascend is required for reproducing best performance of Qwen3-235B in vLLM-Ascend. 
If it is not installed in your environment, please follow the instructions bellow: + +```bash +wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/triton_ascend-3.2.0.dev2025110717-cp311-cp311-manylinux_2_27_aarch64.whl +pip install triton_ascend-3.2.0.dev2025110717-cp311-cp311-manylinux_2_27_aarch64.whl +``` ### Single Node A3 (64G*16) Example server scripts: + ```shell #!/bin/sh # Load model from ModelScope to speed up download @@ -364,6 +373,7 @@ vllm serve vllm-ascend/Qwen3-235B-A22B-w8a8 \ ``` Benchmark scripts: + ```shell vllm bench serve --model qwen \ --tokenizer vllm-ascend/Qwen3-235B-A22B-w8a8 \ @@ -385,17 +395,19 @@ Reference test results: | 720 | 144 | 4717.45 | 48.69 | 2761.72 | Note: -1. Setting `export VLLM_ASCEND_ENABLE_FUSED_MC2=1` enables MoE fused operators that reduce time consumption of MoE in both prefill and decode. This is an experimental feature which only supports W8A8 quantization now. +1. Setting `export VLLM_ASCEND_ENABLE_FUSED_MC2=1` enables MoE fused operators that reduce time consumption of MoE in both prefill and decode. This is an experimental feature which only supports W8A8 quantization on Atlas A3 servers now. If you encounter any problems when using this feature, you can disable it by setting `export VLLM_ASCEND_ENABLE_FUSED_MC2=0` and update issues in vLLM-Ascend community. 2. Here we disable prefix cache because of random datasets. You can enable prefix cache if requests have long common prefix. ### Three Node A3 -- PD disaggregation On three Atlas 800 A3(64G*16)server, we recommend to use one node as one prefill instance and two nodes as one decode instance. 
Example server scripts: Prefill Node 1 + ```shell #!/bin/sh export HCCL_IF_IP=prefill_node_1_ip +# Set ifname according to your network setting ifname="" export GLOO_SOCKET_IFNAME=${ifname} @@ -417,16 +429,16 @@ export TASK_QUEUE_ENABLE=1 vllm serve vllm-ascend/Qwen3-235B-A22B-w8a8 \ --host 0.0.0.0 \ --port 8000 \ ---tensor-parallel-size 4 \ ---data-parallel-size 8 \ ---data-parallel-size-local 4 \ +--tensor-parallel-size 8 \ +--data-parallel-size 2 \ +--data-parallel-size-local 8 \ --data-parallel-start-rank 0 \ --data-parallel-address prefill_node_1_ip \ --data-parallel-rpc-port prefill_node_dp_port \ --seed 1024 \ --quantization ascend \ --served-model-name qwen3 \ ---max-num-seqs 32 \ +--max-num-seqs 24 \ --max-model-len 40960 \ --max-num-batched-tokens 16384 \ --enable-expert-parallel \ @@ -454,7 +466,9 @@ vllm serve vllm-ascend/Qwen3-235B-A22B-w8a8 \ } }' ``` + Decode Node 1 + ```shell #!/bin/sh export HCCL_IF_IP=decode_node_1_ip @@ -516,7 +530,9 @@ vllm serve vllm-ascend/Qwen3-235B-A22B-w8a8 \ } }' ``` + Decode Node 2 + ```shell #!/bin/sh export HCCL_IF_IP=decode_node_2_ip @@ -579,12 +595,15 @@ vllm serve vllm-ascend/Qwen3-235B-A22B-w8a8 \ } }' ``` + PD proxy: + ``` python load_balance_proxy_server_example.py --port 12347 --prefiller-hosts prefill_node_1_ip --prefiller-port 8000 --decoder-hosts decode_node_1_ip --decoder-ports 8000 ``` Benchmark scripts: + ```shell vllm bench serve --model qwen \ --tokenizer vllm-ascend/Qwen3-235B-A22B-w8a8 \ @@ -598,3 +617,9 @@ vllm bench serve --model qwen \ --host 0.0.0.0 \ --port 12347 \ ``` + +Reference test results: + +| num_requests | concurrency | mean TTFT(ms) | mean TPOT(ms) | output token throughput (tok/s) | +|----- | ----- | ----- | ----- | -----| +| 2880 | 576 | 3735.98 | 52.07 | 8593.44 | From 098978563465413fb54999ad5cd461efdadb5ee3 Mon Sep 17 00:00:00 2001 From: Angazenn Date: Fri, 26 Dec 2025 16:24:23 +0800 Subject: [PATCH 4/8] update Signed-off-by: Angazenn --- docs/source/tutorials/Qwen3-235B-A22B.md | 5 
++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/source/tutorials/Qwen3-235B-A22B.md b/docs/source/tutorials/Qwen3-235B-A22B.md index 70d5f0ac469..796481e33eb 100644 --- a/docs/source/tutorials/Qwen3-235B-A22B.md +++ b/docs/source/tutorials/Qwen3-235B-A22B.md @@ -328,7 +328,7 @@ In this section, we provide simple scripts to re-produce our latest performance. - triton_ascend 3.2.0.dev2025110717 **Notice:** -triton_ascend is required for reproducing best performance of Qwen3-235B in vLLM-Ascend. If it is not installed in your environment, please follow the instructions bellow: +triton_ascend is required for reproducing best performance of Qwen3-235B in vLLM-Ascend. If it is not installed in your environment, please follow the instructions below: ```bash wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/triton_ascend-3.2.0.dev2025110717-cp311-cp311-manylinux_2_27_aarch64.whl @@ -623,3 +623,6 @@ Reference test results: | num_requests | concurrency | mean TTFT(ms) | mean TPOT(ms) | output token throughput (tok/s) | |----- | ----- | ----- | ----- | -----| | 2880 | 576 | 3735.98 | 52.07 | 8593.44 | + +Note: +1. We recommend setting `export VLLM_ASCEND_ENABLE_FUSED_MC2=2` in this scenario (typically EP32 for Qwen3-235B). This enables a different MoE fusion operator. 
From aa962a2e19a8814d0181d2eab091e90148467201 Mon Sep 17 00:00:00 2001 From: Angazenn Date: Fri, 26 Dec 2025 16:37:53 +0800 Subject: [PATCH 5/8] update Signed-off-by: Angazenn --- docs/source/tutorials/Qwen3-235B-A22B.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/tutorials/Qwen3-235B-A22B.md b/docs/source/tutorials/Qwen3-235B-A22B.md index 796481e33eb..2e2c36f75e9 100644 --- a/docs/source/tutorials/Qwen3-235B-A22B.md +++ b/docs/source/tutorials/Qwen3-235B-A22B.md @@ -611,8 +611,8 @@ vllm bench serve --model qwen \ --dataset-name random \ --random-input-len 3584 \ --random-output-len 1536 \ ---num-prompts 800 \ ---max-concurrency 160 \ +--num-prompts 2880 \ +--max-concurrency 576 \ --request-rate inf \ --host 0.0.0.0 \ --port 12347 \ From a910cf5f5a3843800be1d41911c3366b7841af9d Mon Sep 17 00:00:00 2001 From: Angazenn Date: Fri, 26 Dec 2025 16:51:13 +0800 Subject: [PATCH 6/8] update Signed-off-by: Angazenn --- docs/source/tutorials/Qwen3-235B-A22B.md | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/docs/source/tutorials/Qwen3-235B-A22B.md b/docs/source/tutorials/Qwen3-235B-A22B.md index 2e2c36f75e9..70cb63479bf 100644 --- a/docs/source/tutorials/Qwen3-235B-A22B.md +++ b/docs/source/tutorials/Qwen3-235B-A22B.md @@ -375,7 +375,7 @@ vllm serve vllm-ascend/Qwen3-235B-A22B-w8a8 \ Benchmark scripts: ```shell -vllm bench serve --model qwen \ +vllm bench serve --model qwen3 \ --tokenizer vllm-ascend/Qwen3-235B-A22B-w8a8 \ --ignore-eos \ --dataset-name random \ @@ -425,13 +425,14 @@ export OMP_NUM_THREADS=1 export VLLM_ASCEND_ENABLE_FLASHCOMM1=1 export VLLM_ASCEND_ENABLE_FUSED_MC2=2 export TASK_QUEUE_ENABLE=1 +export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/mooncake:$LD_LIBRARY_PATH vllm serve vllm-ascend/Qwen3-235B-A22B-w8a8 \ --host 0.0.0.0 \ --port 8000 \ --tensor-parallel-size 8 \ --data-parallel-size 2 \ ---data-parallel-size-local 8 \ +--data-parallel-size-local 
2 \ --data-parallel-start-rank 0 \ --data-parallel-address prefill_node_1_ip \ --data-parallel-rpc-port prefill_node_dp_port \ @@ -445,10 +446,10 @@ vllm serve vllm-ascend/Qwen3-235B-A22B-w8a8 \ --enforce-eager \ --trust-remote-code \ --gpu-memory-utilization 0.9 \ ---compilation-config '{"cudagraph_mode":"FULL_DECODE_ONLY"}' \ ---async-scheduling \ +--enforce-eager \ +--no-enable-prefix-caching \ --kv-transfer-config \ -'{"kv_connector": "MooncakeConnector", +'{"kv_connector": "MooncakeConnectorV1", "kv_role": "kv_producer", "kv_port": "30000", "engine_id": "0", @@ -490,6 +491,7 @@ export OMP_NUM_THREADS=1 export VLLM_ASCEND_ENABLE_FLASHCOMM1=1 export VLLM_ASCEND_ENABLE_FUSED_MC2=2 export TASK_QUEUE_ENABLE=1 +export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/mooncake:$LD_LIBRARY_PATH vllm serve vllm-ascend/Qwen3-235B-A22B-w8a8 \ --host 0.0.0.0 \ @@ -554,6 +556,7 @@ export OMP_NUM_THREADS=1 export VLLM_ASCEND_ENABLE_FLASHCOMM1=1 export VLLM_ASCEND_ENABLE_FUSED_MC2=2 export TASK_QUEUE_ENABLE=1 +export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages/mooncake:$LD_LIBRARY_PATH vllm serve vllm-ascend/Qwen3-235B-A22B-w8a8 \ --host 0.0.0.0 \ @@ -605,7 +608,7 @@ python load_balance_proxy_server_example.py --port 12347 --prefiller-hosts prefi Benchmark scripts: ```shell -vllm bench serve --model qwen \ +vllm bench serve --model qwen3 \ --tokenizer vllm-ascend/Qwen3-235B-A22B-w8a8 \ --ignore-eos \ --dataset-name random \ From 2f9eaf682bb593e5bfd615bd55533b1eb77b75d5 Mon Sep 17 00:00:00 2001 From: Angazenn Date: Sat, 27 Dec 2025 09:28:07 +0800 Subject: [PATCH 7/8] fix Signed-off-by: Angazenn --- docs/source/tutorials/Qwen3-235B-A22B.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/tutorials/Qwen3-235B-A22B.md b/docs/source/tutorials/Qwen3-235B-A22B.md index 70cb63479bf..a3901eada99 100644 --- a/docs/source/tutorials/Qwen3-235B-A22B.md +++ 
b/docs/source/tutorials/Qwen3-235B-A22B.md @@ -514,7 +514,7 @@ vllm serve vllm-ascend/Qwen3-235B-A22B-w8a8 \ --compilation-config '{"cudagraph_mode":"FULL_DECODE_ONLY"}' \ --async-scheduling \ --kv-transfer-config \ -'{"kv_connector": "MooncakeConnector", +'{"kv_connector": "MooncakeConnectorV1", "kv_role": "kv_consumer", "kv_port": "30100", "engine_id": "1", @@ -580,7 +580,7 @@ vllm serve vllm-ascend/Qwen3-235B-A22B-w8a8 \ --compilation-config '{"cudagraph_mode":"FULL_DECODE_ONLY"}' \ --async-scheduling \ --kv-transfer-config \ -'{"kv_connector": "MooncakeConnector", +'{"kv_connector": "MooncakeConnectorV1", "kv_role": "kv_consumer", "kv_port": "30100", "engine_id": "1", From 0645e0d370c8839b416360ec02b283a9b879e2c1 Mon Sep 17 00:00:00 2001 From: Angazenn Date: Sat, 27 Dec 2025 14:25:11 +0800 Subject: [PATCH 8/8] fix Signed-off-by: Angazenn --- docs/source/tutorials/Qwen3-235B-A22B.md | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/docs/source/tutorials/Qwen3-235B-A22B.md b/docs/source/tutorials/Qwen3-235B-A22B.md index a3901eada99..9b534c16704 100644 --- a/docs/source/tutorials/Qwen3-235B-A22B.md +++ b/docs/source/tutorials/Qwen3-235B-A22B.md @@ -99,7 +99,6 @@ export VLLM_USE_MODELSCOPE=true # To reduce memory fragmentation and avoid out of memory export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True export HCCL_BUFFSIZE=512 -export HCCL_OP_EXPANSION_MODE="AIV" export OMP_PROC_BIND=false export OMP_NUM_THREADS=1 export VLLM_ASCEND_ENABLE_FLASHCOMM1=1 @@ -171,7 +170,6 @@ export OMP_PROC_BIND=false export OMP_NUM_THREADS=1 export HCCL_BUFFSIZE=1024 export TASK_QUEUE_ENABLE=1 -export HCCL_OP_EXPANSION_MODE="AIV" vllm serve vllm-ascend/Qwen3-235B-A22B \ --host 0.0.0.0 \ @@ -217,7 +215,6 @@ export OMP_PROC_BIND=false export OMP_NUM_THREADS=1 export HCCL_BUFFSIZE=1024 export TASK_QUEUE_ENABLE=1 -export HCCL_OP_EXPANSION_MODE="AIV" vllm serve vllm-ascend/Qwen3-235B-A22B \ --host 0.0.0.0 \ @@ -484,7 +481,7 @@ export 
HCCL_SOCKET_IFNAME=${ifname} export VLLM_USE_MODELSCOPE=true # To reduce memory fragmentation and avoid out of memory export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True -export HCCL_BUFFSIZE=512 +export HCCL_BUFFSIZE=1024 export HCCL_OP_EXPANSION_MODE="AIV" export OMP_PROC_BIND=false export OMP_NUM_THREADS=1 @@ -513,6 +510,7 @@ vllm serve vllm-ascend/Qwen3-235B-A22B-w8a8 \ --gpu-memory-utilization 0.9 \ --compilation-config '{"cudagraph_mode":"FULL_DECODE_ONLY"}' \ --async-scheduling \ +--no-enable-prefix-caching \ --kv-transfer-config \ '{"kv_connector": "MooncakeConnectorV1", "kv_role": "kv_consumer", @@ -549,7 +547,7 @@ export HCCL_SOCKET_IFNAME=${ifname} export VLLM_USE_MODELSCOPE=true # To reduce memory fragmentation and avoid out of memory export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True -export HCCL_BUFFSIZE=512 +export HCCL_BUFFSIZE=1024 export HCCL_OP_EXPANSION_MODE="AIV" export OMP_PROC_BIND=false export OMP_NUM_THREADS=1 @@ -579,6 +577,7 @@ vllm serve vllm-ascend/Qwen3-235B-A22B-w8a8 \ --gpu-memory-utilization 0.9 \ --compilation-config '{"cudagraph_mode":"FULL_DECODE_ONLY"}' \ --async-scheduling \ +--no-enable-prefix-caching \ --kv-transfer-config \ '{"kv_connector": "MooncakeConnectorV1", "kv_role": "kv_consumer", @@ -616,7 +615,7 @@ vllm bench serve --model qwen3 \ --random-output-len 1536 \ --num-prompts 2880 \ --max-concurrency 576 \ ---request-rate inf \ +--request-rate 8 \ --host 0.0.0.0 \ --port 12347 \ ```