diff --git a/docs/platforms/ascend/ascend_npu_best_practice.md b/docs/platforms/ascend/ascend_npu_best_practice.md index aba6d2012666..bcda94a2eb87 100644 --- a/docs/platforms/ascend/ascend_npu_best_practice.md +++ b/docs/platforms/ascend/ascend_npu_best_practice.md @@ -13,7 +13,7 @@ you encounter issues or have any questions, please [open an issue](https://githu | Deepseek-R1 | Atlas 800I A3 | 32 | PD Separation | 3.9K+1K | 20ms | W8A8 INT8 | [Optimal Configuration](#deepseek-r1-3_9k-1k-20ms-on-a3-32-cards-separation-mode) | | Deepseek-R1 | Atlas 800I A3 | 32 | PD Separation | 3.5K+1.5K | 20ms | W8A8 INT8 | [Optimal Configuration](#deepseek-r1-3_5k-1_5k-20ms-on-a3-32-cards-separation-mode) | | Deepseek-R1 | Atlas 800I A3 | 32 | PD Separation | 3.5K+1K | 20ms | W8A8 INT8 | [Optimal Configuration](#deepseek-r1-3_5k-1k-20ms-on-a3-32-cards-separation-mode) | -| DeepSeek-V3.2-Exp | Atlas 800I A3 | 32 | PD Separation | 64K+3K | 30ms | W8A8 INT8 | [Optimal Configuration](#deepseek-v32-exp-64k-3k-30ms-on-a3-32-cards-separation-mode) | +| DeepSeek-V3.2 | Atlas 800I A3 | 32 | PD Separation | 128K+1K | 20ms | W8A8 INT8 | [Optimal Configuration](#deepseek-v32-128k-1k-20ms-on-a3-32-cards-separation-mode) | ### High Throughput @@ -779,9 +779,9 @@ We tested it based on the `RANDOM` dataset. 
python -m sglang.bench_serving --dataset-name random --backend sglang --host 127.0.0.1 --port 6688 --max-concurrency 384 --random-input-len 3500 --random-output-len 1500 --num-prompts 1536 --random-range-ratio 1 ``` -### DeepSeek-V3.2-Exp 64K-3K 30ms on A3 32 Cards Separation Mode +### DeepSeek-V3.2 128K-1K 20ms on A3 32 Cards Separation Mode -Model: DeepSeek-V3.2-Exp-W8A8 +Model: DeepSeek-V3.2-W8A8 Hardware: Atlas 800I A3 32Card @@ -789,14 +789,12 @@ DeployMode: PD Separation Dataset: random -Input Output Length: 64K+3K +Input Output Length: 128K+1K -TPOT: 30ms +TPOT: 20ms #### Model Deployment -Deploy Prefill Instance - ```shell echo performance | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor sysctl -w vm.swappiness=0 @@ -815,167 +813,117 @@ source /usr/local/Ascend/nnal/atb/set_env.sh export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/vendors/customize/op_api/lib/:${LD_LIBRARY_PATH} export PATH=/usr/local/Ascend/8.5.0/compiler/bishengir/bin:$PATH -export ASCEND_HOME_PATH=/usr/local/Ascend/ascend-toolkit/latest - export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True export STREAMS_PER_DEVICE=32 +export ASCEND_MF_STORE_URL="tcp://your prefill ip1:24670" -export HCCL_BUFFSIZE=1024 -export DEEPEP_NORMAL_LONG_SEQ_ROUND=5 -export DEEPEP_NORMAL_LONG_SEQ_PER_ROUND_TOKENS=512 - +P_IP=('your prefill ip1' 'your prefill ip2') +D_IP=('your decode ip1' 'your decode ip2') MODEL_PATH=xxx -export SGLANG_NPU_USE_MLAPO=1 -export DEEP_NORMAL_MODE_USE_INT8_QUANT=1 -export SGLANG_NPU_USE_MULTI_STREAM=1 -export HCCL_OP_EXPANSION_MODE=AIV - -IPs=('your prefill ip1' 'your prefill ip2') +LOCAL_HOST1=`hostname -I|awk -F " " '{print$1}'` +LOCAL_HOST2=`hostname -I|awk -F " " '{print$2}'` +echo "${LOCAL_HOST1}" +echo "${LOCAL_HOST2}" -# get IP in current node -LOCAL_HOST=`hostname -I|awk -F " " '{print$1}'` -echo "LOCAL_HOST = " ${LOCAL_HOST} -# get node index -for i in "${!IPs[@]}"; +# prefill +for i in "${!P_IP[@]}"; do - echo "LOCAL_HOST=${LOCAL_HOST}, 
IPs[${i}]=${IPs[$i]}" - if [ "$LOCAL_HOST" == "${IPs[$i]}" ]; then - echo "Node Rank : ${i}" - VC_TASK_INDEX=$i - break - fi -done - -IFNAMES=('xxx' 'xxx') - -export HCCL_SOCKET_IFNAME=${IFNAMES[$VC_TASK_INDEX]} -export GLOO_SOCKET_IFNAME=${HCCL_SOCKET_IFNAME} -echo "HCCL_SOCKET_IFNAME : ${HCCL_SOCKET_IFNAME}" -nnodes=${#IPs[@]} -tp_size=`expr 16 \* ${nnodes}` -export ASCEND_MF_STORE_URL=tcp://${IPs[0]}:24667 - -python3 -m sglang.launch_server --model-path ${MODEL_PATH} \ ---tp $tp_size \ ---trust-remote-code \ ---attention-backend ascend \ ---device npu \ ---watchdog-timeout 9000 \ ---host ${IPs[$VC_TASK_INDEX]} --port 8000 \ ---mem-fraction-static 0.73 \ ---disable-radix-cache --chunked-prefill-size -1 --max-prefill-tokens 68000 \ ---max-running-requests 1 \ ---moe-a2a-backend deepep --deepep-mode normal \ ---quantization modelslim \ ---disaggregation-transfer-backend ascend \ ---disaggregation-mode prefill \ ---disable-cuda-graph \ ---nnodes $nnodes --node-rank $VC_TASK_INDEX \ ---disaggregation-bootstrap-port 8995 \ ---enable-nsa-prefill-context-parallel --moe-dense-tp-size 1 \ ---speculative-algorithm NEXTN --speculative-num-steps 1 --speculative-eagle-topk 1 --speculative-num-draft-tokens 2 \ ---dist-init-addr ${IPs[0]}:10000 -``` - -Deploy Decode Instance - -```shell -echo performance | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor -sysctl -w vm.swappiness=0 -sysctl -w kernel.numa_balancing=0 -sysctl -w kernel.sched_migration_cost_ns=50000 - -export SGLANG_SET_CPU_AFFINITY=1 -unset https_proxy -unset http_proxy -unset HTTPS_PROXY -unset HTTP_PROXY -unset ASCEND_LAUNCH_BLOCKING -source /usr/local/Ascend/ascend-toolkit/set_env.sh -source /usr/local/Ascend/nnal/atb/set_env.sh -export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/vendors/customize/op_api/lib/:${LD_LIBRARY_PATH} -export PATH=/usr/local/Ascend/8.5.0/compiler/bishengir/bin:$PATH -export ASCEND_HOME_PATH=/usr/local/Ascend/ascend-toolkit/latest - -export 
PYTORCH_NPU_ALLOC_CONF=expandable_segments:True -export STREAMS_PER_DEVICE=32 - -MODEL_PATH=xxx + if [[ "$LOCAL_HOST1" == "${P_IP[$i]}" || "$LOCAL_HOST2" == "${P_IP[$i]}" ]]; + then + echo "${P_IP[$i]}" + export HCCL_BUFFSIZE=1200 + export DEEP_NORMAL_MODE_USE_INT8_QUANT=1 + export TASK_QUEUE_ENABLE=2 + export HCCL_SOCKET_IFNAME=xxx + export GLOO_SOCKET_IFNAME=xxx -export SGLANG_NPU_USE_MULTI_STREAM=1 -export SGLANG_NPU_USE_MLAPO=1 -export HCCL_OP_EXPANSION_MODE=AIV -export SGLANG_SCHEDULER_SKIP_ALL_GATHER=1 -export TASK_QUEUE_ENABLE=0 -export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=1 -export SGLANG_ENABLE_SPEC_V2=1 + python3 -m sglang.launch_server --model-path ${MODEL_PATH} \ + --tp 32 \ + --trust-remote-code \ + --attention-backend ascend \ + --device npu \ + --watchdog-timeout 9000 \ + --host ${P_IP[$i]} --port 8000 \ + --mem-fraction-static 0.73 \ + --disable-radix-cache --chunked-prefill-size -1 --max-prefill-tokens 68000 \ + --max-running-requests 1 \ + --moe-a2a-backend deepep --deepep-mode normal \ + --quantization modelslim \ + --disaggregation-transfer-backend ascend \ + --disaggregation-mode prefill \ + --disable-cuda-graph \ + --nnodes 2 --node-rank $i \ + --disaggregation-bootstrap-port 8995 \ + --moe-dense-tp-size 1 \ + --enable-nsa-prefill-context-parallel \ + --nsa-prefill-cp-mode in-seq-split \ + --attn-cp-size 32 \ + --speculative-algorithm NEXTN --speculative-num-steps 1 --speculative-eagle-topk 1 --speculative-num-draft-tokens 2 \ + --dist-init-addr ${P_IP[0]}:10000 + break + fi +done -IPs=('your decode ip1' 'your decode ip2') -export prefill_ip=your prefill ip1 -# get IP in current node -LOCAL_HOST=`hostname -I|awk -F " " '{print$1}'` -echo "LOCAL_HOST = " ${LOCAL_HOST} -# get node index -for i in "${!IPs[@]}"; +# decode +for i in "${!D_IP[@]}"; do - echo "LOCAL_HOST=${LOCAL_HOST}, IPs[${i}]=${IPs[$i]}" - if [ "$LOCAL_HOST" == "${IPs[$i]}" ]; then - echo "Node Rank : ${i}" - VC_TASK_INDEX=$i - break - fi -done - -IFNAMES=('xxx' 'xxx') + if [[ 
"$LOCAL_HOST1" == "${D_IP[$i]}" || "$LOCAL_HOST2" == "${D_IP[$i]}" ]]; + then + echo "${D_IP[$i]}" + export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=1 + export SGLANG_ENABLE_SPEC_V2=1 -export HCCL_SOCKET_IFNAME=${IFNAMES[$VC_TASK_INDEX]} -export GLOO_SOCKET_IFNAME=${HCCL_SOCKET_IFNAME} -nnodes=${#IPs[@]} -tp_size=`expr 16 \* ${nnodes}` -export ASCEND_MF_STORE_URL=tcp://${prefill_ip}:24667 + export TASK_QUEUE_ENABLE=0 + export SGLANG_SCHEDULER_SKIP_ALL_GATHER=1 -CHUNKED_SIZE=65536 -DP=8 -export HCCL_BUFFSIZE=400 -export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=8 + export HCCL_SOCKET_IFNAME=xxx + export GLOO_SOCKET_IFNAME=xxx -python3 -m sglang.launch_server --model-path ${MODEL_PATH} \ ---tp $tp_size \ ---dp ${DP} \ ---ep $tp_size \ ---moe-dense-tp-size 1 \ ---enable-dp-attention \ ---enable-dp-lm-head \ ---trust-remote-code \ ---attention-backend ascend \ ---device npu \ ---watchdog-timeout 9000 \ ---host ${IPs[$VC_TASK_INDEX]} --port 8001 \ ---mem-fraction-static 0.79 \ ---disable-radix-cache \ ---chunked-prefill-size -1 --max-prefill-tokens 68000 \ ---max-running-requests 32 \ ---cuda-graph-max-bs 4 \ ---moe-a2a-backend deepep \ ---deepep-mode low_latency \ ---quantization modelslim \ ---speculative-algorithm NEXTN --speculative-num-steps 3 --speculative-eagle-topk 1 --speculative-num-draft-tokens 4 \ ---disaggregation-transfer-backend ascend \ ---disaggregation-mode decode \ ---load-balance-method round_robin \ ---nnodes $nnodes --node-rank $VC_TASK_INDEX \ ---dist-init-addr ${IPs[0]}:10000 --load-balance-method decode_round_robin + DP=8 + export HCCL_BUFFSIZE=400 + export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=8 + + python3 -m sglang.launch_server --model-path ${MODEL_PATH} \ + --tp 32 \ + --dp ${DP} \ + --ep 32 \ + --moe-dense-tp-size 1 \ + --enable-dp-attention \ + --enable-dp-lm-head \ + --trust-remote-code \ + --attention-backend ascend \ + --device npu \ + --watchdog-timeout 9000 \ + --host ${D_IP[$i]} --port 8001 \ + --mem-fraction-static 0.79 \ + 
--disable-radix-cache \ + --chunked-prefill-size -1 --max-prefill-tokens 68000 \ + --max-running-requests 32 \ + --cuda-graph-max-bs 4 \ + --moe-a2a-backend deepep \ + --deepep-mode low_latency \ + --quantization modelslim \ + --speculative-algorithm NEXTN --speculative-num-steps 3 --speculative-eagle-topk 1 --speculative-num-draft-tokens 4 \ + --disaggregation-transfer-backend ascend \ + --disaggregation-mode decode \ + --nnodes 2 --node-rank $i \ + --prefill-round-robin-balance \ + --dist-init-addr ${D_IP[0]}:10000 + break + fi +done ``` + ```shell export SGLANG_DP_ROUND_ROBIN=1 python -m sglang_router.launch_router \ --pd-disaggregation \ --policy cache_aware \ - --prefill http://PIP1:8000 8995 \ - --decode http://DIP1:8001 \ + --prefill http://P_IP1:8000 8995 \ + --decode http://D_IP1:8001 \ --host 127.0.0.1 \ --port 6688 \ --mini-lb @@ -986,7 +934,7 @@ python -m sglang_router.launch_router \ We tested it based on the `RANDOM` dataset. ```shell -python -m sglang.bench_serving --dataset-name random --backend sglang --host 127.0.0.1 --port 6688 --max-concurrency 32 --random-input-len 64000 --random-output-len 3000 --num-prompts 64 --random-range-ratio 1 +python -m sglang.bench_serving --dataset-name random --backend sglang --host 127.0.0.1 --port 6688 --max-concurrency 8 --random-input-len 131072 --random-output-len 1024 --num-prompts 8 --random-range-ratio 1 ``` ### Qwen3-235B-A22B 3_5K-1_5K 50ms on A3 24 Cards Separation Mode