244 changes: 96 additions & 148 deletions docs/platforms/ascend/ascend_npu_best_practice.md
| Deepseek-R1 | Atlas 800I A3 | 32 | PD Separation | 3.9K+1K | 20ms | W8A8 INT8 | [Optimal Configuration](#deepseek-r1-3_9k-1k-20ms-on-a3-32-cards-separation-mode) |
| Deepseek-R1 | Atlas 800I A3 | 32 | PD Separation | 3.5K+1.5K | 20ms | W8A8 INT8 | [Optimal Configuration](#deepseek-r1-3_5k-1_5k-20ms-on-a3-32-cards-separation-mode) |
| Deepseek-R1 | Atlas 800I A3 | 32 | PD Separation | 3.5K+1K | 20ms | W8A8 INT8 | [Optimal Configuration](#deepseek-r1-3_5k-1k-20ms-on-a3-32-cards-separation-mode) |
| DeepSeek-V3.2 | Atlas 800I A3 | 32 | PD Separation | 128K+1K | 20ms | W8A8 INT8 | [Optimal Configuration](#deepseek-v32-128k-1k-20ms-on-a3-32-cards-separation-mode) |

### High Throughput

We tested it based on the `RANDOM` dataset.

```shell
python -m sglang.bench_serving --dataset-name random --backend sglang --host 127.0.0.1 --port 6688 --max-concurrency 384 --random-input-len 3500 --random-output-len 1500 --num-prompts 1536 --random-range-ratio 1
```

### DeepSeek-V3.2 128K-1K 20ms on A3 32 Cards Separation Mode

Model: DeepSeek-V3.2-W8A8

Hardware: Atlas 800I A3 32Card

DeployMode: PD Separation

Dataset: random

Input Output Length: 128K+1K

TPOT: 20ms
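As a quick back-of-envelope check on this target (the arithmetic below is ours, not part of the original configuration): at 20 ms per output token, a ~1K-token response spends roughly 20 s in decode.

```shell
# TPOT (ms/token) multiplied by output length gives the decode-time budget.
tpot_ms=20
output_tokens=1024
decode_s=$(( tpot_ms * output_tokens / 1000 ))
echo "~${decode_s}s decode time per request"   # prints "~20s decode time per request"
```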

#### Model Deployment

Deploy Prefill and Decode Instances

The same script runs on every node: a node launches the prefill or decode server depending on which IP list (`P_IP` or `D_IP`) contains one of its local addresses.

```shell
echo performance | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
sysctl -w vm.swappiness=0
source /usr/local/Ascend/nnal/atb/set_env.sh
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/vendors/customize/op_api/lib/:${LD_LIBRARY_PATH}
export PATH=/usr/local/Ascend/8.5.0/compiler/bishengir/bin:$PATH

export ASCEND_HOME_PATH=/usr/local/Ascend/ascend-toolkit/latest

export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
export STREAMS_PER_DEVICE=32
export ASCEND_MF_STORE_URL="tcp://your prefill ip1:24670"

export HCCL_BUFFSIZE=1024
export DEEPEP_NORMAL_LONG_SEQ_ROUND=5
export DEEPEP_NORMAL_LONG_SEQ_PER_ROUND_TOKENS=512

P_IP=('your prefill ip1' 'your prefill ip2')
D_IP=('your decode ip1' 'your decode ip2')
MODEL_PATH=xxx

LOCAL_HOST1=`hostname -I|awk -F " " '{print$1}'`
LOCAL_HOST2=`hostname -I|awk -F " " '{print$2}'`
echo "${LOCAL_HOST1}"
echo "${LOCAL_HOST2}"

# prefill
for i in "${!P_IP[@]}";
do

if [[ "$LOCAL_HOST1" == "${P_IP[$i]}" || "$LOCAL_HOST2" == "${P_IP[$i]}" ]];
then
echo "${P_IP[$i]}"
export HCCL_BUFFSIZE=1200
export DEEP_NORMAL_MODE_USE_INT8_QUANT=1
export TASK_QUEUE_ENABLE=2
export HCCL_SOCKET_IFNAME=xxx
export GLOO_SOCKET_IFNAME=xxx

export SGLANG_NPU_USE_MULTI_STREAM=1
export SGLANG_NPU_USE_MLAPO=1
export HCCL_OP_EXPANSION_MODE=AIV
export SGLANG_SCHEDULER_SKIP_ALL_GATHER=1
export TASK_QUEUE_ENABLE=0
export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=1
export SGLANG_ENABLE_SPEC_V2=1
python3 -m sglang.launch_server --model-path ${MODEL_PATH} \
--tp 32 \
--trust-remote-code \
--attention-backend ascend \
--device npu \
--watchdog-timeout 9000 \
--host ${P_IP[$i]} --port 8000 \
--mem-fraction-static 0.73 \
--disable-radix-cache --chunked-prefill-size -1 --max-prefill-tokens 68000 \
--max-running-requests 1 \
--moe-a2a-backend deepep --deepep-mode normal \
--quantization modelslim \
--disaggregation-transfer-backend ascend \
--disaggregation-mode prefill \
--disable-cuda-graph \
--nnodes 2 --node-rank $i \
--disaggregation-bootstrap-port 8995 \
--moe-dense-tp-size 1 \
--enable-nsa-prefill-context-parallel \
--nsa-prefill-cp-mode in-seq-split \
--attn-cp-size 32 \
--speculative-algorithm NEXTN --speculative-num-steps 1 --speculative-eagle-topk 1 --speculative-num-draft-tokens 2 \
--dist-init-addr ${P_IP[0]}:10000
break
fi
done

# decode
for i in "${!D_IP[@]}";
do

if [[ "$LOCAL_HOST1" == "${D_IP[$i]}" || "$LOCAL_HOST2" == "${D_IP[$i]}" ]];
then
echo "${D_IP[$i]}"
export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=1
export SGLANG_ENABLE_SPEC_V2=1

export TASK_QUEUE_ENABLE=0
export SGLANG_SCHEDULER_SKIP_ALL_GATHER=1

export HCCL_SOCKET_IFNAME=xxx
export GLOO_SOCKET_IFNAME=xxx

DP=8
export HCCL_BUFFSIZE=400
export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=8

python3 -m sglang.launch_server --model-path ${MODEL_PATH} \
--tp 32 \
--dp ${DP} \
--ep 32 \
--moe-dense-tp-size 1 \
--enable-dp-attention \
--enable-dp-lm-head \
--trust-remote-code \
--attention-backend ascend \
--device npu \
--watchdog-timeout 9000 \
--host ${D_IP[$i]} --port 8001 \
--mem-fraction-static 0.79 \
--disable-radix-cache \
--chunked-prefill-size -1 --max-prefill-tokens 68000 \
--max-running-requests 32 \
--cuda-graph-max-bs 4 \
--moe-a2a-backend deepep \
--deepep-mode low_latency \
--quantization modelslim \
--speculative-algorithm NEXTN --speculative-num-steps 3 --speculative-eagle-topk 1 --speculative-num-draft-tokens 4 \
--disaggregation-transfer-backend ascend \
--disaggregation-mode decode \
--nnodes 2 --node-rank $i \
--prefill-round-robin-balance \
--dist-init-addr ${D_IP[0]}:10000
break
fi
done
```
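The per-node role selection used in the launch script can be illustrated in isolation. This toy sketch uses hypothetical addresses (`10.0.0.1`, `10.0.0.2`) in place of the real prefill IPs; the node scans the IP list and adopts the index of the first entry matching one of its own addresses as its node rank:

```shell
# Hypothetical prefill IP list and a fake local address, for illustration only.
P_IP=('10.0.0.1' '10.0.0.2')
LOCAL_HOST1='10.0.0.2'

rank=""
for i in "${!P_IP[@]}"; do
  # The first list entry equal to a local address determines this node's rank.
  if [ "$LOCAL_HOST1" = "${P_IP[$i]}" ]; then
    rank=$i
    break
  fi
done
echo "prefill node rank: ${rank}"   # prints "prefill node rank: 1"
```

The real script applies the same pattern twice, once against `P_IP` and once against `D_IP`, so a single script can be distributed unchanged to every node.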


```shell
export SGLANG_DP_ROUND_ROBIN=1
python -m sglang_router.launch_router \
--pd-disaggregation \
--policy cache_aware \
--prefill http://P_IP1:8000 8995 \
--decode http://D_IP1:8001 \
--host 127.0.0.1 \
--port 6688 \
--mini-lb
Expand All @@ -986,7 +934,7 @@ python -m sglang_router.launch_router \
We tested it based on the `RANDOM` dataset.

```shell
python -m sglang.bench_serving --dataset-name random --backend sglang --host 127.0.0.1 --port 6688 --max-concurrency 8 --random-input-len 131076 --random-output-len 1024 --num-prompts 8 --random-range-ratio 1
```
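To gauge the load this command generates (our arithmetic, using the parameters above): 8 prompts of 131076 input tokens each amount to roughly 1M prefill tokens in total.

```shell
# Total prefill workload implied by the benchmark parameters.
num_prompts=8
input_len=131076
total=$(( num_prompts * input_len ))
echo "total input tokens: ${total}"   # prints "total input tokens: 1048608"
```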

### Qwen3-235B-A22B 3_5K-1_5K 50ms on A3 24 Cards Separation Mode