diff --git a/.github/workflows/pr-test-npu.yml b/.github/workflows/pr-test-npu.yml
index ea5c07ba9..8696732a4 100644
--- a/.github/workflows/pr-test-npu.yml
+++ b/.github/workflows/pr-test-npu.yml
@@ -55,30 +55,161 @@ jobs:
       - name: Run test intranode
         timeout-minutes: 10
         env:
-          HCCL_BUFFSIZE: 3000
+          HCCL_BUFFSIZE: 2300
         run: |
           python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py
 
+      - name: Run test intranode for little bs
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 4065
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --num-tokens=4
+
+      - name: Run test intranode for big bs
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 4065
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --num-tokens=8192
+
       - name: Run test multi-round intranode
         timeout-minutes: 10
         env:
           DEEPEP_NORMAL_LONG_SEQ_ROUND: 5
           DEEPEP_NORMAL_LONG_SEQ_PER_ROUND_TOKENS: 512
-          DEEPEP_NORMAL_COMBINE_ENABLE_LONG_SEQ: 1
           HCCL_BUFFSIZE: 1000
         run: |
           python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --num-tokens=2122
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --num-tokens=2048
+
+      - name: Run test little processes intranode
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 2241
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --num-processes=2
+
+      - name: Run test hidden intranode
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 2300
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --hidden=2048
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --hidden=4096
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --hidden=6144
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --hidden=7168
+
+      - name: Run test topk num intranode
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 4065
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --num-topk=1
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --num-topk=16
+
+      - name: Run test intranode for active ranks
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 2300
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --active-ranks="0,1,3"
+
+      - name: Run test intranode for DeepXtrace
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 2300
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --enable-diagnose
+
+      - name: Run test intranode for int8 quant
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 2300
+          DEEP_NORMAL_MODE_USE_INT8_QUANT: 1
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py
+
+      - name: Run test intranode for output parameters of different types
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 2300
+          MOE_EXPERT_TOKEN_NUMS_TYPE: 0
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py
+
+      - name: Run test intranode for dynamic tokens
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 2300
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --enable-dynamic-tokens
 
       - name: Run test low latency
         timeout-minutes: 10
         env:
-          HCCL_BUFFSIZE: 3000
-          MOE_ENABLE_TOPK_NEG_ONE: 1
+          HCCL_BUFFSIZE: 1913
         run: |
           python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py
           python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-tokens=1
           python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-tokens=2
 
+      - name: Run test low latency for big bs
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 3825
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-tokens=512
+
+      - name: Run test low latency for little num processes
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 1913
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-processes=2
+
+      - name: Run test low latency for hidden
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 1913
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --hidden=2048
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --hidden=4096
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --hidden=6144
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --hidden=7168
+
+      - name: Run test low latency for topk
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 1969
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-topk=4
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-topk=16
+
+      - name: Run test low latency for experts
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 7481
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-processes=2 --num-topk=1 --num-experts=2
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-experts=1024
+
+      - name: Run test low latency for drop percent
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 1913
+          MOE_ENABLE_TOPK_NEG_ONE: 1
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --drop-percent=0.3
+
+      - name: Run test low latency for dynamic tokens
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 2300
+          MOE_ENABLE_TOPK_NEG_ONE: 1
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --enable-dynamic-tokens
+
       - name: Run test base fused deep moe
         timeout-minutes: 10
         env:
@@ -112,21 +243,99 @@ jobs:
           python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --num-tokens 2 --topk-drop-col 3 --num-experts 16
           python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --num-tokens 4 --topk-drop-col 1 --num-experts 32
 
+      - name: Run test fused deepep moe for little processes
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 1000
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --num-processes=2 --num-experts=24
+
+      - name: Run test fused deepep moe for bs
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 1913
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --num-tokens=256
+
+      - name: Run test fused deepep moe for hidden
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 1913
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --hidden=2048
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --hidden=7168
+
+      - name: Run test fused deepep moe for topk
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 3000
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --num-topk=1
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --num-topk=12
+
+      - name: Run test fused deepep moe for experts
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 3000
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --num-processes=2 --num-topk=1 --num-experts=4
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --num-experts=384
+
       - name: Run test mixed running
         timeout-minutes: 10
         env:
           HCCL_BUFFSIZE: 3000
-          MOE_ENABLE_TOPK_NEG_ONE: 1
         run: |
           python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py
 
+      - name: Run test mixed running for little processes
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 2241
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --num-processes=2 --test-loop=100
+
+      - name: Run test mixed running for bs
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 4065
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --normal-num-tokens=1 --low-latency-num-tokens=1 --test-loop=100
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --normal-num-tokens=8192 --low-latency-num-tokens=512 --test-loop=100
+
+      - name: Run test mixed running for hidden
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 2241
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --hidden=2048 --test-loop=100
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --hidden=4096 --test-loop=100
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --hidden=6144 --test-loop=100
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --hidden=7168 --test-loop=100
+
+      - name: Run test mixed running for topk
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 4065
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --num-topk=1 --test-loop=100
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --num-topk=16 --test-loop=100
+
+      - name: Run test mixed running for dynamic tokens
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 3769
+          MOE_ENABLE_TOPK_NEG_ONE: 1
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --test-loop=100 --enable-dynamic-tokens
+
       - name: Run test generalization of fused deep moe
         timeout-minutes: 10
         env:
           HCCL_BUFFSIZE: 2048
         run: bash scripts/generalization_test_fused_deep_moe.sh
 
-  test-build-deepep:
+  test-build-deepep-a3:
     if: (github.repository == 'sgl-project/sgl-kernel-npu' || github.event_name == 'pull_request') &&
         github.event.pull_request.draft == false
     runs-on: linux-aarch64-a3-16
@@ -163,20 +372,161 @@ jobs:
       - name: Run test intranode
         timeout-minutes: 10
         env:
-          HCCL_BUFFSIZE: 3000
+          HCCL_BUFFSIZE: 2300
         run: |
           python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py
 
+      - name: Run test intranode for little bs
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 4065
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --num-tokens=4
+
+      - name: Run test intranode for big bs
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 4065
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --num-tokens=8192
+
+      - name: Run test multi-round intranode
+        timeout-minutes: 10
+        env:
+          DEEPEP_NORMAL_LONG_SEQ_ROUND: 5
+          DEEPEP_NORMAL_LONG_SEQ_PER_ROUND_TOKENS: 512
+          HCCL_BUFFSIZE: 1000
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --num-tokens=2122
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --num-tokens=2048
+
+      - name: Run test little processes intranode
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 2241
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --num-processes=2
+
+      - name: Run test hidden intranode
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 2300
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --hidden=2048
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --hidden=4096
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --hidden=6144
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --hidden=7168
+
+      - name: Run test topk num intranode
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 4065
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --num-topk=1
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --num-topk=16
+
+      - name: Run test intranode for active ranks
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 2300
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --active-ranks="0,1,3"
+
+      - name: Run test intranode for DeepXtrace
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 2300
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --enable-diagnose
+
+      - name: Run test intranode for int8 quant
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 2300
+          DEEP_NORMAL_MODE_USE_INT8_QUANT: 1
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py
+
+      - name: Run test intranode for output parameters of different types
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 2300
+          MOE_EXPERT_TOKEN_NUMS_TYPE: 0
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py
+
+      - name: Run test intranode for dynamic tokens
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 2300
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --enable-dynamic-tokens
+
       - name: Run test low latency
         timeout-minutes: 10
         env:
-          HCCL_BUFFSIZE: 3000
-          MOE_ENABLE_TOPK_NEG_ONE: 1
+          HCCL_BUFFSIZE: 1913
         run: |
           python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py
           python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-tokens=1
           python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-tokens=2
 
+      - name: Run test low latency for big bs
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 3825
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-tokens=512
+
+      - name: Run test low latency for little num processes
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 1913
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-processes=2
+
+      - name: Run test low latency for hidden
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 1913
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --hidden=2048
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --hidden=4096
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --hidden=6144
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --hidden=7168
+
+      - name: Run test low latency for topk
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 1969
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-topk=4
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-topk=16
+
+      - name: Run test low latency for experts
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 7481
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-processes=2 --num-topk=1 --num-experts=2
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-experts=1024
+
+      - name: Run test low latency for drop percent
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 1913
+          MOE_ENABLE_TOPK_NEG_ONE: 1
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --drop-percent=0.3
+
+      - name: Run test low latency for dynamic tokens
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 2300
+          MOE_ENABLE_TOPK_NEG_ONE: 1
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --enable-dynamic-tokens
+
       - name: Run test base fused deep moe
         timeout-minutes: 10
         env:
@@ -210,25 +560,348 @@ jobs:
           python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --num-tokens 2 --topk-drop-col 3 --num-experts 16
           python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --num-tokens 4 --topk-drop-col 1 --num-experts 32
 
+      - name: Run test fused deepep moe for little processes
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 1000
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --num-processes=2 --num-experts=24
+
+      - name: Run test fused deepep moe for bs
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 1913
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --num-tokens=256
+
+      - name: Run test fused deepep moe for hidden
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 1913
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --hidden=2048
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --hidden=7168
+
+      - name: Run test fused deepep moe for topk
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 3000
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --num-topk=1
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --num-topk=12
+
+      - name: Run test fused deepep moe for experts
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 3000
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --num-processes=2 --num-topk=1 --num-experts=4
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --num-experts=384
+
       - name: Run test mixed running
         timeout-minutes: 10
         env:
           HCCL_BUFFSIZE: 3000
-          MOE_ENABLE_TOPK_NEG_ONE: 1
         run: |
           python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py
 
+      - name: Run test mixed running for little processes
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 2241
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --num-processes=2 --test-loop=100
+
+      - name: Run test mixed running for bs
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 4065
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --normal-num-tokens=1 --low-latency-num-tokens=1 --test-loop=100
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --normal-num-tokens=8192 --low-latency-num-tokens=512 --test-loop=100
+
+      - name: Run test mixed running for hidden
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 2241
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --hidden=2048 --test-loop=100
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --hidden=4096 --test-loop=100
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --hidden=6144 --test-loop=100
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --hidden=7168 --test-loop=100
+
+      - name: Run test mixed running for topk
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 4065
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --num-topk=1 --test-loop=100
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --num-topk=16 --test-loop=100
+
+      - name: Run test mixed running for dynamic tokens
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 3769
+          MOE_ENABLE_TOPK_NEG_ONE: 1
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --test-loop=100 --enable-dynamic-tokens
+
       - name: Run test generalization of fused deep moe
         timeout-minutes: 10
         env:
           HCCL_BUFFSIZE: 2048
         run: bash scripts/generalization_test_fused_deep_moe.sh
 
+  test-build-deepep-a2:
+    if: (github.repository == 'sgl-project/sgl-kernel-npu' || github.event_name == 'pull_request') &&
+        github.event.pull_request.draft == false
+    runs-on: linux-aarch64-a2-8
+    container:
+      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-910b-ubuntu22.04-py3.11
+    steps:
+      - name: Clean git config
+        run: |
+          CONFIG_KEY='http.https://gh-proxy.test.osinfra.cn/.extraheader'
+          git config --global --unset "$CONFIG_KEY" || true
+
+      - name: Clean workspace  # ':?' aborts the step instead of letting the globs expand to /* and /.* when the variable is empty or unset
+        run: |
+          sudo rm -rf --one-file-system "${GITHUB_WORKSPACE:?}"/* "${GITHUB_WORKSPACE:?}"/.* 2>/dev/null || true
+
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          clean: true
+
+      - name: Install dependencies
+        run: |
+          # speed up by using infra cache services (dots are escaped so only the literal Ubuntu mirror hostnames are rewritten)
+          CACHING_URL="cache-service.nginx-pypi-cache.svc.cluster.local"
+          sed -Ei "s@(ports|archive)\.ubuntu\.com@${CACHING_URL}:8081@g" /etc/apt/sources.list
+          pip config set global.index-url http://${CACHING_URL}/pypi/simple
+          pip config set global.trusted-host ${CACHING_URL}
+
+          bash scripts/npu_ci_install_dependency.sh
+
+      - name: Prepare Deepep
+        run: bash scripts/prepare_deepep_in_container.sh -a deepep2
+
+      - name: Run test intranode
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 3000
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --num-processes=8
+
+      - name: Run test intranode for little bs
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 2300
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --num-tokens=1 --num-processes=8
+
+      - name: Run test intranode for big bs
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 4065
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --num-tokens=7168 --num-processes=8
+
+      - name: Run test little processes intranode
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 3000
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --num-processes=2
+
+      - name: Run test hidden intranode
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 3000
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --hidden=2048 --num-processes=8
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --hidden=4096 --num-processes=8
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --hidden=6144 --num-processes=8
+
+      - name: Run test topk num intranode
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 4500
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --num-topk=1 --num-processes=8
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --num-topk=16 --num-processes=8
+
+      - name: Run test experts num intranode
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 3000
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --num-processes=2 --num-topk=1 --num-experts=2
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --num-experts=512 --num-processes=8
+
+      - name: Run test intranode for active ranks
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 3000
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --num-processes=8 --active-ranks="0,1,3"
+
+      - name: Run test intranode for DeepXtrace
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 3000
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --enable-diagnose --num-processes=8
+
+      - name: Run test intranode for int8 quant
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 3000
+          DEEP_NORMAL_MODE_USE_INT8_QUANT: 1
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --num-processes=8
+
+      - name: Run test intranode for output parameters of different types
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 3000
+          MOE_EXPERT_TOKEN_NUMS_TYPE: 0
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --num-processes=8
+
+      - name: Run test intranode for dynamic tokens
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 3000
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --enable-dynamic-tokens --num-processes=8
+
+      - name: Run test low latency
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 1913
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-processes=8
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-tokens=1 --num-processes=8
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-tokens=2 --num-processes=8
+
+      - name: Run test low latency for big bs
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 3825
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-processes=8 --num-tokens=512
+
+      - name: Run test low latency for little num processes
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 1913
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-processes=2
+
+      - name: Run test low latency for hidden
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 1913
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --hidden=2048 --num-processes=8
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --hidden=4096 --num-processes=8
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --hidden=6144 --num-processes=8
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --hidden=7168 --num-processes=8
+
+      - name: Run test low latency for topk
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 1969
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-topk=4 --num-processes=8
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-topk=16 --num-processes=8
+
+      - name: Run test low latency for experts
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 7481
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-processes=2 --num-topk=1 --num-experts=2
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-experts=1024 --num-processes=8
+
+      - name: Run test low latency for drop percent
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 1913
+          MOE_ENABLE_TOPK_NEG_ONE: 1
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --drop-percent=0.3 --num-processes=8
+
+      - name: Run test low latency for dynamic tokens
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 2300
+          MOE_ENABLE_TOPK_NEG_ONE: 1
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-processes=8 --enable-dynamic-tokens
+
+      - name: Run test mixed running
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 3000
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --num-processes=8
+
+      - name: Run test mixed running for little processes
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 3000
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --num-processes=2 --test-loop=100
+
+      - name: Run test mixed running for bs
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 5000
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --normal-num-tokens=1 --low-latency-num-tokens=1 --test-loop=100 --num-processes=8
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --normal-num-tokens=8000 --low-latency-num-tokens=488 --test-loop=100 --num-processes=8
+
+      - name: Run test mixed running for hidden
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 3000
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --hidden=2048 --test-loop=100 --num-processes=8
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --hidden=4096 --test-loop=100 --num-processes=8
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --hidden=6144 --test-loop=100 --num-processes=8
+
+      - name: Run test mixed running for topk
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 5000
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --num-topk=1 --test-loop=100 --num-processes=8
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --num-topk=16 --test-loop=100 --num-processes=8
+
+      - name: Run test mixed running for experts
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 3769
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --num-processes=2 --num-topk=1 --num-experts=2 --test-loop=100
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --num-experts=512 --test-loop=100 --num-processes=8
+
+      - name: Run test mixed running for dynamic tokens
+        timeout-minutes: 10
+        env:
+          HCCL_BUFFSIZE: 3769
+          MOE_ENABLE_TOPK_NEG_ONE: 1
+        run: |
+          python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --test-loop=100 --enable-dynamic-tokens --num-processes=8
+
   finish:
     if: always()
     needs:
       - test-all-build
-      - test-build-deepep
+      - test-build-deepep-a3
+      - test-build-deepep-a2
     runs-on: ubuntu-latest
     steps:
       - name: Check all dependent job statuses
diff --git a/tests/python/deepep/test_internode.py b/tests/python/deepep/test_internode.py
index 5c519fa6e..182956949 100644
--- a/tests/python/deepep/test_internode.py
+++ b/tests/python/deepep/test_internode.py
@@ -38,24 +38,28 @@ def test_main(
     base_num_tokens, hidden = args.num_tokens, args.hidden
     num_topk, num_experts = args.num_topk, args.num_experts
     enable_diagnose = args.enable_diagnose
+    enable_dynamic_tokens = args.enable_dynamic_tokens
     num_servers = num_ranks // num_local_ranks
     num_nodes = num_servers
     expert_token_nums_type = int(os.getenv("MOE_EXPERT_TOKEN_NUMS_TYPE", 1))
 
-    fluctuation_percentage = 0.1
-    min_fluctuation = 2
+    if enable_dynamic_tokens:
+        fluctuation_percentage = 0.1
+        min_fluctuation = 2
 
-    if base_num_tokens < 10:
-        fluctuation = random.randint(-min_fluctuation, min_fluctuation)
-        num_tokens = base_num_tokens + fluctuation
-    else:
-        fluctuation = random.uniform(
-            1 - fluctuation_percentage, 1 + fluctuation_percentage
-        )
-        num_tokens = int(base_num_tokens * fluctuation)
+        if base_num_tokens < 10:
+            fluctuation = random.randint(-min_fluctuation, min_fluctuation)
+            num_tokens = base_num_tokens + fluctuation
+        else:
+            fluctuation = random.uniform(
+                1 - fluctuation_percentage, 1 + fluctuation_percentage
+            )
+            num_tokens = int(base_num_tokens * fluctuation)
 
-    # Ensure num_tokens is at least 1
-    num_tokens = max(num_tokens, 1)
+        # Clamp to >= 1: a small base plus a negative fluctuation can be <= 0
+        num_tokens = max(num_tokens, 1)
+    else:
+        num_tokens = base_num_tokens  # no fluctuation: use the base count as-is
 
     assert num_experts % num_ranks == 0 and num_nodes >= 2
     assert num_tokens <= MAX_BATCH_SIZE
@@ -659,6 +663,11 @@ def test_loop(local_rank: int, num_local_ranks: int, args: argparse.Namespace):
         default=-1,
         help="If >=0, drop this specific top-k column (set index to -1 for testing).",
     )
+    parser.add_argument(
+        "--enable-dynamic-tokens",
+        action="store_true",
+        help="Whether to enable dynamic tokens for testing",
+    )
     args = parser.parse_args()
 
     num_processes = args.num_processes
diff --git a/tests/python/deepep/test_intranode.py b/tests/python/deepep/test_intranode.py
index 265f99384..2e0830b25 100644
--- a/tests/python/deepep/test_intranode.py
+++ b/tests/python/deepep/test_intranode.py
@@ -34,23 +34,27 @@ def test_main(
     base_num_tokens, hidden = args.num_tokens, args.hidden
     num_topk, num_experts = args.num_topk, args.num_experts
     enable_diagnose = args.enable_diagnose
+    enable_dynamic_tokens = args.enable_dynamic_tokens
     num_servers = num_ranks // num_local_ranks
     expert_token_nums_type = int(os.getenv("MOE_EXPERT_TOKEN_NUMS_TYPE", 1))
 
-    fluctuation_percentage = 0.1
-    min_fluctuation = 2
+    if enable_dynamic_tokens:
+        fluctuation_percentage = 0.1
+        min_fluctuation = 2
 
-    if base_num_tokens < 10:
-        fluctuation = random.randint(-min_fluctuation, min_fluctuation)
-        num_tokens = base_num_tokens + fluctuation
-    else:
-        fluctuation = random.uniform(
-            1 - fluctuation_percentage, 1 + fluctuation_percentage
-        )
-        num_tokens = int(base_num_tokens * fluctuation)
+        if base_num_tokens < 10:
+            fluctuation = random.randint(-min_fluctuation, min_fluctuation)
+            num_tokens = base_num_tokens + fluctuation
+        else:
+            fluctuation = random.uniform(
+                1 - fluctuation_percentage, 1 + fluctuation_percentage
+            )
+            num_tokens = int(base_num_tokens * fluctuation)
 
-    # Ensure num_tokens is at least 1
-    num_tokens = max(num_tokens, 1)
+        # Clamp to >= 1: a small base plus a negative fluctuation can be <= 0
+        num_tokens = max(num_tokens, 1)
+    else:
+        num_tokens = base_num_tokens  # no fluctuation: use the base count as-is
 
     assert num_experts % num_ranks == 0
     if local_rank == 0:
@@ -543,6 +547,11 @@ def test_loop(local_rank: int, num_local_ranks: int, args: argparse.Namespace):
         default=-1,
         help="If >=0, drop this specific top-k column (set index to -1 for testing).",
     )
+    parser.add_argument(
+        "--enable-dynamic-tokens",
+        action="store_true",
+        help="Whether to enable dynamic tokens for testing",
+    )
     args = parser.parse_args()
 
     num_processes = args.num_processes
diff --git a/tests/python/deepep/test_low_latency.py b/tests/python/deepep/test_low_latency.py
index b2403cfa9..aa5e3e2c1 100644
--- a/tests/python/deepep/test_low_latency.py
+++ b/tests/python/deepep/test_low_latency.py
@@ -340,22 +340,28 @@ def test_loop(local_rank: int, num_local_ranks: int, args: argparse.Namespace):
     use_experts = num_experts if shared_expert_rank_num == 0 else (num_experts - 1)
     use_ranks = num_ranks - shared_expert_rank_num
     drop_percent = args.drop_percent
+    enable_dynamic_tokens = args.enable_dynamic_tokens
 
-    fluctuation_percentage = 0.1
-    min_fluctuation = 2
+    if enable_dynamic_tokens:
+        fluctuation_percentage = 0.1
+        min_fluctuation = 2
 
-    if base_num_tokens < 10:
-        fluctuation = random.randint(-min_fluctuation, min_fluctuation)
-        num_tokens = base_num_tokens + fluctuation
-    else:
-        fluctuation = random.uniform(
-            1 - fluctuation_percentage, 1 + fluctuation_percentage
-        )
-        num_tokens = int(base_num_tokens * fluctuation)
+        if base_num_tokens < 10:
+            fluctuation = random.randint(-min_fluctuation, min_fluctuation)
+            num_tokens = base_num_tokens + fluctuation
+        else:
+            fluctuation = random.uniform(
+                1 - fluctuation_percentage, 1 + fluctuation_percentage
+            )
+            num_tokens = int(base_num_tokens * fluctuation)
 
-    raw_num_tokens = max(num_tokens, 1)
+        raw_num_tokens = max(num_tokens, 1)  # fluctuation may yield <= 0
+    else:
+        raw_num_tokens = base_num_tokens  # no fluctuation: use the base count as-is
 
-    local_tokens_tensor = torch.tensor([num_tokens], dtype=torch.int32, device="npu")
+    local_tokens_tensor = torch.tensor(
+        [raw_num_tokens], dtype=torch.int32, device="npu"
+    )
     dist.all_reduce(local_tokens_tensor, op=dist.ReduceOp.MAX)
     aligned_num_tokens = local_tokens_tensor.item()
 
@@ -454,6 +460,11 @@ def test_loop(local_rank: int, num_local_ranks: int, args: argparse.Namespace):
         default=0.0,
         help="Percentage of dropping an individual top-k index (set to -1). ",
     )
+    parser.add_argument(
+        "--enable-dynamic-tokens",
+        action="store_true",
+        help="Whether to enable dynamic tokens for testing",
+    )
     args = parser.parse_args()
 
     num_processes = args.num_processes
diff --git a/tests/python/deepep/test_normal_and_low_latency.py b/tests/python/deepep/test_normal_and_low_latency.py
index 3f42444f5..545fe3752 100644
--- a/tests/python/deepep/test_normal_and_low_latency.py
+++ b/tests/python/deepep/test_normal_and_low_latency.py
@@ -135,6 +135,7 @@ def low_latency_test(
 def test_loop(local_rank: int, num_local_ranks: int, args: argparse.Namespace):
     rank, num_ranks, group = init_dist(local_rank, num_local_ranks)
     num_topk, num_experts, hidden = args.num_topk, args.num_experts, args.hidden
+    enable_dynamic_tokens = args.enable_dynamic_tokens
     assert num_experts % num_ranks == 0
     torch.manual_seed(rank)
 
@@ -146,17 +147,20 @@ def test_loop(local_rank: int, num_local_ranks: int, args: argparse.Namespace):
         fluctuation_percentage = 0.1
         min_fluctuation = 2
 
-        if base_normal_num_tokens < 10:
-            fluctuation = random.randint(-min_fluctuation, min_fluctuation)
-            normal_num_tokens = base_normal_num_tokens + fluctuation
+        if enable_dynamic_tokens:
+            if base_normal_num_tokens < 10:
+                fluctuation = random.randint(-min_fluctuation, min_fluctuation)
+                normal_num_tokens = base_normal_num_tokens + fluctuation
+            else:
+                fluctuation = random.uniform(
+                    1 - fluctuation_percentage, 1 + fluctuation_percentage
+                )
+                normal_num_tokens = int(base_normal_num_tokens * fluctuation)
+
+            # Clamp to >= 1: a small base plus a negative fluctuation can be <= 0
+            normal_num_tokens = max(normal_num_tokens, 1)
         else:
-            fluctuation = random.uniform(
-                1 - fluctuation_percentage, 1 + fluctuation_percentage
-            )
-            normal_num_tokens = int(base_normal_num_tokens * fluctuation)
-
-        # Ensure normal_num_tokens is at least 1
-        normal_num_tokens = max(normal_num_tokens, 1)
+            normal_num_tokens = base_normal_num_tokens
 
         if local_rank == 0:
             print(f"Start executing normal test loop {i} ...", flush=True)
@@ -172,17 +176,20 @@ def test_loop(local_rank: int, num_local_ranks: int, args: argparse.Namespace):
 
         base_low_latency_num_tokens = args.low_latency_num_tokens
 
-        if base_low_latency_num_tokens < 10:
-            fluctuation = random.randint(-min_fluctuation, min_fluctuation)
-            low_latency_num_tokens = base_low_latency_num_tokens + fluctuation
+        if enable_dynamic_tokens:
+            if base_low_latency_num_tokens < 10:
+                fluctuation = random.randint(-min_fluctuation, min_fluctuation)
+                low_latency_num_tokens = base_low_latency_num_tokens + fluctuation
+            else:
+                fluctuation = random.uniform(
+                    1 - fluctuation_percentage, 1 + fluctuation_percentage
+                )
+                low_latency_num_tokens = int(base_low_latency_num_tokens * fluctuation)
+
+            # Clamp to >= 1: a small base plus a negative fluctuation can be <= 0
+            low_latency_num_tokens = max(low_latency_num_tokens, 1)
         else:
-            fluctuation = random.uniform(
-                1 - fluctuation_percentage, 1 + fluctuation_percentage
-            )
-            low_latency_num_tokens = int(base_low_latency_num_tokens * fluctuation)
-
-        # Ensure low_latency_num_tokens is at least 1
-        low_latency_num_tokens = max(low_latency_num_tokens, 1)
+            low_latency_num_tokens = base_low_latency_num_tokens
 
         local_tokens_tensor = torch.tensor(
             [low_latency_num_tokens], dtype=torch.int32, device="npu"
@@ -246,6 +253,11 @@ def test_loop(local_rank: int, num_local_ranks: int, args: argparse.Namespace):
         default=1000,
         help="Number of test loop (default: 1000)",
     )
+    parser.add_argument(
+        "--enable-dynamic-tokens",
+        action="store_true",
+        help="Whether to enable dynamic tokens for testing",
+    )
 
     args = parser.parse_args()