diff --git a/.github/workflows/pr-test-npu.yml b/.github/workflows/pr-test-npu.yml index ea5c07ba9..8696732a4 100644 --- a/.github/workflows/pr-test-npu.yml +++ b/.github/workflows/pr-test-npu.yml @@ -55,30 +55,161 @@ jobs: - name: Run test intranode timeout-minutes: 10 env: - HCCL_BUFFSIZE: 3000 + HCCL_BUFFSIZE: 2300 run: | python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py + - name: Run test intranode for little bs + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 4065 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --num-tokens=4 + + - name: Run test intranode for big bs + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 4065 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --num-tokens=8192 + - name: Run test multi-round intranode timeout-minutes: 10 env: DEEPEP_NORMAL_LONG_SEQ_ROUND: 5 DEEPEP_NORMAL_LONG_SEQ_PER_ROUND_TOKENS: 512 - DEEPEP_NORMAL_COMBINE_ENABLE_LONG_SEQ: 1 HCCL_BUFFSIZE: 1000 run: | python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --num-tokens=2122 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --num-tokens=2048 + + - name: Run test little processes intranode + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 2241 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --num-processes=2 + + - name: Run test hidden intranode + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 2300 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --hidden=2048 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --hidden=4096 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --hidden=6144 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --hidden=7168 + + - name: Run test topk num intranode + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 4065 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --num-topk=1 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --num-topk=16 + + - name: Run test intranode for active ranks + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 2300 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --active-ranks="0,1,3" + + - name: Run test intranode for DeepXtrace + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 2300 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --enable-diagnose + + - name: Run test intranode for int8 quant + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 2300 + DEEP_NORMAL_MODE_USE_INT8_QUANT: 1 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py + + - name: Run test intranode for output parameters of different types + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 2300 + MOE_EXPERT_TOKEN_NUMS_TYPE: 0 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py + + - name: Run test intranode for dynamic tokens + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 2300 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --enable-dynamic-tokens - name: Run test low latency timeout-minutes: 10 env: - HCCL_BUFFSIZE: 3000 - MOE_ENABLE_TOPK_NEG_ONE: 1 + HCCL_BUFFSIZE: 1913 run: | python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-tokens=1 python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-tokens=2 + - name: Run test low latency for big bs + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 3825 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-tokens=512 + + - name: Run test low latency for little num processes + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 1913 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-processes=2 + + - name: Run test low latency for hidden + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 1913 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --hidden=2048 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --hidden=4096 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --hidden=6144 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --hidden=7168 + + - name: Run test low latency for topk + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 1969 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-topk=4 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-topk=16 + + - name: Run test low latency for experts + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 7481 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-processes=2 --num-topk=1 --num-experts=2 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-experts=1024 + + - name: Run test low latency for drop percent + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 1913 + MOE_ENABLE_TOPK_NEG_ONE: 1 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --drop-percent=0.3 + + - name: Run test low latency for dynamic tokens + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 2300 + MOE_ENABLE_TOPK_NEG_ONE: 1 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --enable-dynamic-tokens + - name: Run test base fused deep moe timeout-minutes: 10 env: @@ -112,21 +243,99 @@ jobs: python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --num-tokens 2 --topk-drop-col 3 --num-experts 16 python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --num-tokens 4 --topk-drop-col 1 --num-experts 32 + - name: Run test fused deepep moe for little processes + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 1000 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --num-processes=2 --num-experts=24 + + - name: Run test fused deepep moe for bs + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 1913 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --num-tokens=256 + + - name: Run test fused deepep moe for hidden + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 1913 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --hidden=2048 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --hidden=7168 + + - name: Run test fused deepep moe for topk + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 3000 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --num-topk=1 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --num-topk=12 + + - name: Run test fused deepep moe for experts + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 3000 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --num-processes=2 --num-topk=1 --num-experts=4 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --num-experts=384 + - name: Run test mixed running timeout-minutes: 10 env: HCCL_BUFFSIZE: 3000 - MOE_ENABLE_TOPK_NEG_ONE: 1 run: | python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py + - name: Run test mixed running for little processes + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 2241 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --num-processes=2 --test-loop=100 + + - name: Run test mixed running for bs + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 4065 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --normal-num-tokens=1 --low-latency-num-tokens=1 --test-loop=100 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --normal-num-tokens=8192 --low-latency-num-tokens=512 --test-loop=100 + + - name: Run test mixed running for hidden + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 2241 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --hidden=2048 --test-loop=100 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --hidden=4096 --test-loop=100 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --hidden=6144 --test-loop=100 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --hidden=7168 --test-loop=100 + + - name: Run test mixed running for topk + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 4065 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --num-topk=1 --test-loop=100 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --num-topk=16 --test-loop=100 + + - name: Run test mixed running for dynamic tokens + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 3769 + MOE_ENABLE_TOPK_NEG_ONE: 1 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --test-loop=100 --enable-dynamic-tokens + - name: Run test generalization of fused deep moe timeout-minutes: 10 env: HCCL_BUFFSIZE: 2048 run: bash scripts/generalization_test_fused_deep_moe.sh - test-build-deepep: + test-build-deepep-a3: if: (github.repository == 'sgl-project/sgl-kernel-npu' || github.event_name == 'pull_request') && github.event.pull_request.draft == false runs-on: linux-aarch64-a3-16 @@ -163,20 +372,161 @@ jobs: - name: Run test intranode timeout-minutes: 10 env: - HCCL_BUFFSIZE: 3000 + HCCL_BUFFSIZE: 2300 run: | python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py + - name: Run test intranode for little bs + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 4065 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --num-tokens=4 + + - name: Run test intranode for big bs + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 4065 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --num-tokens=8192 + + - name: Run test multi-round intranode + timeout-minutes: 10 + env: + DEEPEP_NORMAL_LONG_SEQ_ROUND: 5 + DEEPEP_NORMAL_LONG_SEQ_PER_ROUND_TOKENS: 512 + HCCL_BUFFSIZE: 1000 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --num-tokens=2122 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --num-tokens=2048 + + - name: Run test little processes intranode + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 2241 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --num-processes=2 + + - name: Run test hidden intranode + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 2300 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --hidden=2048 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --hidden=4096 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --hidden=6144 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --hidden=7168 + + - name: Run test topk num intranode + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 4065 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --num-topk=1 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --num-topk=16 + + - name: Run test intranode for active ranks + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 2300 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --active-ranks="0,1,3" + + - name: Run test intranode for DeepXtrace + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 2300 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --enable-diagnose + + - name: Run test intranode for int8 quant + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 2300 + DEEP_NORMAL_MODE_USE_INT8_QUANT: 1 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py + + - name: Run test intranode for output parameters of different types + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 2300 + MOE_EXPERT_TOKEN_NUMS_TYPE: 0 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py + + - name: Run test intranode for dynamic tokens + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 2300 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --enable-dynamic-tokens + - name: Run test low latency timeout-minutes: 10 env: - HCCL_BUFFSIZE: 3000 - MOE_ENABLE_TOPK_NEG_ONE: 1 + HCCL_BUFFSIZE: 1913 run: | python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-tokens=1 python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-tokens=2 + - name: Run test low latency for big bs + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 3825 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-tokens=512 + + - name: Run test low latency for little num processes + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 1913 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-processes=2 + + - name: Run test low latency for hidden + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 1913 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --hidden=2048 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --hidden=4096 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --hidden=6144 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --hidden=7168 + + - name: Run test low latency for topk + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 1969 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-topk=4 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-topk=16 + + - name: Run test low latency for experts + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 7481 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-processes=2 --num-topk=1 --num-experts=2 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-experts=1024 + + - name: Run test low latency for drop percent + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 1913 + MOE_ENABLE_TOPK_NEG_ONE: 1 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --drop-percent=0.3 + + - name: Run test low latency for dynamic tokens + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 2300 + MOE_ENABLE_TOPK_NEG_ONE: 1 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --enable-dynamic-tokens + - name: Run test base fused deep moe timeout-minutes: 10 env: @@ -210,25 +560,348 @@ jobs: python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --num-tokens 2 --topk-drop-col 3 --num-experts 16 python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --num-tokens 4 --topk-drop-col 1 --num-experts 32 + - name: Run test fused deepep moe for little processes + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 1000 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --num-processes=2 --num-experts=24 + + - name: Run test fused deepep moe for bs + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 1913 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --num-tokens=256 + + - name: Run test fused deepep moe for hidden + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 1913 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --hidden=2048 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --hidden=7168 + + - name: Run test fused deepep moe for topk + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 3000 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --num-topk=1 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --num-topk=12 + + - name: Run test fused deepep moe for experts + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 3000 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --num-processes=2 --num-topk=1 --num-experts=4 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_fused_deep_moe.py --num-experts=384 + - name: Run test mixed running timeout-minutes: 10 env: HCCL_BUFFSIZE: 3000 - MOE_ENABLE_TOPK_NEG_ONE: 1 run: | python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py + - name: Run test mixed running for little processes + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 2241 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --num-processes=2 --test-loop=100 + + - name: Run test mixed running for bs + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 4065 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --normal-num-tokens=1 --low-latency-num-tokens=1 --test-loop=100 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --normal-num-tokens=8192 --low-latency-num-tokens=512 --test-loop=100 + + - name: Run test mixed running for hidden + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 2241 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --hidden=2048 --test-loop=100 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --hidden=4096 --test-loop=100 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --hidden=6144 --test-loop=100 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --hidden=7168 --test-loop=100 + + - name: Run test mixed running for topk + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 4065 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --num-topk=1 --test-loop=100 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --num-topk=16 --test-loop=100 + + - name: Run test mixed running for dynamic tokens + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 3769 + MOE_ENABLE_TOPK_NEG_ONE: 1 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --test-loop=100 --enable-dynamic-tokens + - name: Run test generalization of fused deep moe timeout-minutes: 10 env: HCCL_BUFFSIZE: 2048 run: bash scripts/generalization_test_fused_deep_moe.sh + test-build-deepep-a2: + if: (github.repository == 'sgl-project/sgl-kernel-npu' || github.event_name == 'pull_request') && + github.event.pull_request.draft == false + runs-on: linux-aarch64-a2-8 + container: + image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-910b-ubuntu22.04-py3.11 + steps: + - name: Clean git config + run: | + CONFIG_KEY='http.https://gh-proxy.test.osinfra.cn/.extraheader' + git config --global --unset "$CONFIG_KEY" || true + + - name: Clean workspace + run: | + sudo rm -rf --one-file-system "$GITHUB_WORKSPACE"/* "$GITHUB_WORKSPACE"/.* 2>/dev/null || true + + - name: Checkout code + uses: actions/checkout@v4 + with: + clean: true + + - name: Install dependencies + run: | + # speed up by using infra cache services + CACHING_URL="cache-service.nginx-pypi-cache.svc.cluster.local" + sed -Ei "s@(ports|archive).ubuntu.com@${CACHING_URL}:8081@g" /etc/apt/sources.list + pip config set global.index-url http://${CACHING_URL}/pypi/simple + pip config set global.trusted-host ${CACHING_URL} + + bash scripts/npu_ci_install_dependency.sh + + - name: Prepare Deepep + run: bash scripts/prepare_deepep_in_container.sh -a deepep2 + + - name: Run test intranode + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 3000 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --num-processes=8 + + - name: Run test intranode for little bs + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 2300 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --num-tokens=1 --num-processes=8 + + - name: Run test intranode for big bs + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 4065 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --num-tokens=7168 --num-processes=8 + + - name: Run test little processes intranode + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 3000 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --num-processes=2 + + - name: Run test hidden intranode + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 3000 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --hidden=2048 --num-processes=8 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --hidden=4096 --num-processes=8 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --hidden=6144 --num-processes=8 + + - name: Run test topk num intranode + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 4500 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --num-topk=1 --num-processes=8 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --num-topk=16 --num-processes=8 + + - name: Run test experts num intranode + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 3000 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --num-processes=2 --num-topk=1 --num-experts=2 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --num-experts=512 --num-processes=8 + + - name: Run test intranode for active ranks + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 3000 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --num-processes=8 --active-ranks="0,1,3" + + - name: Run test intranode for DeepXtrace + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 3000 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --enable-diagnose --num-processes=8 + + - name: Run test intranode for int8 quant + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 3000 + DEEP_NORMAL_MODE_USE_INT8_QUANT: 1 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --num-processes=8 + + - name: Run test intranode for output parameters of different types + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 3000 + MOE_EXPERT_TOKEN_NUMS_TYPE: 0 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --num-processes=8 + + - name: Run test intranode for dynamic tokens + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 3000 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_intranode.py --enable-dynamic-tokens --num-processes=8 + + - name: Run test low latency + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 1913 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-processes=8 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-tokens=1 --num-processes=8 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-tokens=2 --num-processes=8 + + - name: Run test low latency for big bs + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 3825 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-processes=8 --num-tokens=512 + + - name: Run test low latency for little num processes + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 1913 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-processes=2 + + - name: Run test low latency for hidden + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 1913 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --hidden=2048 --num-processes=8 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --hidden=4096 --num-processes=8 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --hidden=6144 --num-processes=8 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --hidden=7168 --num-processes=8 + + - name: Run test low latency for topk + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 1969 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-topk=4 --num-processes=8 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-topk=16 --num-processes=8 + + - name: Run test low latency for experts + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 7481 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-processes=2 --num-topk=1 --num-experts=2 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-experts=1024 --num-processes=8 + + - name: Run test low latency for drop percent + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 1913 + MOE_ENABLE_TOPK_NEG_ONE: 1 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --drop-percent=0.3 --num-processes=8 + + - name: Run test low latency for dynamic tokens + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 2300 + MOE_ENABLE_TOPK_NEG_ONE: 1 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_low_latency.py --num-processes=8 --enable-dynamic-tokens + + - name: Run test mixed running + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 3000 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --num-processes=8 + + - name: Run test mixed running for little processes + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 3000 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --num-processes=2 --test-loop=100 + + - name: Run test mixed running for bs + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 5000 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --normal-num-tokens=1 --low-latency-num-tokens=1 --test-loop=100 --num-processes=8 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --normal-num-tokens=8000 --low-latency-num-tokens=488 --test-loop=100 --num-processes=8 + + - name: Run test mixed running for hidden + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 3000 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --hidden=2048 --test-loop=100 --num-processes=8 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --hidden=4096 --test-loop=100 --num-processes=8 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --hidden=6144 --test-loop=100 --num-processes=8 + + - name: Run test mixed running for topk + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 5000 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --num-topk=1 --test-loop=100 --num-processes=8 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --num-topk=16 --test-loop=100 --num-processes=8 + + - name: Run test mixed running for experts + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 3769 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --num-processes=2 --num-topk=1 --num-experts=2 --test-loop=100 + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --num-experts=512 --test-loop=100 --num-processes=8 + + - name: Run test mixed running for dynamic tokens + timeout-minutes: 10 + env: + HCCL_BUFFSIZE: 3769 + MOE_ENABLE_TOPK_NEG_ONE: 1 + run: | + python3 $GITHUB_WORKSPACE/tests/python/deepep/test_normal_and_low_latency.py --test-loop=100 --enable-dynamic-tokens --num-processes=8 + finish: if: always() needs: - test-all-build - - test-build-deepep + - test-build-deepep-a3 + - test-build-deepep-a2 runs-on: ubuntu-latest steps: - name: Check all dependent job statuses diff --git a/tests/python/deepep/test_internode.py b/tests/python/deepep/test_internode.py index 5c519fa6e..182956949 100644 --- a/tests/python/deepep/test_internode.py +++ b/tests/python/deepep/test_internode.py @@ -38,24 +38,28 @@ def test_main( base_num_tokens, hidden = args.num_tokens, args.hidden num_topk, num_experts = args.num_topk, args.num_experts enable_diagnose = args.enable_diagnose + enable_dynamic_tokens = args.enable_dynamic_tokens num_servers = num_ranks // num_local_ranks num_nodes = num_servers expert_token_nums_type = int(os.getenv("MOE_EXPERT_TOKEN_NUMS_TYPE", 1)) - fluctuation_percentage = 0.1 - min_fluctuation = 2 + if enable_dynamic_tokens: + fluctuation_percentage = 0.1 + min_fluctuation = 2 - if base_num_tokens < 10: - fluctuation = random.randint(-min_fluctuation, min_fluctuation) - num_tokens = base_num_tokens + fluctuation - else: - fluctuation = random.uniform( - 1 - fluctuation_percentage, 1 + fluctuation_percentage - ) - num_tokens = int(base_num_tokens * fluctuation) + if base_num_tokens < 10: + fluctuation = random.randint(-min_fluctuation, min_fluctuation) + num_tokens = base_num_tokens + fluctuation + else: + fluctuation = random.uniform( + 1 - fluctuation_percentage, 1 + fluctuation_percentage + ) + num_tokens = int(base_num_tokens * fluctuation) - # Ensure num_tokens is at least 1 - num_tokens = max(num_tokens, 1) + # Ensure num_tokens is at least 1 + num_tokens = max(num_tokens, 1) + else: + num_tokens = base_num_tokens assert num_experts % num_ranks == 0 and num_nodes >= 2 assert num_tokens <= MAX_BATCH_SIZE @@ -659,6 +663,11 @@ def test_loop(local_rank: int, num_local_ranks: int, args: argparse.Namespace): default=-1, help="If >=0, drop this specific top-k column (set index to -1 for testing).", ) + parser.add_argument( + "--enable-dynamic-tokens", + action="store_true", + help="Whether to enable dynamic tokens for testing", + ) args = parser.parse_args() num_processes = args.num_processes diff --git a/tests/python/deepep/test_intranode.py b/tests/python/deepep/test_intranode.py index 265f99384..2e0830b25 100644 --- a/tests/python/deepep/test_intranode.py +++ b/tests/python/deepep/test_intranode.py @@ -34,23 +34,27 @@ def test_main( base_num_tokens, hidden = args.num_tokens, args.hidden num_topk, num_experts = args.num_topk, args.num_experts enable_diagnose = args.enable_diagnose + enable_dynamic_tokens = args.enable_dynamic_tokens num_servers = num_ranks // num_local_ranks expert_token_nums_type = int(os.getenv("MOE_EXPERT_TOKEN_NUMS_TYPE", 1)) - fluctuation_percentage = 0.1 - min_fluctuation = 2 + if enable_dynamic_tokens: + fluctuation_percentage = 0.1 + min_fluctuation = 2 - if base_num_tokens < 10: - fluctuation = random.randint(-min_fluctuation, min_fluctuation) - num_tokens = base_num_tokens + fluctuation - else: - fluctuation = random.uniform( - 1 - fluctuation_percentage, 1 + fluctuation_percentage - ) - num_tokens = int(base_num_tokens * fluctuation) + if base_num_tokens < 10: + fluctuation = random.randint(-min_fluctuation, min_fluctuation) + num_tokens = base_num_tokens + fluctuation + else: + fluctuation = random.uniform( + 1 - fluctuation_percentage, 1 + fluctuation_percentage + ) + num_tokens = int(base_num_tokens * fluctuation) - # Ensure num_tokens is at least 1 - num_tokens = max(num_tokens, 1) + # Ensure num_tokens is at least 1 + num_tokens = max(num_tokens, 1) + else: + num_tokens = base_num_tokens assert num_experts % num_ranks == 0 if local_rank == 0: @@ -543,6 +547,11 @@ def test_loop(local_rank: int, num_local_ranks: int, args: argparse.Namespace): default=-1, help="If >=0, drop this specific top-k column (set index to -1 for testing).", ) + parser.add_argument( + "--enable-dynamic-tokens", + action="store_true", + help="Whether to enable dynamic tokens for testing", + ) args = parser.parse_args() num_processes = args.num_processes diff --git a/tests/python/deepep/test_low_latency.py b/tests/python/deepep/test_low_latency.py index b2403cfa9..aa5e3e2c1 100644 --- a/tests/python/deepep/test_low_latency.py +++ b/tests/python/deepep/test_low_latency.py @@ -340,22 +340,28 @@ def test_loop(local_rank: int, num_local_ranks: int, args: argparse.Namespace): use_experts = num_experts if shared_expert_rank_num == 0 else (num_experts - 1) use_ranks = num_ranks - shared_expert_rank_num drop_percent = args.drop_percent + enable_dynamic_tokens = args.enable_dynamic_tokens - fluctuation_percentage = 0.1 - min_fluctuation = 2 + if enable_dynamic_tokens: + fluctuation_percentage = 0.1 + min_fluctuation = 2 - if base_num_tokens < 10: - fluctuation = random.randint(-min_fluctuation, min_fluctuation) - num_tokens = base_num_tokens + fluctuation - else: - fluctuation = random.uniform( - 1 - fluctuation_percentage, 1 + fluctuation_percentage - ) - num_tokens = int(base_num_tokens * fluctuation) + if base_num_tokens < 10: + fluctuation = random.randint(-min_fluctuation, min_fluctuation) + num_tokens = base_num_tokens + fluctuation + else: + fluctuation = random.uniform( + 1 - fluctuation_percentage, 1 + fluctuation_percentage + ) + num_tokens = int(base_num_tokens * fluctuation) - raw_num_tokens = max(num_tokens, 1) + raw_num_tokens = max(num_tokens, 1) + else: + raw_num_tokens = base_num_tokens - local_tokens_tensor = torch.tensor([num_tokens], dtype=torch.int32, device="npu") + local_tokens_tensor = torch.tensor( + [raw_num_tokens], dtype=torch.int32, device="npu" + ) dist.all_reduce(local_tokens_tensor, op=dist.ReduceOp.MAX) aligned_num_tokens = local_tokens_tensor.item() @@ -454,6 +460,11 @@ def test_loop(local_rank: int, num_local_ranks: int, args: argparse.Namespace): default=0.0, help="Percentage of dropping an individual top-k index (set to -1). ", ) + parser.add_argument( + "--enable-dynamic-tokens", + action="store_true", + help="Whether to enable dynamic tokens for testing", + ) args = parser.parse_args() num_processes = args.num_processes diff --git a/tests/python/deepep/test_normal_and_low_latency.py b/tests/python/deepep/test_normal_and_low_latency.py index 3f42444f5..545fe3752 100644 --- a/tests/python/deepep/test_normal_and_low_latency.py +++ b/tests/python/deepep/test_normal_and_low_latency.py @@ -135,6 +135,7 @@ def low_latency_test( def test_loop(local_rank: int, num_local_ranks: int, args: argparse.Namespace): rank, num_ranks, group = init_dist(local_rank, num_local_ranks) num_topk, num_experts, hidden = args.num_topk, args.num_experts, args.hidden + enable_dynamic_tokens = args.enable_dynamic_tokens assert num_experts % num_ranks == 0 torch.manual_seed(rank) @@ -146,17 +147,20 @@ def test_loop(local_rank: int, num_local_ranks: int, args: argparse.Namespace): fluctuation_percentage = 0.1 min_fluctuation = 2 - if base_normal_num_tokens < 10: - fluctuation = random.randint(-min_fluctuation, min_fluctuation) - normal_num_tokens = base_normal_num_tokens + fluctuation + if enable_dynamic_tokens: + if base_normal_num_tokens < 10: + fluctuation = random.randint(-min_fluctuation, min_fluctuation) + normal_num_tokens = base_normal_num_tokens + fluctuation + else: + fluctuation = random.uniform( + 1 - fluctuation_percentage, 1 + fluctuation_percentage + ) + normal_num_tokens = int(base_normal_num_tokens * fluctuation) + + # Ensure normal_num_tokens is at least 1 + normal_num_tokens = max(normal_num_tokens, 1) else: - fluctuation = random.uniform( - 1 - fluctuation_percentage, 1 + fluctuation_percentage - ) - normal_num_tokens = int(base_normal_num_tokens * fluctuation) - - # Ensure normal_num_tokens is at least 1 - normal_num_tokens = max(normal_num_tokens, 1) + normal_num_tokens = base_normal_num_tokens if local_rank == 0: print(f"Start executing normal test loop {i} ...", flush=True) @@ -172,17 +176,20 @@ def test_loop(local_rank: int, num_local_ranks: int, args: argparse.Namespace): base_low_latency_num_tokens = args.low_latency_num_tokens - if base_low_latency_num_tokens < 10: - fluctuation = random.randint(-min_fluctuation, min_fluctuation) - low_latency_num_tokens = base_low_latency_num_tokens + fluctuation + if enable_dynamic_tokens: + if base_low_latency_num_tokens < 10: + fluctuation = random.randint(-min_fluctuation, min_fluctuation) + low_latency_num_tokens = base_low_latency_num_tokens + fluctuation + else: + fluctuation = random.uniform( + 1 - fluctuation_percentage, 1 + fluctuation_percentage + ) + low_latency_num_tokens = int(base_low_latency_num_tokens * fluctuation) + + # Ensure low_latency_num_tokens is at least 1 + low_latency_num_tokens = max(low_latency_num_tokens, 1) else: - fluctuation = random.uniform( - 1 - fluctuation_percentage, 1 + fluctuation_percentage - ) - low_latency_num_tokens = int(base_low_latency_num_tokens * fluctuation) - - # Ensure low_latency_num_tokens is at least 1 - low_latency_num_tokens = max(low_latency_num_tokens, 1) + low_latency_num_tokens = base_low_latency_num_tokens local_tokens_tensor = torch.tensor( [low_latency_num_tokens], dtype=torch.int32, device="npu" @@ -246,6 +253,11 @@ def test_loop(local_rank: int, num_local_ranks: int, args: argparse.Namespace): default=1000, help="Number of test loop (default: 1000)", ) + parser.add_argument( + "--enable-dynamic-tokens", + action="store_true", + help="Whether to enable dynamic tokens for testing", + ) args = parser.parse_args()