diff --git a/.github/workflows/pr-test-amd.yml b/.github/workflows/pr-test-amd.yml index 5b53e2ae255c..55c00d532349 100644 --- a/.github/workflows/pr-test-amd.yml +++ b/.github/workflows/pr-test-amd.yml @@ -713,23 +713,23 @@ jobs: - name: Benchmark single latency timeout-minutes: 20 run: | - bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_small - bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_default + bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_one_batch_1gpu.TestBenchOneBatch1GPU.test_bs1_small + bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_one_batch_1gpu.TestBenchOneBatch1GPU.test_bs1_default - name: Benchmark online latency timeout-minutes: 15 run: | - bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_default + bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_1gpu_part1.TestBenchServing1GPUPart1.test_online_latency_default - name: Benchmark offline throughput timeout-minutes: 15 run: | - bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default + bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_1gpu_part1.TestBenchServing1GPUPart1.test_offline_throughput_default - name: Benchmark offline throughput (Non-streaming, small batch size) timeout-minutes: 15 run: | - bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_non_stream_small_batch_size + bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_1gpu_part1.TestBenchServing1GPUPart1.test_offline_throughput_non_stream_small_batch_size performance-test-1-gpu-part-2-amd: needs: [check-changes, stage-a-test-1-amd] @@ -768,17 +768,17 @@ jobs: - name: Benchmark offline throughput (w/o RadixAttention) timeout-minutes: 15 run: | - bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_without_radix_cache + bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_1gpu_part1.TestBenchServing1GPUPart1.test_offline_throughput_without_radix_cache - name: Benchmark offline throughput (w/ Triton) timeout-minutes: 15 run: | - bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_with_triton_attention_backend + bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_1gpu_part1.TestBenchServing1GPUPart1.test_offline_throughput_with_triton_attention_backend - name: Benchmark offline throughput (w/ FP8) timeout-minutes: 15 run: | - bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default_fp8 + bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_1gpu_large.TestBenchServing1GPULarge.test_offline_throughput_default_fp8 performance-test-2-gpu-amd: needs: [check-changes, stage-a-test-1-amd] @@ -822,32 +822,32 @@ jobs: - name: Benchmark single latency (TP=2) timeout-minutes: 25 run: | - bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1 + bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_one_batch_2gpu.TestBenchOneBatch2GPU.test_moe_tp2_bs1 - name: Benchmark single latency + torch.compile (TP=2) timeout-minutes: 25 run: | - bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_torch_compile_tp2_bs1 + bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_one_batch_2gpu.TestBenchOneBatch2GPU.test_torch_compile_tp2_bs1 - name: Benchmark offline throughput (TP=2) timeout-minutes: 25 run: | - bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_default + bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_2gpu.TestBenchServing2GPU.test_moe_offline_throughput_default - name: Benchmark offline throughput (w/o RadixAttention) (TP=2) timeout-minutes: 25 run: | - bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_without_radix_cache + bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_2gpu.TestBenchServing2GPU.test_moe_offline_throughput_without_radix_cache - name: Benchmark offline PP decode throughput (PP=2) timeout-minutes: 10 run: | - bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_pp_offline_throughput_default_decode + bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_2gpu.TestBenchServing2GPU.test_pp_offline_throughput_default_decode - name: Benchmark offline PP prefill throughput (PP=2) timeout-minutes: 10 run: | - bash scripts/ci/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_pp_long_context_prefill + bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_2gpu.TestBenchServing2GPU.test_pp_long_context_prefill accuracy-test-1-gpu-amd: needs: [check-changes, stage-a-test-1-amd] @@ -886,7 +886,7 @@ jobs: - name: Evaluate Accuracy timeout-minutes: 30 run: | - bash scripts/ci/amd_ci_exec.sh -e SGLANG_USE_AITER=0 python3 test_eval_accuracy_large.py + bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/eval -e SGLANG_USE_AITER=0 python3 test_eval_accuracy_large.py accuracy-test-2-gpu-amd: needs: [check-changes, accuracy-test-1-gpu-amd] @@ -926,7 +926,7 @@ jobs: - name: Evaluate accuracy (TP=2) timeout-minutes: 30 run: | - bash scripts/ci/amd_ci_exec.sh -e SGLANG_USE_AITER_AR=0 -e SGLANG_USE_AITER=0 -e HF_HUB_ENABLE_HF_TRANSFER=0 python3 test_moe_eval_accuracy_large.py + bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/eval -e SGLANG_USE_AITER_AR=0 -e SGLANG_USE_AITER=0 -e HF_HUB_ENABLE_HF_TRANSFER=0 python3 test_moe_eval_accuracy_large.py pr-test-amd-finish: needs: diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 1d71b283ec08..2441ac428764 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -186,6 +186,7 @@ jobs: needs: [check-changes, call-gate] if: needs.check-changes.outputs.sgl_kernel == 'true' runs-on: x64-kernel-build-node + timeout-minutes: 60 strategy: matrix: include: @@ -233,6 +234,7 @@ jobs: needs: [check-changes, call-gate] if: needs.check-changes.outputs.sgl_kernel == 'true' runs-on: arm-kernel-build-node + timeout-minutes: 60 strategy: matrix: include: @@ -283,6 +285,7 @@ jobs: !inputs.target_stage && needs.check-changes.outputs.sgl_kernel == 'true' runs-on: 1-gpu-runner + timeout-minutes: 60 env: RUNNER_LABELS: 1-gpu-runner steps: @@ -319,6 +322,7 @@ jobs: !inputs.target_stage && needs.check-changes.outputs.sgl_kernel == 'true' runs-on: 1-gpu-runner + timeout-minutes: 60 env: RUNNER_LABELS: 1-gpu-runner steps: @@ -355,6 +359,7 @@ jobs: !inputs.target_stage && needs.check-changes.outputs.sgl_kernel == 'true' runs-on: 1-gpu-runner + timeout-minutes: 60 env: CI: true RUNNER_LABELS: 1-gpu-runner @@ -404,6 +409,7 @@ jobs: !inputs.target_stage && needs.check-changes.outputs.sgl_kernel == 'true' runs-on: ${{ needs.check-changes.outputs.b200_runner }} + timeout-minutes: 60 env: RUNNER_LABELS: ${{ needs.check-changes.outputs.b200_runner }} steps: @@ -473,6 +479,7 @@ jobs: !inputs.target_stage && needs.check-changes.outputs.jit_kernel == 'true' runs-on: 1-gpu-runner + timeout-minutes: 60 env: RUNNER_LABELS: 1-gpu-runner steps: @@ -506,6 +513,7 @@ jobs: ) ) runs-on: 1-gpu-runner + timeout-minutes: 60 env: RUNNER_LABELS: 1-gpu-runner steps: @@ -552,6 +560,7 @@ jobs: ) ) runs-on: ubuntu-latest + timeout-minutes: 60 steps: - name: Free disk space run: | @@ -597,6 +606,7 @@ jobs: ) ) runs-on: 1-gpu-5090 + timeout-minutes: 60 env: RUNNER_LABELS: 1-gpu-5090 IS_BLACKWELL: "1" @@ -650,6 +660,7 @@ jobs: ) ) runs-on: 1-gpu-runner + timeout-minutes: 60 env: RUNNER_LABELS: 1-gpu-runner strategy: @@ -699,6 +710,7 @@ jobs: ) ) runs-on: 2-gpu-runner + timeout-minutes: 60 env: RUNNER_LABELS: 2-gpu-runner strategy: @@ -734,24 +746,23 @@ jobs: fi python3 run_suite.py --hw cuda --suite stage-b-test-large-2-gpu --auto-partition-id ${{ matrix.partition }} --auto-partition-size 2 $CONTINUE_ON_ERROR_FLAG - stage-b-test-4-gpu-b200: + stage-b-test-small-1-gpu-performance: needs: [check-changes, call-gate, stage-a-test-1, sgl-kernel-build-wheels] if: | always() && ( - (inputs.target_stage == 'stage-b-test-4-gpu-b200') || + (inputs.target_stage == 'stage-b-test-small-1-gpu-performance') || ( !inputs.target_stage && (github.event_name == 'schedule' || (!failure() && !cancelled())) && ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) ) ) - runs-on: ${{ needs.check-changes.outputs.b200_runner }} + runs-on: 1-gpu-5090 + timeout-minutes: 60 env: - RUNNER_LABELS: ${{ needs.check-changes.outputs.b200_runner }} - strategy: - fail-fast: false - + RUNNER_LABELS: 1-gpu-5090 + IS_BLACKWELL: "1" steps: - name: Checkout code uses: actions/checkout@v4 @@ -760,7 +771,7 @@ jobs: - name: Download artifacts if: needs.check-changes.outputs.sgl_kernel == 'true' - uses: actions/download-artifact@v6 + uses: actions/download-artifact@v4 with: path: sgl-kernel/dist/ merge-multiple: true @@ -769,33 +780,40 @@ jobs: - name: Install dependencies timeout-minutes: 10 run: | - CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} IS_BLACKWELL=1 bash scripts/ci/ci_install_dependency.sh + source /etc/profile.d/sglang-ci.sh + CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh - name: Run test timeout-minutes: 30 run: | - cd test + source /etc/profile.d/sglang-ci.sh + cd test/ CONTINUE_ON_ERROR_FLAG="" if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then CONTINUE_ON_ERROR_FLAG="--continue-on-error" fi - IS_BLACKWELL=1 python3 run_suite.py --hw cuda --suite stage-b-test-4-gpu-b200 $CONTINUE_ON_ERROR_FLAG + python3 run_suite.py --hw cuda --suite stage-b-test-small-1-gpu-performance $CONTINUE_ON_ERROR_FLAG - stage-c-test-large-4-gpu: - needs: [check-changes, call-gate, stage-b-test-small-1-gpu, stage-b-test-large-1-gpu, stage-b-test-large-2-gpu, sgl-kernel-build-wheels] + stage-b-test-large-1-gpu-performance: + needs: [check-changes, call-gate, stage-a-test-1, sgl-kernel-build-wheels] if: | always() && ( - (inputs.target_stage == 'stage-c-test-large-4-gpu') || + (inputs.target_stage == 'stage-b-test-large-1-gpu-performance') || ( !inputs.target_stage && (github.event_name == 'schedule' || (!failure() && !cancelled())) && ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) ) ) - runs-on: 4-gpu-h100 + runs-on: 1-gpu-runner + timeout-minutes: 60 env: - RUNNER_LABELS: 4-gpu-h100 + RUNNER_LABELS: 1-gpu-runner + strategy: + fail-fast: false + matrix: + partition: [0, 1] steps: - name: Checkout code uses: actions/checkout@v4 @@ -816,30 +834,31 @@ jobs: CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh - name: Run test - timeout-minutes: 30 + timeout-minutes: 40 run: | cd test/ CONTINUE_ON_ERROR_FLAG="" if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then CONTINUE_ON_ERROR_FLAG="--continue-on-error" fi - python3 run_suite.py --hw cuda --suite stage-c-test-large-4-gpu $CONTINUE_ON_ERROR_FLAG + python3 run_suite.py --hw cuda --suite stage-b-test-large-1-gpu-performance --auto-partition-id ${{ matrix.partition }} --auto-partition-size 2 --timeout-per-file 1800 $CONTINUE_ON_ERROR_FLAG - stage-c-test-large-4-gpu-b200: - needs: [check-changes, call-gate, stage-b-test-small-1-gpu, stage-b-test-large-1-gpu, stage-b-test-large-2-gpu, stage-b-test-4-gpu-b200, sgl-kernel-build-wheels] + stage-b-test-large-2-gpu-performance: + needs: [check-changes, call-gate, stage-a-test-1, sgl-kernel-build-wheels] if: | always() && ( - (inputs.target_stage == 'stage-c-test-large-4-gpu-b200') || + (inputs.target_stage == 'stage-b-test-large-2-gpu-performance') || ( !inputs.target_stage && (github.event_name == 'schedule' || (!failure() && !cancelled())) && ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) ) ) - runs-on: ${{ needs.check-changes.outputs.b200_runner }} + runs-on: 2-gpu-runner + timeout-minutes: 60 env: - RUNNER_LABELS: ${{ needs.check-changes.outputs.b200_runner }} + RUNNER_LABELS: 2-gpu-runner steps: - name: Checkout code uses: actions/checkout@v4 @@ -848,7 +867,7 @@ jobs: - name: Download artifacts if: needs.check-changes.outputs.sgl_kernel == 'true' - uses: actions/download-artifact@v6 + uses: actions/download-artifact@v4 with: path: sgl-kernel/dist/ merge-multiple: true @@ -857,81 +876,35 @@ jobs: - name: Install dependencies timeout-minutes: 10 run: | - CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} IS_BLACKWELL=1 bash scripts/ci/ci_install_dependency.sh + CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh - name: Run test timeout-minutes: 30 run: | cd test/ - IS_BLACKWELL=1 python3 run_suite.py --hw cuda --suite stage-c-test-large-4-gpu-b200 - - multimodal-gen-test-1-gpu: - needs: [check-changes, call-gate, sgl-kernel-build-wheels] - if: | - always() && - ( - (inputs.target_stage == 'multimodal-gen-test-1-gpu') || - ( - !inputs.target_stage && - (github.event_name == 'schedule' || (!failure() && !cancelled())) && - needs.check-changes.outputs.multimodal_gen == 'true' - ) - ) - runs-on: 1-gpu-runner - strategy: - fail-fast: false - matrix: - part: [0, 1] - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} - - - name: Download artifacts - if: needs.check-changes.outputs.sgl_kernel == 'true' - uses: actions/download-artifact@v4 - with: - path: sgl-kernel/dist/ - merge-multiple: true - pattern: wheel-python3.10-cuda12.9 - - - name: Install dependencies - timeout-minutes: 10 - run: | - CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh diffusion - - name: Run diffusion server tests - timeout-minutes: 60 - run: | - cd python CONTINUE_ON_ERROR_FLAG="" if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then CONTINUE_ON_ERROR_FLAG="--continue-on-error" fi - python3 sglang/multimodal_gen/test/run_suite.py \ - --suite 1-gpu \ - --partition-id ${{ matrix.part }} \ - --total-partitions 2 \ - $CONTINUE_ON_ERROR_FLAG - + python3 run_suite.py --hw cuda --suite stage-b-test-large-2-gpu-performance $CONTINUE_ON_ERROR_FLAG - multimodal-gen-test-2-gpu: - needs: [check-changes, call-gate, sgl-kernel-build-wheels] + stage-b-test-small-1-gpu-accuracy: + needs: [check-changes, call-gate, stage-a-test-1, sgl-kernel-build-wheels] if: | always() && ( - (inputs.target_stage == 'multimodal-gen-test-2-gpu') || + (inputs.target_stage == 'stage-b-test-small-1-gpu-accuracy') || ( !inputs.target_stage && (github.event_name == 'schedule' || (!failure() && !cancelled())) && - needs.check-changes.outputs.multimodal_gen == 'true' + ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) ) ) - runs-on: 2-gpu-runner - strategy: - fail-fast: false - matrix: - part: [0, 1] + runs-on: 1-gpu-5090 + timeout-minutes: 60 + env: + RUNNER_LABELS: 1-gpu-5090 + IS_BLACKWELL: "1" steps: - name: Checkout code uses: actions/checkout@v4 @@ -949,87 +922,39 @@ jobs: - name: Install dependencies timeout-minutes: 10 run: | - CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh diffusion + source /etc/profile.d/sglang-ci.sh + CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh + git clone https://github.com/merrymercy/human-eval.git + cd human-eval + pip install -e . - - name: Run diffusion server tests - timeout-minutes: 60 + - name: Run test + timeout-minutes: 25 run: | - cd python + source /etc/profile.d/sglang-ci.sh + cd test/ CONTINUE_ON_ERROR_FLAG="" if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then CONTINUE_ON_ERROR_FLAG="--continue-on-error" fi - python3 sglang/multimodal_gen/test/run_suite.py \ - --suite 2-gpu \ - --partition-id ${{ matrix.part }} \ - --total-partitions 2 \ - $CONTINUE_ON_ERROR_FLAG + python3 run_suite.py --hw cuda --suite stage-b-test-small-1-gpu-accuracy $CONTINUE_ON_ERROR_FLAG - quantization-test: - needs: [check-changes, call-gate, stage-a-test-1] - if: | - always() && - ( - (inputs.target_stage == 'quantization-test') || - ( - !inputs.target_stage && - (github.event_name == 'schedule' || (!failure() && !cancelled())) && - ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) - ) - ) - runs-on: 1-gpu-runner - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} - - - name: Download artifacts - if: needs.check-changes.outputs.sgl_kernel == 'true' - uses: actions/download-artifact@v4 - with: - path: sgl-kernel/dist/ - merge-multiple: true - pattern: wheel-python3.10-cuda12.9 - - - name: Install dependencies - timeout-minutes: 10 - run: | - CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh - - - name: Run test - timeout-minutes: 30 - run: | - cd test/srt - RETRY_FLAG="" - if [[ "${{ needs.check-changes.outputs.enable_retry }}" == "true" ]]; then - RETRY_FLAG="--enable-retry" - fi - CONTINUE_ON_ERROR_FLAG="" - if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then - CONTINUE_ON_ERROR_FLAG="--continue-on-error" - fi - python3 run_suite.py --suite quantization_test $RETRY_FLAG $CONTINUE_ON_ERROR_FLAG - - unit-test-backend-4-gpu: - needs: [check-changes, call-gate, stage-b-test-small-1-gpu, stage-b-test-large-1-gpu, stage-b-test-large-2-gpu, stage-b-test-4-gpu-b200] + stage-b-test-large-2-gpu-accuracy: + needs: [check-changes, call-gate, stage-a-test-1, sgl-kernel-build-wheels] if: | always() && ( - (inputs.target_stage == 'unit-test-backend-4-gpu') || + (inputs.target_stage == 'stage-b-test-large-2-gpu-accuracy') || ( !inputs.target_stage && (github.event_name == 'schedule' || (!failure() && !cancelled())) && ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) ) ) - runs-on: 4-gpu-h100 + runs-on: 2-gpu-runner + timeout-minutes: 60 env: - RUNNER_LABELS: 4-gpu-h100 - strategy: - fail-fast: false - matrix: - part: [0, 1, 2] + RUNNER_LABELS: 2-gpu-runner steps: - name: Checkout code uses: actions/checkout@v4 @@ -1048,40 +973,39 @@ jobs: timeout-minutes: 10 run: | CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh + git clone https://github.com/merrymercy/human-eval.git + cd human-eval + pip install -e . - name: Run test - timeout-minutes: 20 + timeout-minutes: 25 run: | - cd test/srt - RETRY_FLAG="" - if [[ "${{ needs.check-changes.outputs.enable_retry }}" == "true" ]]; then - RETRY_FLAG="--enable-retry" - fi + cd test/ CONTINUE_ON_ERROR_FLAG="" if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then CONTINUE_ON_ERROR_FLAG="--continue-on-error" fi - python3 run_suite.py --suite per-commit-4-gpu --auto-partition-id ${{ matrix.part }} --auto-partition-size 3 $RETRY_FLAG $CONTINUE_ON_ERROR_FLAG + python3 run_suite.py --hw cuda --suite stage-b-test-large-2-gpu-accuracy $CONTINUE_ON_ERROR_FLAG - unit-test-backend-8-gpu-h200: - needs: [check-changes, call-gate, stage-b-test-small-1-gpu, stage-b-test-large-1-gpu, stage-b-test-large-2-gpu, stage-b-test-4-gpu-b200] + stage-b-test-4-gpu-b200: + needs: [check-changes, call-gate, stage-a-test-1, sgl-kernel-build-wheels] if: | always() && ( - (inputs.target_stage == 'unit-test-backend-8-gpu-h200') || + (inputs.target_stage == 'stage-b-test-4-gpu-b200') || ( !inputs.target_stage && (github.event_name == 'schedule' || (!failure() && !cancelled())) && ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) ) ) - runs-on: 8-gpu-h200 + runs-on: ${{ needs.check-changes.outputs.b200_runner }} + timeout-minutes: 60 env: - RUNNER_LABELS: 8-gpu-h200 + RUNNER_LABELS: ${{ needs.check-changes.outputs.b200_runner }} strategy: fail-fast: false - matrix: - part: [0, 1, 2, 3] + steps: - name: Checkout code uses: actions/checkout@v4 @@ -1090,7 +1014,7 @@ jobs: - name: Download artifacts if: needs.check-changes.outputs.sgl_kernel == 'true' - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v6 with: path: sgl-kernel/dist/ merge-multiple: true @@ -1099,48 +1023,34 @@ jobs: - name: Install dependencies timeout-minutes: 10 run: | - CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh - - # - name: Warmup Weights and JIT Compilation - # timeout-minutes: 20 - # run: | - # # An example command for testing the warmup. TODO: make this more general and move them to python scripts. - # python3 -m sglang.compile_deep_gemm --model deepseek-ai/DeepSeek-V3-0324 --tp 8 --trust-remote-code + CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} IS_BLACKWELL=1 bash scripts/ci/ci_install_dependency.sh - name: Run test - timeout-minutes: 20 + timeout-minutes: 30 run: | - cd test/srt - RETRY_FLAG="" - if [[ "${{ needs.check-changes.outputs.enable_retry }}" == "true" ]]; then - RETRY_FLAG="--enable-retry" - fi + cd test CONTINUE_ON_ERROR_FLAG="" if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then CONTINUE_ON_ERROR_FLAG="--continue-on-error" fi - python3 run_suite.py --suite per-commit-8-gpu-h200 --auto-partition-id ${{ matrix.part }} --auto-partition-size 4 $RETRY_FLAG $CONTINUE_ON_ERROR_FLAG + IS_BLACKWELL=1 python3 run_suite.py --hw cuda --suite stage-b-test-4-gpu-b200 $CONTINUE_ON_ERROR_FLAG - unit-test-backend-8-gpu-h20: - needs: [check-changes, call-gate, stage-b-test-small-1-gpu, stage-b-test-large-1-gpu, stage-b-test-large-2-gpu, stage-b-test-4-gpu-b200] + stage-c-test-large-4-gpu: + needs: [check-changes, call-gate, stage-b-test-small-1-gpu, stage-b-test-large-1-gpu, stage-b-test-large-2-gpu, sgl-kernel-build-wheels] if: | always() && ( - (inputs.target_stage == 'unit-test-backend-8-gpu-h20') || + (inputs.target_stage == 'stage-c-test-large-4-gpu') || ( !inputs.target_stage && (github.event_name == 'schedule' || (!failure() && !cancelled())) && ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) ) ) - runs-on: 8-gpu-h20 + runs-on: 4-gpu-h100 + timeout-minutes: 60 env: - SGLANG_CI_RDMA_ALL_DEVICES: "mlx5_1,mlx5_2,mlx5_3,mlx5_4" - RUNNER_LABELS: 8-gpu-h20 - strategy: - fail-fast: false - matrix: - part: [0, 1] + RUNNER_LABELS: 4-gpu-h100 steps: - name: Checkout code uses: actions/checkout@v4 @@ -1158,37 +1068,34 @@ jobs: - name: Install dependencies timeout-minutes: 10 run: | - CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_deepep.sh + CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh - name: Run test - timeout-minutes: 20 + timeout-minutes: 30 run: | - cd test/srt - RETRY_FLAG="" - if [[ "${{ needs.check-changes.outputs.enable_retry }}" == "true" ]]; then - RETRY_FLAG="--enable-retry" - fi + cd test/ CONTINUE_ON_ERROR_FLAG="" if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then CONTINUE_ON_ERROR_FLAG="--continue-on-error" fi - python3 run_suite.py --suite per-commit-8-gpu-h20 --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 $RETRY_FLAG $CONTINUE_ON_ERROR_FLAG + python3 run_suite.py --hw cuda --suite stage-c-test-large-4-gpu $CONTINUE_ON_ERROR_FLAG - performance-test-1-gpu-part-1: - needs: [check-changes, call-gate, stage-a-test-1] + stage-c-test-large-4-gpu-b200: + needs: [check-changes, call-gate, stage-b-test-small-1-gpu, stage-b-test-large-1-gpu, stage-b-test-large-2-gpu, stage-b-test-4-gpu-b200, sgl-kernel-build-wheels] if: | always() && ( - (inputs.target_stage == 'performance-test-1-gpu-part-1') || + (inputs.target_stage == 'stage-c-test-large-4-gpu-b200') || ( !inputs.target_stage && (github.event_name == 'schedule' || (!failure() && !cancelled())) && ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) ) ) - runs-on: 1-gpu-runner + runs-on: ${{ needs.check-changes.outputs.b200_runner }} + timeout-minutes: 60 env: - RUNNER_LABELS: 1-gpu-runner + RUNNER_LABELS: ${{ needs.check-changes.outputs.b200_runner }} steps: - name: Checkout code uses: actions/checkout@v4 @@ -1197,7 +1104,7 @@ jobs: - name: Download artifacts if: needs.check-changes.outputs.sgl_kernel == 'true' - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v6 with: path: sgl-kernel/dist/ merge-multiple: true @@ -1206,61 +1113,32 @@ jobs: - name: Install dependencies timeout-minutes: 10 run: | - CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh - - - name: Benchmark single latency - timeout-minutes: 10 - run: | - cd test/srt - python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_small - python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_default - - - name: Benchmark online latency - timeout-minutes: 10 - run: | - cd test/srt - python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_default - - - name: Benchmark offline throughput - timeout-minutes: 10 - run: | - cd test/srt - python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default - - - name: Benchmark offline throughput (Non-streaming, small batch size) - timeout-minutes: 10 - run: | - cd test/srt - python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_non_stream_small_batch_size - - - name: Benchmark online latency (EAGLE) - timeout-minutes: 10 - run: | - cd test/srt - python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_eagle + CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} IS_BLACKWELL=1 bash scripts/ci/ci_install_dependency.sh - - name: Benchmark online latency (LoRA) - timeout-minutes: 10 + - name: Run test + timeout-minutes: 30 run: | - cd test/srt - python3 -m unittest test_bench_serving.TestBenchServing.test_lora_online_latency - python3 -m unittest test_bench_serving.TestBenchServing.test_lora_online_latency_with_concurrent_adapter_updates + cd test/ + IS_BLACKWELL=1 python3 run_suite.py --hw cuda --suite stage-c-test-large-4-gpu-b200 - performance-test-1-gpu-part-2: - needs: [check-changes, call-gate, stage-a-test-1] + multimodal-gen-test-1-gpu: + needs: [check-changes, call-gate, sgl-kernel-build-wheels] if: | always() && ( - (inputs.target_stage == 'performance-test-1-gpu-part-2') || + (inputs.target_stage == 'multimodal-gen-test-1-gpu') || ( !inputs.target_stage && (github.event_name == 'schedule' || (!failure() && !cancelled())) && - ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) + needs.check-changes.outputs.multimodal_gen == 'true' ) ) runs-on: 1-gpu-runner - env: - RUNNER_LABELS: 1-gpu-runner + timeout-minutes: 60 + strategy: + fail-fast: false + matrix: + part: [0, 1] steps: - name: Checkout code uses: actions/checkout@v4 @@ -1278,53 +1156,40 @@ jobs: - name: Install dependencies timeout-minutes: 10 run: | - CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh - - - name: Benchmark offline throughput (w/o RadixAttention) - timeout-minutes: 10 - run: | - cd test/srt - python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_without_radix_cache - - - name: Benchmark offline throughput (w/ Triton) - timeout-minutes: 10 - run: | - cd test/srt - python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_with_triton_attention_backend - - - name: Benchmark offline throughput (w/ FP8) - timeout-minutes: 10 - run: | - cd test/srt - python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default_fp8 - - - name: Benchmark VLM offline throughput - timeout-minutes: 10 + CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh diffusion + - name: Run diffusion server tests + timeout-minutes: 60 run: | - cd test/srt - python3 -m unittest test_bench_serving.TestBenchServing.test_vlm_offline_throughput + cd python + CONTINUE_ON_ERROR_FLAG="" + if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then + CONTINUE_ON_ERROR_FLAG="--continue-on-error" + fi + python3 sglang/multimodal_gen/test/run_suite.py \ + --suite 1-gpu \ + --partition-id ${{ matrix.part }} \ + --total-partitions 2 \ + $CONTINUE_ON_ERROR_FLAG - - name: Benchmark VLM online latency - timeout-minutes: 10 - run: | - cd test/srt - python3 -m unittest test_bench_serving.TestBenchServing.test_vlm_online_latency - performance-test-1-gpu-part-3: - needs: [check-changes, call-gate, stage-a-test-1] + multimodal-gen-test-2-gpu: + needs: [check-changes, call-gate, sgl-kernel-build-wheels] if: | always() && ( - (inputs.target_stage == 'performance-test-1-gpu-part-3') || + (inputs.target_stage == 'multimodal-gen-test-2-gpu') || ( !inputs.target_stage && (github.event_name == 'schedule' || (!failure() && !cancelled())) && - ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) + needs.check-changes.outputs.multimodal_gen == 'true' ) ) - runs-on: 1-gpu-runner - env: - RUNNER_LABELS: 1-gpu-runner + runs-on: 2-gpu-runner + timeout-minutes: 60 + strategy: + fail-fast: false + matrix: + part: [0, 1] steps: - name: Checkout code uses: actions/checkout@v4 @@ -1342,47 +1207,42 @@ jobs: - name: Install dependencies timeout-minutes: 10 run: | - CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh - - - name: Benchmark Scores online latency and throughput - timeout-minutes: 10 - run: | - cd test/srt - python3 -m unittest test_bench_serving.TestBenchServing.test_score_api_latency_throughput - - - name: Benchmark Scores online latency and throughput (batch size scaling) - timeout-minutes: 10 - run: | - cd test/srt - python3 -m unittest test_bench_serving.TestBenchServing.test_score_api_batch_scaling - - - name: Benchmark Embeddings online latency and throughput - timeout-minutes: 10 - run: | - cd test/srt - python3 -m unittest test_bench_serving.TestBenchServing.test_embeddings_api_latency_throughput + CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh diffusion - - name: Benchmark Embeddings online latency and throughput (batch size scaling) - timeout-minutes: 10 + - name: Run diffusion server tests + timeout-minutes: 60 run: | - cd test/srt - python3 -m unittest test_bench_serving.TestBenchServing.test_embeddings_api_batch_scaling + cd python + CONTINUE_ON_ERROR_FLAG="" + if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then + CONTINUE_ON_ERROR_FLAG="--continue-on-error" + fi + python3 sglang/multimodal_gen/test/run_suite.py \ + --suite 2-gpu \ + --partition-id ${{ matrix.part }} \ + --total-partitions 2 \ + $CONTINUE_ON_ERROR_FLAG - performance-test-2-gpu: - needs: [check-changes, call-gate, stage-b-test-4-gpu-b200] + unit-test-backend-4-gpu: + needs: [check-changes, call-gate, stage-b-test-small-1-gpu, stage-b-test-large-1-gpu, stage-b-test-large-2-gpu, stage-b-test-4-gpu-b200] if: | always() && ( - (inputs.target_stage == 'performance-test-2-gpu') || + (inputs.target_stage == 'unit-test-backend-4-gpu') || ( !inputs.target_stage && (github.event_name == 'schedule' || (!failure() && !cancelled())) && ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) ) ) - runs-on: 2-gpu-runner + runs-on: 4-gpu-h100 + timeout-minutes: 60 env: - RUNNER_LABELS: 2-gpu-runner + RUNNER_LABELS: 4-gpu-h100 + strategy: + fail-fast: false + matrix: + part: [0, 1, 2] steps: - name: Checkout code uses: actions/checkout@v4 @@ -1402,57 +1262,40 @@ jobs: run: | CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh - - name: Benchmark single latency (TP=2) - timeout-minutes: 10 - run: | - cd test/srt - python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1 - - - name: Benchmark single latency + torch.compile (TP=2) - timeout-minutes: 10 - run: | - cd test/srt - python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_torch_compile_tp2_bs1 - - - name: Benchmark offline throughput (TP=2) - timeout-minutes: 10 - run: | - cd test/srt - python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_default - - - name: Benchmark offline throughput (w/o RadixAttention) (TP=2) - timeout-minutes: 10 - run: | - cd test/srt - python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_without_radix_cache - - - name: Benchmark offline PP decode throughput (PP=2) - timeout-minutes: 10 - run: | - cd test/srt - python3 -m unittest test_bench_serving.TestBenchServing.test_pp_offline_throughput_default_decode - - - name: Benchmark offline PP prefill throughput (PP=2) - timeout-minutes: 10 + - name: Run test + timeout-minutes: 20 run: | cd test/srt - python3 -m unittest test_bench_serving.TestBenchServing.test_pp_long_context_prefill + RETRY_FLAG="" + if [[ "${{ needs.check-changes.outputs.enable_retry }}" == "true" ]]; then + RETRY_FLAG="--enable-retry" + fi + CONTINUE_ON_ERROR_FLAG="" + if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then + CONTINUE_ON_ERROR_FLAG="--continue-on-error" + fi + python3 run_suite.py --suite per-commit-4-gpu --auto-partition-id ${{ matrix.part }} --auto-partition-size 3 $RETRY_FLAG $CONTINUE_ON_ERROR_FLAG - accuracy-test-1-gpu: - needs: [check-changes, call-gate, stage-a-test-1] + unit-test-backend-8-gpu-h200: + needs: [check-changes, call-gate, stage-b-test-small-1-gpu, stage-b-test-large-1-gpu, stage-b-test-large-2-gpu, stage-b-test-4-gpu-b200] if: | always() && ( - (inputs.target_stage == 'accuracy-test-1-gpu') || + (inputs.target_stage == 'unit-test-backend-8-gpu-h200') || ( !inputs.target_stage && (github.event_name == 'schedule' || (!failure() && !cancelled())) && ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) ) ) - runs-on: 1-gpu-runner + runs-on: 8-gpu-h200 + timeout-minutes: 60 env: - RUNNER_LABELS: 1-gpu-runner + RUNNER_LABELS: 8-gpu-h200 + strategy: + fail-fast: false + matrix: + part: [0, 1, 2, 3] steps: - name: Checkout code uses: actions/checkout@v4 @@ -1471,31 +1314,48 @@ jobs: timeout-minutes: 10 run: | CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh - git clone https://github.com/merrymercy/human-eval.git - cd human-eval - pip install -e . - - name: Evaluate accuracy - timeout-minutes: 25 + # - name: Warmup Weights and JIT Compilation + # timeout-minutes: 20 + # run: | + # # An example command for testing the warmup. TODO: make this more general and move them to python scripts. + # python3 -m sglang.compile_deep_gemm --model deepseek-ai/DeepSeek-V3-0324 --tp 8 --trust-remote-code + + - name: Run test + timeout-minutes: 20 run: | cd test/srt - python3 -m sglang.test.ci.run_with_retry test_eval_accuracy_large.py + RETRY_FLAG="" + if [[ "${{ needs.check-changes.outputs.enable_retry }}" == "true" ]]; then + RETRY_FLAG="--enable-retry" + fi + CONTINUE_ON_ERROR_FLAG="" + if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then + CONTINUE_ON_ERROR_FLAG="--continue-on-error" + fi + python3 run_suite.py --suite per-commit-8-gpu-h200 --auto-partition-id ${{ matrix.part }} --auto-partition-size 4 $RETRY_FLAG $CONTINUE_ON_ERROR_FLAG - accuracy-test-2-gpu: - needs: [check-changes, call-gate, accuracy-test-1-gpu] + unit-test-backend-8-gpu-h20: + needs: [check-changes, call-gate, stage-b-test-small-1-gpu, stage-b-test-large-1-gpu, stage-b-test-large-2-gpu, stage-b-test-4-gpu-b200] if: | always() && ( - (inputs.target_stage == 'accuracy-test-2-gpu') || + (inputs.target_stage == 'unit-test-backend-8-gpu-h20') || ( !inputs.target_stage && (github.event_name == 'schedule' || (!failure() && !cancelled())) && ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) ) ) - runs-on: 2-gpu-runner + runs-on: 8-gpu-h20 + timeout-minutes: 60 env: - RUNNER_LABELS: 2-gpu-runner + SGLANG_CI_RDMA_ALL_DEVICES: "mlx5_1,mlx5_2,mlx5_3,mlx5_4" + RUNNER_LABELS: 8-gpu-h20 + strategy: + fail-fast: false + matrix: + part: [0, 1] steps: - name: Checkout code uses: actions/checkout@v4 @@ -1513,16 +1373,21 @@ jobs: - name: Install dependencies timeout-minutes: 10 run: | - CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh - git clone https://github.com/merrymercy/human-eval.git - cd human-eval - pip install -e . + CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_deepep.sh - - name: Evaluate accuracy (TP=2) - timeout-minutes: 25 + - name: Run test + timeout-minutes: 20 run: | cd test/srt - python3 -m sglang.test.ci.run_with_retry test_moe_eval_accuracy_large.py + RETRY_FLAG="" + if [[ "${{ needs.check-changes.outputs.enable_retry }}" == "true" ]]; then + RETRY_FLAG="--enable-retry" + fi + CONTINUE_ON_ERROR_FLAG="" + if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then + CONTINUE_ON_ERROR_FLAG="--continue-on-error" + fi + python3 run_suite.py --suite per-commit-8-gpu-h20 --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 $RETRY_FLAG $CONTINUE_ON_ERROR_FLAG unit-test-deepep-4-gpu: needs: [check-changes, call-gate, stage-b-test-small-1-gpu, stage-b-test-large-1-gpu, stage-b-test-large-2-gpu, stage-b-test-4-gpu-b200] @@ -1537,6 +1402,7 @@ jobs: ) ) runs-on: 4-gpu-h100 + timeout-minutes: 60 env: RUNNER_LABELS: 4-gpu-h100 steps: @@ -1634,6 +1500,7 @@ jobs: ) ) runs-on: ${{ needs.check-changes.outputs.b200_runner }} + timeout-minutes: 60 env: RUNNER_LABELS: ${{ needs.check-changes.outputs.b200_runner }} strategy: @@ -1687,6 +1554,7 @@ jobs: ) ) runs-on: 4-gpu-gb200 + timeout-minutes: 60 env: RUNNER_LABELS: 4-gpu-gb200 strategy: @@ -1746,18 +1614,16 @@ jobs: stage-b-test-small-1-gpu, stage-b-test-large-1-gpu, stage-b-test-large-2-gpu, + stage-b-test-small-1-gpu-performance, + stage-b-test-large-1-gpu-performance, + stage-b-test-large-2-gpu-performance, + stage-b-test-small-1-gpu-accuracy, + stage-b-test-large-2-gpu-accuracy, stage-c-test-large-4-gpu, - quantization-test, stage-b-test-4-gpu-b200, unit-test-backend-4-gpu, unit-test-backend-8-gpu-h20, unit-test-backend-8-gpu-h200, - performance-test-1-gpu-part-1, - performance-test-1-gpu-part-2, - performance-test-1-gpu-part-3, - performance-test-2-gpu, - accuracy-test-1-gpu, - accuracy-test-2-gpu, unit-test-deepep-4-gpu, # unit-test-deepep-8-gpu, # Disabled, see #17175 unit-test-backend-4-gpu-b200, diff --git a/test/srt/test_eval_accuracy_large.py b/test/registered/eval/test_eval_accuracy_large.py similarity index 94% rename from test/srt/test_eval_accuracy_large.py rename to test/registered/eval/test_eval_accuracy_large.py index efb202463e19..4de8f61fa138 100644 --- a/test/srt/test_eval_accuracy_large.py +++ b/test/registered/eval/test_eval_accuracy_large.py @@ -7,6 +7,7 @@ from types import SimpleNamespace from sglang.srt.utils import kill_process_tree +from sglang.test.ci.ci_register import register_cuda_ci from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, @@ -18,6 +19,8 @@ write_github_step_summary, ) +register_cuda_ci(est_time=300, suite="stage-b-test-small-1-gpu-accuracy") + class TestEvalAccuracyLarge(CustomTestCase): @classmethod diff --git a/test/srt/test_moe_eval_accuracy_large.py b/test/registered/eval/test_moe_eval_accuracy_large.py similarity index 94% rename from test/srt/test_moe_eval_accuracy_large.py rename to test/registered/eval/test_moe_eval_accuracy_large.py index 26bbd247e1dd..76d1bf185526 100644 --- a/test/srt/test_moe_eval_accuracy_large.py +++ b/test/registered/eval/test_moe_eval_accuracy_large.py @@ -7,6 +7,7 @@ from types import SimpleNamespace from sglang.srt.utils import kill_process_tree +from sglang.test.ci.ci_register import register_cuda_ci from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_MOE_MODEL_NAME_FOR_TEST, @@ -18,6 +19,8 @@ write_github_step_summary, ) +register_cuda_ci(est_time=500, suite="stage-b-test-large-2-gpu-accuracy") + class TestMoEEvalAccuracyLarge(CustomTestCase): @classmethod diff --git a/test/registered/perf/test_bench_one_batch_1gpu.py b/test/registered/perf/test_bench_one_batch_1gpu.py new file mode 100644 index 000000000000..fbcc2ec05aec --- /dev/null +++ b/test/registered/perf/test_bench_one_batch_1gpu.py @@ -0,0 +1,39 @@ +import unittest + +from sglang.test.ci.ci_register import register_cuda_ci +from sglang.test.test_utils import ( + DEFAULT_MODEL_NAME_FOR_TEST, + DEFAULT_SMALL_MODEL_NAME_FOR_TEST, + CustomTestCase, + is_in_ci, + run_bench_offline_throughput, + run_bench_one_batch, + write_github_step_summary, +) + +register_cuda_ci(est_time=120, suite="stage-b-test-large-1-gpu-performance") + + +class TestBenchOneBatch1GPU(CustomTestCase): + + def test_bs1_small(self): + _, output_throughput, _ = run_bench_one_batch( + DEFAULT_SMALL_MODEL_NAME_FOR_TEST, ["--cuda-graph-max-bs", "2"] + ) + self.assertGreater(output_throughput, 50) + + def test_bs1_default(self): + output_throughput = run_bench_offline_throughput( + DEFAULT_MODEL_NAME_FOR_TEST, ["--cuda-graph-max-bs", "2"] + ) + + if is_in_ci(): + write_github_step_summary( + f"### test_bs1_default (llama-3.1-8b)\n" + f"output_throughput: {output_throughput:.2f} token/s\n" + ) + self.assertGreater(output_throughput, 135) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/srt/test_bench_one_batch.py b/test/registered/perf/test_bench_one_batch_2gpu.py similarity index 61% rename from test/srt/test_bench_one_batch.py rename to test/registered/perf/test_bench_one_batch_2gpu.py index 8d14cd0b7098..9af873bea9a0 100644 --- a/test/srt/test_bench_one_batch.py +++ b/test/registered/perf/test_bench_one_batch_2gpu.py @@ -1,40 +1,20 @@ import unittest +from sglang.test.ci.ci_register import register_cuda_ci from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, DEFAULT_MOE_MODEL_NAME_FOR_TEST, - DEFAULT_SMALL_MODEL_NAME_FOR_TEST, CustomTestCase, is_in_amd_ci, is_in_ci, run_bench_offline_throughput, - run_bench_one_batch, write_github_step_summary, ) -# We use `run_bench_offline_throughput`` instead of `run_bench_one_batch` for most cases -# because `run_bench_offline_throughput`` has overlap scheduler. +register_cuda_ci(est_time=180, suite="stage-b-test-large-2-gpu-performance") -class TestBenchOneBatch(CustomTestCase): - - def test_bs1_small(self): - _, output_throughput, _ = run_bench_one_batch( - DEFAULT_SMALL_MODEL_NAME_FOR_TEST, ["--cuda-graph-max-bs", "2"] - ) - self.assertGreater(output_throughput, 50) - - def test_bs1_default(self): - output_throughput = run_bench_offline_throughput( - DEFAULT_MODEL_NAME_FOR_TEST, ["--cuda-graph-max-bs", "2"] - ) - - if is_in_ci(): - write_github_step_summary( - f"### test_bs1_default (llama-3.1-8b)\n" - f"output_throughput: {output_throughput:.2f} token/s\n" - ) - self.assertGreater(output_throughput, 135) +class TestBenchOneBatch2GPU(CustomTestCase): def test_moe_tp2_bs1(self): output_throughput = run_bench_offline_throughput( diff --git a/test/registered/perf/test_bench_serving_1gpu_large.py b/test/registered/perf/test_bench_serving_1gpu_large.py new file mode 100644 index 000000000000..68860a015b7e --- /dev/null +++ b/test/registered/perf/test_bench_serving_1gpu_large.py @@ -0,0 +1,81 @@ +""" +Performance tests for single GPU that need H200 (80GB) - FP8 and EAGLE tests. +""" + +import unittest + +from sglang.test.ci.ci_register import register_cuda_ci +from sglang.test.test_utils import ( + DEFAULT_DRAFT_MODEL_EAGLE, + DEFAULT_MODEL_NAME_FOR_TEST_FP8, + DEFAULT_TARGET_MODEL_EAGLE, + CustomTestCase, + is_in_amd_ci, + is_in_ci, + run_bench_serving, + write_github_step_summary, +) + +register_cuda_ci(est_time=300, suite="stage-b-test-large-1-gpu-performance") + + +class TestBenchServing1GPULarge(CustomTestCase): + def test_offline_throughput_default_fp8(self): + res = run_bench_serving( + model=DEFAULT_MODEL_NAME_FOR_TEST_FP8, + num_prompts=500, + request_rate=float("inf"), + other_server_args=[], + ) + + if is_in_ci(): + write_github_step_summary( + f"### test_offline_throughput_default_fp8\n" + f"Output throughput: {res['output_throughput']:.2f} token/s\n" + ) + if is_in_amd_ci(): + self.assertGreater(res["output_throughput"], 3500) + else: + self.assertGreater(res["output_throughput"], 4300) + + def test_online_latency_eagle(self): + res = run_bench_serving( + model=DEFAULT_TARGET_MODEL_EAGLE, + num_prompts=300, + request_rate=8, + sharegpt_context_len=3072, + disable_ignore_eos=True, + dataset_name="sharegpt", + other_server_args=[ + "--speculative-algorithm", + "EAGLE", + "--speculative-draft-model-path", + DEFAULT_DRAFT_MODEL_EAGLE, + "--speculative-num-steps", + "5", + "--speculative-eagle-topk", + "4", + "--speculative-num-draft-tokens", + "16", + "--mem-fraction-static", + "0.7", + ], + need_warmup=True, + seed=42, + ) + + if is_in_ci(): + write_github_step_summary( + f"### test_online_latency_eagle\n" + f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n" + f"accept_length: {res['accept_length']:.2f} \n" + ) + if is_in_amd_ci(): + self.assertLess(res["median_e2e_latency_ms"], 1800) + else: + self.assertLess(res["median_e2e_latency_ms"], 900) + self.assertGreater(res["accept_length"], 3.0) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/registered/perf/test_bench_serving_1gpu_part1.py b/test/registered/perf/test_bench_serving_1gpu_part1.py new file mode 100644 index 000000000000..76c7e7e6b83d --- /dev/null +++ b/test/registered/perf/test_bench_serving_1gpu_part1.py @@ -0,0 +1,258 @@ +""" +Performance tests for single GPU - LLM throughput/latency and LoRA tests. +Works on 5090 (32GB). +""" + +import asyncio +import itertools +import unittest + +import requests + +from sglang.test.ci.ci_register import register_cuda_ci +from sglang.test.test_utils import ( + DEFAULT_MODEL_NAME_FOR_TEST, + CustomTestCase, + is_in_amd_ci, + is_in_ci, + run_bench_serving, + write_github_step_summary, +) + +register_cuda_ci(est_time=1000, suite="stage-b-test-large-1-gpu-performance") + + +class TestBenchServing1GPUPart1(CustomTestCase): + def test_offline_throughput_default(self): + res = run_bench_serving( + model=DEFAULT_MODEL_NAME_FOR_TEST, + num_prompts=500, + request_rate=float("inf"), + other_server_args=[], + ) + + if is_in_ci(): + write_github_step_summary( + f"### test_offline_throughput_default\n" + f"Output throughput: {res['output_throughput']:.2f} token/s\n" + ) + if is_in_amd_ci(): + self.assertGreater(res["output_throughput"], 3050) + else: + self.assertGreater(res["output_throughput"], 3800) + + def test_offline_throughput_non_stream_small_batch_size(self): + res = run_bench_serving( + model=DEFAULT_MODEL_NAME_FOR_TEST, + num_prompts=200, + request_rate=float("inf"), + other_server_args=["--max-running-requests", "10"], + dataset_name="sharegpt", + random_input_len=None, + random_output_len=None, + disable_stream=True, + need_warmup=True, + ) + + if is_in_ci(): + write_github_step_summary( + f"### test_offline_throughput_non_stream_small_batch_size\n" + f"Output throughput: {res['output_throughput']:.2f} token/s\n" + ) + if is_in_amd_ci(): + self.assertGreater(res["output_throughput"], 1000) + else: + self.assertGreater(res["output_throughput"], 1050) + + def test_offline_throughput_without_radix_cache(self): + res = run_bench_serving( + model=DEFAULT_MODEL_NAME_FOR_TEST, + num_prompts=500, + request_rate=float("inf"), + other_server_args=["--disable-radix-cache"], + ) + + if is_in_ci(): + write_github_step_summary( + f"### test_offline_throughput_without_radix_cache\n" + f"Output throughput: {res['output_throughput']:.2f} token/s\n" + ) + if is_in_amd_ci(): + self.assertGreater(res["output_throughput"], 3050) + else: + self.assertGreater(res["output_throughput"], 3800) + + def test_offline_throughput_without_chunked_prefill(self): + res = run_bench_serving( + model=DEFAULT_MODEL_NAME_FOR_TEST, + num_prompts=500, + request_rate=float("inf"), + other_server_args=["--chunked-prefill-size", "-1"], + ) + + if is_in_ci(): + write_github_step_summary( + f"### test_offline_throughput_without_chunked_prefill\n" + f"Output throughput: {res['output_throughput']:.2f} token/s\n" + ) + self.assertGreater(res["output_throughput"], 2600) + + def test_offline_throughput_with_triton_attention_backend(self): + res = run_bench_serving( + model=DEFAULT_MODEL_NAME_FOR_TEST, + num_prompts=500, + request_rate=float("inf"), + other_server_args=[ + "--attention-backend", + "triton", + "--context-length", + "8192", + ], + ) + + if is_in_ci(): + write_github_step_summary( + f"### test_offline_throughput_with_triton_attention_backend\n" + f"Output throughput: {res['output_throughput']:.2f} token/s\n" + ) + if is_in_amd_ci(): + self.assertGreater(res["output_throughput"], 3500) + else: + self.assertGreater(res["output_throughput"], 3700) + + def test_online_latency_default(self): + res = run_bench_serving( + model=DEFAULT_MODEL_NAME_FOR_TEST, + num_prompts=100, + request_rate=1, + other_server_args=[], + ) + + if is_in_ci(): + write_github_step_summary( + f"### test_online_latency_default\n" + f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n" + ) + self.assertLess(res["median_e2e_latency_ms"], 11000) + if is_in_amd_ci(): + self.assertLess(res["median_ttft_ms"], 115) + else: + self.assertLess(res["median_ttft_ms"], 86) + self.assertLess(res["median_itl_ms"], 10) + + def test_lora_online_latency(self): + if is_in_amd_ci(): + pass + + res = self._run_lora_latency_test(enable_background_task=False) + + if is_in_ci(): + write_github_step_summary( + f"### test_lora_online_latency\n" + f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n" + f"median_ttft_ms: {res['median_ttft_ms']:.2f} ms\n" + ) + self.assertLess(res["median_e2e_latency_ms"], 2400) + self.assertLess(res["median_ttft_ms"], 58) + + def test_lora_online_latency_with_concurrent_adapter_updates(self): + if is_in_amd_ci(): + pass + + res = self._run_lora_latency_test(enable_background_task=True) + + if is_in_ci(): + write_github_step_summary( + f"### test_lora_online_latency\n" + f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n" + f"median_ttft_ms: {res['median_ttft_ms']:.2f} ms\n" + ) + self.assertLess(res["median_e2e_latency_ms"], 4000) + self.assertLess(res["median_ttft_ms"], 80) + + def _run_lora_latency_test(self, enable_background_task: bool): + """ + Run a latency test for LoRA with the specified background task setting. + """ + + async def lora_loader_unloader_task( + base_url: str, + start_event: asyncio.Event, + stop_event: asyncio.Event, + ): + """ + A background task that repeatedly loads and unloads a LoRA adapter. + """ + await start_event.wait() + + path_cycler = itertools.cycle( + [ + "pbevan11/llama-3.1-8b-ocr-correction", + "faridlazuarda/valadapt-llama-3.1-8B-it-chinese", + "philschmid/code-llama-3-1-8b-text-to-sql-lora", + ] + ) + load_url = f"{base_url}/load_lora_adapter" + unload_url = f"{base_url}/unload_lora_adapter" + num_updates = 0 + + while not stop_event.is_set(): + lora_path = next(path_cycler) + response = await asyncio.to_thread( + requests.post, + load_url, + json={"lora_name": lora_path, "lora_path": lora_path}, + ) + self.assertTrue( + response.ok, f"Failed to load LoRA adapter: {response.text}" + ) + num_updates += 1 + + if stop_event.is_set(): + break + + await asyncio.sleep(1) + + response = await asyncio.to_thread( + requests.post, + unload_url, + json={"lora_name": lora_path}, + ) + self.assertTrue( + response.ok, f"Failed to unload LoRA adapter: {response.text}" + ) + num_updates += 1 + + await asyncio.sleep(1) + + background_task = lora_loader_unloader_task if enable_background_task else None + res = run_bench_serving( + model=DEFAULT_MODEL_NAME_FOR_TEST, + num_prompts=400, + request_rate=8, + other_server_args=[ + "--enable-lora", + "--max-loras-per-batch", + "1", + "--disable-radix-cache", + "--random-seed", + "42", + "--mem-fraction-static", + "0.8", + "--lora-paths", + "Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16", + "--max-lora-rank", + "256", + ], + dataset_name="random", + random_input_len=256, + random_output_len=256, + lora_name=["Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16"], + background_task=background_task, + ) + + return res + + +if __name__ == "__main__": + unittest.main() diff --git a/test/registered/perf/test_bench_serving_1gpu_part2.py b/test/registered/perf/test_bench_serving_1gpu_part2.py new file mode 100644 index 000000000000..d33522233ca9 --- /dev/null +++ b/test/registered/perf/test_bench_serving_1gpu_part2.py @@ -0,0 +1,186 @@ +""" +Performance tests for single GPU - VLM, Score API, and Embeddings API tests. +Works on 5090 (32GB). +""" + +import unittest + +from sglang.test.ci.ci_register import register_cuda_ci +from sglang.test.test_utils import ( + DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST, + DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE, + DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST, + CustomTestCase, + is_in_amd_ci, + is_in_ci, + run_bench_serving, + run_embeddings_benchmark, + run_score_benchmark, + write_github_step_summary, +) + +register_cuda_ci(est_time=900, suite="stage-b-test-large-1-gpu-performance") + + +class TestBenchServing1GPUPart2(CustomTestCase): + def test_vlm_offline_throughput(self): + res = run_bench_serving( + model=DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST, + num_prompts=200, + request_rate=float("inf"), + other_server_args=[ + "--mem-fraction-static", + "0.7", + ], + dataset_name="mmmu", + ) + + if is_in_ci(): + write_github_step_summary( + f"### test_vlm_offline_throughput\n" + f"Output throughput: {res['output_throughput']:.2f} token/s\n" + ) + if is_in_amd_ci(): + self.assertGreater(res["output_throughput"], 2000) + else: + self.assertGreater(res["output_throughput"], 2500) + + def test_vlm_online_latency(self): + res = run_bench_serving( + model=DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST, + num_prompts=250, + request_rate=1, + other_server_args=[ + "--mem-fraction-static", + "0.7", + ], + dataset_name="mmmu", + ) + + if is_in_ci(): + write_github_step_summary( + f"### test_vlm_online_latency\n" + f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n" + ) + self.assertLess(res["median_e2e_latency_ms"], 16500) + if is_in_amd_ci(): + self.assertLess(res["median_ttft_ms"], 150) + else: + self.assertLess(res["median_ttft_ms"], 100) + self.assertLess(res["median_itl_ms"], 8) + + def test_score_api_latency_throughput(self): + """Test score API latency and throughput performance""" + res = run_score_benchmark( + model=DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE, + num_requests=1000, + batch_size=10, + other_server_args=[], + need_warmup=True, + ) + + if is_in_ci(): + write_github_step_summary( + f"### test_score_api_throughput\n" + f"Average latency: {res['avg_latency_ms']:.2f} ms\n" + f"P95 latency: {res['p95_latency_ms']:.2f} ms\n" + f"Score API throughput: {res['throughput']:.2f} req/s\n" + f"Successful requests: {res['successful_requests']}/{res['total_requests']}\n" + ) + + self.assertEqual(res["successful_requests"], res["total_requests"]) + self.assertLess(res["avg_latency_ms"], 48) + self.assertLess(res["p95_latency_ms"], 50) + self.assertGreater(res["throughput"], 20) + + def test_score_api_batch_scaling(self): + """Test score API performance with different batch sizes""" + batch_sizes = [10, 25, 50] + + for batch_size in batch_sizes: + res = run_score_benchmark( + model=DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE, + num_requests=500, + batch_size=batch_size, + ) + + if is_in_ci(): + write_github_step_summary( + f"### test_score_api_batch_scaling_size_{batch_size}\n" + f"Batch size: {batch_size}\n" + f"Average latency: {res['avg_latency_ms']:.2f} ms\n" + f"P95 latency: {res['p95_latency_ms']:.2f} ms\n" + f"Throughput: {res['throughput']:.2f} req/s\n" + f"Successful requests: {res['successful_requests']}/{res['total_requests']}\n" + ) + + self.assertEqual(res["successful_requests"], res["total_requests"]) + bounds = { + 10: (45, 50), + 25: (50, 60), + 50: (60, 65), + } + avg_latency_bound, p95_latency_bound = bounds.get(batch_size, (60, 65)) + self.assertLess(res["avg_latency_ms"], avg_latency_bound) + self.assertLess(res["p95_latency_ms"], p95_latency_bound) + + def test_embeddings_api_latency_throughput(self): + """Test embeddings API latency and throughput performance""" + res = run_embeddings_benchmark( + model=DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST, + num_requests=1000, + batch_size=1, + input_tokens=500, + other_server_args=[], + need_warmup=True, + ) + + if is_in_ci(): + write_github_step_summary( + f"### test_embeddings_api_throughput\n" + f"Average latency: {res['avg_latency_ms']:.2f} ms\n" + f"P95 latency: {res['p95_latency_ms']:.2f} ms\n" + f"Embeddings API throughput: {res['throughput']:.2f} req/s\n" + f"Successful requests: {res['successful_requests']}/{res['total_requests']}\n" + ) + + self.assertEqual(res["successful_requests"], res["total_requests"]) + self.assertLess(res["avg_latency_ms"], 20) + self.assertLess(res["p95_latency_ms"], 25) + self.assertGreater(res["throughput"], 60) + + def test_embeddings_api_batch_scaling(self): + """Test embeddings API performance with different batch sizes""" + batch_sizes = [10, 25, 50] + + for batch_size in batch_sizes: + res = run_embeddings_benchmark( + model=DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST, + num_requests=500, + batch_size=batch_size, + input_tokens=500, + ) + + if is_in_ci(): + write_github_step_summary( + f"### test_embeddings_api_batch_scaling_size_{batch_size}\n" + f"Batch size: {batch_size}\n" + f"Average latency: {res['avg_latency_ms']:.2f} ms\n" + f"P95 latency: {res['p95_latency_ms']:.2f} ms\n" + f"Throughput: {res['throughput']:.2f} req/s\n" + f"Successful requests: {res['successful_requests']}/{res['total_requests']}\n" + ) + + self.assertEqual(res["successful_requests"], res["total_requests"]) + bounds = { + 10: (60, 65), + 25: (115, 120), + 50: (190, 195), + } + avg_latency_bound, p95_latency_bound = bounds.get(batch_size, (250, 250)) + self.assertLess(res["avg_latency_ms"], avg_latency_bound) + self.assertLess(res["p95_latency_ms"], p95_latency_bound) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/registered/perf/test_bench_serving_2gpu.py b/test/registered/perf/test_bench_serving_2gpu.py new file mode 100644 index 000000000000..51c7358f0402 --- /dev/null +++ b/test/registered/perf/test_bench_serving_2gpu.py @@ -0,0 +1,107 @@ +""" +Performance tests for 2-GPU that need large GPUs (H200 80GB) - MoE and Pipeline Parallel tests. +""" + +import unittest + +from sglang.test.ci.ci_register import register_cuda_ci +from sglang.test.test_utils import ( + DEFAULT_MOE_MODEL_NAME_FOR_TEST, + CustomTestCase, + is_in_amd_ci, + is_in_ci, + run_bench_serving, + write_github_step_summary, +) + +register_cuda_ci(est_time=600, suite="stage-b-test-large-2-gpu-performance") + + +class TestBenchServing2GPU(CustomTestCase): + def test_moe_offline_throughput_default(self): + res = run_bench_serving( + model=DEFAULT_MOE_MODEL_NAME_FOR_TEST, + num_prompts=300, + request_rate=float("inf"), + other_server_args=["--tp", "2"], + ) + + if is_in_ci(): + write_github_step_summary( + f"### test_moe_offline_throughput_default\n" + f"Output throughput: {res['output_throughput']:.2f} token/s\n" + ) + if is_in_amd_ci(): + self.assertGreater(res["output_throughput"], 2100) + else: + self.assertGreater(res["output_throughput"], 2200) + + def test_moe_offline_throughput_without_radix_cache(self): + res = run_bench_serving( + model=DEFAULT_MOE_MODEL_NAME_FOR_TEST, + num_prompts=300, + request_rate=float("inf"), + other_server_args=["--tp", "2", "--disable-radix-cache"], + ) + + if is_in_ci(): + write_github_step_summary( + f"### test_moe_offline_throughput_without_radix_cache\n" + f"Output throughput: {res['output_throughput']:.2f} token/s\n" + ) + if is_in_amd_ci(): + self.assertGreater(res["output_throughput"], 2100) + else: + self.assertGreater(res["output_throughput"], 2200) + + def test_pp_offline_throughput_default_decode(self): + res = run_bench_serving( + model=DEFAULT_MOE_MODEL_NAME_FOR_TEST, + num_prompts=1000, + request_rate=float("inf"), + random_input_len=1, + random_output_len=1024, + other_server_args=["--pp-size", "2"], + need_warmup=True, + seed=42, + ) + + if is_in_ci(): + write_github_step_summary( + f"### test_pp_offline_throughput_default_decode\n" + f"Output throughput: {res['output_throughput']:.2f} token/s\n" + ) + self.assertGreater(res["output_throughput"], 6700) + + def test_pp_long_context_prefill(self): + res = run_bench_serving( + model="meta-llama/Llama-3.3-70B-Instruct", + num_prompts=4, + request_rate=float("inf"), + random_input_len=128000, + random_output_len=1, + dataset_name="random", + other_server_args=[ + "--quantization", + "fp8", + "--pp-size", + "2", + ] + + (["--mem-fraction-static", "0.7"] if is_in_amd_ci() else []), + need_warmup=False, + seed=42, + ) + + if is_in_ci(): + write_github_step_summary( + f"### test_pp_long_context_latency_prefill\n" + f"input_throughput: {res['input_throughput']:.2f} ms\n" + ) + if is_in_amd_ci(): + self.assertGreater(res["input_throughput"], 3000) + else: + self.assertGreater(res["input_throughput"], 4000) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/registered/perf/test_vlm_perf_5090.py b/test/registered/perf/test_vlm_perf_5090.py new file mode 100644 index 000000000000..772dc90d046f --- /dev/null +++ b/test/registered/perf/test_vlm_perf_5090.py @@ -0,0 +1,62 @@ +""" +VLM Performance tests that work on 5090 (32GB) - VLM offline throughput and online latency tests. +""" + +import unittest + +from sglang.test.ci.ci_register import register_cuda_ci +from sglang.test.test_utils import ( + DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST, + CustomTestCase, + is_in_ci, + run_bench_serving, + write_github_step_summary, +) + +register_cuda_ci(est_time=600, suite="stage-b-test-small-1-gpu-performance") + + +class TestVLMPerf5090(CustomTestCase): + def test_vlm_offline_throughput(self): + res = run_bench_serving( + model=DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST, + num_prompts=200, + request_rate=float("inf"), + other_server_args=[ + "--mem-fraction-static", + "0.7", + ], + dataset_name="mmmu", + ) + + if is_in_ci(): + write_github_step_summary( + f"### test_vlm_offline_throughput (5090)\n" + f"Output throughput: {res['output_throughput']:.2f} token/s\n" + ) + self.assertGreater(res["output_throughput"], 2000) + + def test_vlm_online_latency(self): + res = run_bench_serving( + model=DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST, + num_prompts=250, + request_rate=1, + other_server_args=[ + "--mem-fraction-static", + "0.7", + ], + dataset_name="mmmu", + ) + + if is_in_ci(): + write_github_step_summary( + f"### test_vlm_online_latency (5090)\n" + f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n" + ) + self.assertLess(res["median_e2e_latency_ms"], 16500) + self.assertLess(res["median_ttft_ms"], 150) + self.assertLess(res["median_itl_ms"], 8) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/srt/quant/test_awq.py b/test/registered/quant/test_awq.py similarity index 96% rename from test/srt/quant/test_awq.py rename to test/registered/quant/test_awq.py index 87e126adb048..42d2e7f523bb 100644 --- a/test/srt/quant/test_awq.py +++ b/test/registered/quant/test_awq.py @@ -2,6 +2,7 @@ from types import SimpleNamespace from sglang.srt.utils import kill_process_tree +from sglang.test.ci.ci_register import register_cuda_ci from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_AWQ_MOE_MODEL_NAME_FOR_TEST, @@ -11,6 +12,8 @@ popen_launch_server, ) +register_cuda_ci(est_time=163, suite="stage-b-test-large-1-gpu") + class TestAWQ(CustomTestCase): @classmethod diff --git a/test/srt/test_bnb.py b/test/registered/quant/test_bnb.py similarity index 98% rename from test/srt/test_bnb.py rename to test/registered/quant/test_bnb.py index 4328d56be965..814ec6a5e1b1 100644 --- a/test/srt/test_bnb.py +++ b/test/registered/quant/test_bnb.py @@ -12,6 +12,7 @@ import openai from sglang.srt.utils import kill_process_tree +from sglang.test.ci.ci_register import register_cuda_ci from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, @@ -21,6 +22,8 @@ popen_launch_server, ) +register_cuda_ci(est_time=5, suite="stage-b-test-small-1-gpu") + VISION_MODELS = [ "unsloth/Qwen2.5-VL-7B-Instruct-bnb-4bit", "unsloth/Qwen2-VL-7B-Instruct-bnb-4bit", diff --git a/test/srt/test_gguf.py b/test/registered/quant/test_gguf.py similarity index 86% rename from test/srt/test_gguf.py rename to test/registered/quant/test_gguf.py index e9776067ca9d..14448bf9b149 100644 --- a/test/srt/test_gguf.py +++ b/test/registered/quant/test_gguf.py @@ -3,8 +3,11 @@ from huggingface_hub import hf_hub_download import sglang as sgl +from sglang.test.ci.ci_register import register_cuda_ci from sglang.test.test_utils import CustomTestCase +register_cuda_ci(est_time=96, suite="stage-b-test-small-1-gpu") + class TestGGUF(CustomTestCase): def test_models(self): diff --git a/test/srt/test_gptqmodel_dynamic.py b/test/registered/quant/test_gptqmodel_dynamic.py similarity index 98% rename from test/srt/test_gptqmodel_dynamic.py rename to test/registered/quant/test_gptqmodel_dynamic.py index ea141df3e377..dd8fd51c989c 100644 --- a/test/srt/test_gptqmodel_dynamic.py +++ b/test/registered/quant/test_gptqmodel_dynamic.py @@ -6,6 +6,7 @@ from sglang.srt.server_args import set_global_server_args_for_scheduler from sglang.srt.utils import kill_process_tree +from sglang.test.ci.ci_register import register_cuda_ci from sglang.test.test_utils import ( DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, DEFAULT_URL_FOR_TEST, @@ -13,6 +14,8 @@ popen_launch_server, ) +register_cuda_ci(est_time=102, suite="stage-b-test-large-1-gpu") + def check_quant_method(model_path: str, use_marlin_kernel: bool): from sglang.srt.configs.device_config import DeviceConfig diff --git a/test/srt/quant/test_marlin_moe.py b/test/registered/quant/test_marlin_moe.py similarity index 99% rename from test/srt/quant/test_marlin_moe.py rename to test/registered/quant/test_marlin_moe.py index b1eb9c2da1e1..a37cf9fd191c 100644 --- a/test/srt/quant/test_marlin_moe.py +++ b/test/registered/quant/test_marlin_moe.py @@ -8,9 +8,12 @@ from sglang.srt.layers.activation import SiluAndMul from sglang.srt.layers.moe.fused_moe_triton.fused_marlin_moe import fused_marlin_moe from sglang.srt.server_args import ServerArgs, set_global_server_args_for_scheduler +from sglang.test.ci.ci_register import register_cuda_ci from sglang.test.test_marlin_utils import awq_marlin_quantize, marlin_quantize from sglang.test.test_utils import CustomTestCase +register_cuda_ci(est_time=200, suite="stage-b-test-small-1-gpu") + set_global_server_args_for_scheduler(object.__new__(ServerArgs)) diff --git a/test/srt/test_quantization.py b/test/registered/quant/test_quantization.py similarity index 97% rename from test/srt/test_quantization.py rename to test/registered/quant/test_quantization.py index a38dd61ff2b0..770b3855ab35 100644 --- a/test/srt/test_quantization.py +++ b/test/registered/quant/test_quantization.py @@ -4,6 +4,7 @@ from types import SimpleNamespace from sglang.srt.utils import kill_process_tree +from sglang.test.ci.ci_register import register_cuda_ci from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1, @@ -15,6 +16,8 @@ write_results_to_json, ) +register_cuda_ci(est_time=185, suite="stage-b-test-large-1-gpu") + MODEL_SCORE_THRESHOLDS = { "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4": 0.825, "hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4": 0.825, diff --git a/test/run_suite.py b/test/run_suite.py index e12db55dbf18..815eead31102 100644 --- a/test/run_suite.py +++ b/test/run_suite.py @@ -28,8 +28,12 @@ HWBackend.CUDA: [ "stage-a-test-1", "stage-b-test-small-1-gpu", + "stage-b-test-small-1-gpu-performance", + "stage-b-test-small-1-gpu-accuracy", "stage-b-test-large-1-gpu", + "stage-b-test-large-1-gpu-performance", "stage-b-test-large-2-gpu", + "stage-b-test-large-2-gpu-performance", "stage-c-test-large-4-gpu", "stage-b-test-4-gpu-b200", "stage-c-test-large-4-gpu-b200", diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index fbc7c8154476..bf47a18e998d 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -53,21 +53,10 @@ # "per-commit-8-gpu-h200-deepep": [ # TestFile("ep/test_deepep_large.py", 563), # ], - "quantization_test": [ - TestFile("quant/test_awq.py", 163), - TestFile("quant/test_marlin_moe.py", 200), - TestFile("test_bnb.py", 5), - TestFile("test_gptqmodel_dynamic.py", 102), - TestFile("test_quantization.py", 185), - TestFile("test_gguf.py", 96), - ], + # quantization_test suite migrated to test/registered/quant/ "__not_in_ci__": [ TestFile("test_release_memory_occupation.py", 200), # Temporarily disabled TestFile("models/test_dummy_grok_models.py"), - TestFile("test_bench_one_batch.py"), - TestFile("test_bench_serving.py"), - TestFile("test_eval_accuracy_large.py"), - TestFile("test_moe_eval_accuracy_large.py"), TestFile("test_profile_v2.py"), TestFile("models/test_ministral3_models.py"), TestFile("test_mistral_large3_basic.py"), diff --git a/test/srt/test_bench_serving.py b/test/srt/test_bench_serving.py deleted file mode 100644 index 9411a839695c..000000000000 --- a/test/srt/test_bench_serving.py +++ /dev/null @@ -1,566 +0,0 @@ -import asyncio -import itertools -import unittest - -import requests - -from sglang.test.test_utils import ( - DEFAULT_DRAFT_MODEL_EAGLE, - DEFAULT_MODEL_NAME_FOR_TEST, - DEFAULT_MODEL_NAME_FOR_TEST_FP8, - DEFAULT_MOE_MODEL_NAME_FOR_TEST, - DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST, - DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE, - DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST, - DEFAULT_TARGET_MODEL_EAGLE, - CustomTestCase, - is_in_amd_ci, - is_in_ci, - run_bench_serving, - run_embeddings_benchmark, - run_score_benchmark, - write_github_step_summary, -) - - -class TestBenchServing(CustomTestCase): - def test_offline_throughput_default(self): - res = run_bench_serving( - model=DEFAULT_MODEL_NAME_FOR_TEST, - num_prompts=500, - request_rate=float("inf"), - other_server_args=[], - ) - - if is_in_ci(): - write_github_step_summary( - f"### test_offline_throughput_default\n" - f"Output throughput: {res['output_throughput']:.2f} token/s\n" - ) - if is_in_amd_ci(): - self.assertGreater(res["output_throughput"], 3050) - else: - self.assertGreater(res["output_throughput"], 3800) - - def test_offline_throughput_non_stream_small_batch_size(self): - res = run_bench_serving( - model=DEFAULT_MODEL_NAME_FOR_TEST, - num_prompts=200, - request_rate=float("inf"), - other_server_args=["--max-running-requests", "10"], - dataset_name="sharegpt", - random_input_len=None, - random_output_len=None, - disable_stream=True, - need_warmup=True, - ) - - if is_in_ci(): - write_github_step_summary( - f"### test_offline_throughput_non_stream_small_batch_size\n" - f"Output throughput: {res['output_throughput']:.2f} token/s\n" - ) - if is_in_amd_ci(): - self.assertGreater(res["output_throughput"], 1000) - else: - self.assertGreater(res["output_throughput"], 1050) - - def test_offline_throughput_without_radix_cache(self): - res = run_bench_serving( - model=DEFAULT_MODEL_NAME_FOR_TEST, - num_prompts=500, - request_rate=float("inf"), - other_server_args=["--disable-radix-cache"], - ) - - if is_in_ci(): - write_github_step_summary( - f"### test_offline_throughput_without_radix_cache\n" - f"Output throughput: {res['output_throughput']:.2f} token/s\n" - ) - if is_in_amd_ci(): - self.assertGreater(res["output_throughput"], 3050) - else: - self.assertGreater(res["output_throughput"], 3800) - - def test_offline_throughput_without_chunked_prefill(self): - res = run_bench_serving( - model=DEFAULT_MODEL_NAME_FOR_TEST, - num_prompts=500, - request_rate=float("inf"), - other_server_args=["--chunked-prefill-size", "-1"], - ) - - if is_in_ci(): - write_github_step_summary( - f"### test_offline_throughput_without_chunked_prefill\n" - f"Output throughput: {res['output_throughput']:.2f} token/s\n" - ) - self.assertGreater(res["output_throughput"], 2600) - - def test_offline_throughput_with_triton_attention_backend(self): - res = run_bench_serving( - model=DEFAULT_MODEL_NAME_FOR_TEST, - num_prompts=500, - request_rate=float("inf"), - other_server_args=[ - "--attention-backend", - "triton", - "--context-length", - "8192", - ], - ) - - if is_in_ci(): - write_github_step_summary( - f"### test_offline_throughput_with_triton_attention_backend\n" - f"Output throughput: {res['output_throughput']:.2f} token/s\n" - ) - if is_in_amd_ci(): - self.assertGreater(res["output_throughput"], 3500) - else: - self.assertGreater(res["output_throughput"], 3700) - - def test_offline_throughput_default_fp8(self): - res = run_bench_serving( - model=DEFAULT_MODEL_NAME_FOR_TEST_FP8, - num_prompts=500, - request_rate=float("inf"), - other_server_args=[], - ) - - if is_in_ci(): - write_github_step_summary( - f"### test_offline_throughput_default_fp8\n" - f"Output throughput: {res['output_throughput']:.2f} token/s\n" - ) - if is_in_amd_ci(): - self.assertGreater(res["output_throughput"], 3500) - else: - self.assertGreater(res["output_throughput"], 4300) - - def test_online_latency_default(self): - res = run_bench_serving( - model=DEFAULT_MODEL_NAME_FOR_TEST, - num_prompts=100, - request_rate=1, - other_server_args=[], - ) - - if is_in_ci(): - write_github_step_summary( - f"### test_online_latency_default\n" - f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n" - ) - self.assertLess(res["median_e2e_latency_ms"], 11000) - if is_in_amd_ci(): - self.assertLess(res["median_ttft_ms"], 115) - else: - self.assertLess(res["median_ttft_ms"], 86) - self.assertLess(res["median_itl_ms"], 10) - - def test_vlm_offline_throughput(self): - res = run_bench_serving( - model=DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST, - num_prompts=200, - request_rate=float("inf"), - other_server_args=[ - "--mem-fraction-static", - "0.7", - ], - dataset_name="mmmu", - ) - - if is_in_ci(): - write_github_step_summary( - f"### test_vlm_offline_throughput\n" - f"Output throughput: {res['output_throughput']:.2f} token/s\n" - ) - if is_in_amd_ci(): - self.assertGreater(res["output_throughput"], 2000) - # TODO: not set yet, need AMD machine - else: - self.assertGreater(res["output_throughput"], 2500) - - def test_vlm_online_latency(self): - res = run_bench_serving( - model=DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST, - num_prompts=250, - request_rate=1, - other_server_args=[ - "--mem-fraction-static", - "0.7", - ], - dataset_name="mmmu", - ) - - if is_in_ci(): - write_github_step_summary( - f"### test_vlm_online_latency\n" - f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n" - ) - self.assertLess(res["median_e2e_latency_ms"], 16500) - if is_in_amd_ci(): - self.assertLess(res["median_ttft_ms"], 150) - # TODO: not set yet, need AMD machine - else: - self.assertLess(res["median_ttft_ms"], 100) - self.assertLess(res["median_itl_ms"], 8) - - def test_lora_online_latency(self): - # TODO (lifuhuang): verify LoRA support in AMD. - if is_in_amd_ci(): - pass - - res = self._run_lora_latency_test(enable_background_task=False) - - if is_in_ci(): - write_github_step_summary( - f"### test_lora_online_latency\n" - f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n" - f"median_ttft_ms: {res['median_ttft_ms']:.2f} ms\n" - ) - self.assertLess(res["median_e2e_latency_ms"], 2400) - self.assertLess(res["median_ttft_ms"], 58) - - def test_lora_online_latency_with_concurrent_adapter_updates(self): - # TODO (lifuhuang): verify LoRA support in AMD. - if is_in_amd_ci(): - pass - - res = self._run_lora_latency_test(enable_background_task=True) - - if is_in_ci(): - write_github_step_summary( - f"### test_lora_online_latency\n" - f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n" - f"median_ttft_ms: {res['median_ttft_ms']:.2f} ms\n" - ) - self.assertLess(res["median_e2e_latency_ms"], 4000) - self.assertLess(res["median_ttft_ms"], 80) - - def _run_lora_latency_test(self, enable_background_task: bool): - """ - Run a latency test for LoRA with the specified background task setting. - """ - - async def lora_loader_unloader_task( - base_url: str, - start_event: asyncio.Event, - stop_event: asyncio.Event, - ): - """ - A background task that repeatedly loads and unloads a LoRA adapter. - """ - await start_event.wait() - - path_cycler = itertools.cycle( - [ - "pbevan11/llama-3.1-8b-ocr-correction", - "faridlazuarda/valadapt-llama-3.1-8B-it-chinese", - "philschmid/code-llama-3-1-8b-text-to-sql-lora", - ] - ) - load_url = f"{base_url}/load_lora_adapter" - unload_url = f"{base_url}/unload_lora_adapter" - num_updates = 0 - - while not stop_event.is_set(): - # 1. Load the LoRA adapter - lora_path = next(path_cycler) - response = await asyncio.to_thread( - requests.post, - load_url, - json={"lora_name": lora_path, "lora_path": lora_path}, - ) - self.assertTrue( - response.ok, f"Failed to load LoRA adapter: {response.text}" - ) - num_updates += 1 - - if stop_event.is_set(): - break - - # Yield control to allow other tasks to run. - await asyncio.sleep(1) - - # 2. Unload the LoRA adapter - response = await asyncio.to_thread( - requests.post, - unload_url, - json={"lora_name": lora_path}, - ) - self.assertTrue( - response.ok, f"Failed to unload LoRA adapter: {response.text}" - ) - num_updates += 1 - - # Yield control to allow other tasks to run. - await asyncio.sleep(1) - - background_task = lora_loader_unloader_task if enable_background_task else None - res = run_bench_serving( - model=DEFAULT_MODEL_NAME_FOR_TEST, - num_prompts=400, - request_rate=8, - other_server_args=[ - "--enable-lora", - "--max-loras-per-batch", - "1", - "--disable-radix-cache", - "--random-seed", - "42", - "--mem-fraction-static", - "0.8", - "--lora-paths", - "Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16", - "--max-lora-rank", - "256", - ], - dataset_name="random", - random_input_len=256, - random_output_len=256, - lora_name=["Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16"], - background_task=background_task, - ) - - return res - - def test_online_latency_eagle(self): - res = run_bench_serving( - model=DEFAULT_TARGET_MODEL_EAGLE, - num_prompts=300, - request_rate=8, - sharegpt_context_len=3072, - disable_ignore_eos=True, - dataset_name="sharegpt", - other_server_args=[ - "--speculative-algorithm", - "EAGLE", - "--speculative-draft-model-path", - DEFAULT_DRAFT_MODEL_EAGLE, - "--speculative-num-steps", - "5", - "--speculative-eagle-topk", - "4", - "--speculative-num-draft-tokens", - "16", - "--mem-fraction-static", - "0.7", - ], - need_warmup=True, - seed=42, - ) - - if is_in_ci(): - write_github_step_summary( - f"### test_online_latency_eagle\n" - f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n" - f"accept_length: {res['accept_length']:.2f} \n" - ) - if is_in_amd_ci(): - self.assertLess(res["median_e2e_latency_ms"], 1800) - else: - self.assertLess(res["median_e2e_latency_ms"], 900) - self.assertGreater(res["accept_length"], 3.0) - - def test_moe_offline_throughput_default(self): - res = run_bench_serving( - model=DEFAULT_MOE_MODEL_NAME_FOR_TEST, - num_prompts=300, - request_rate=float("inf"), - other_server_args=["--tp", "2"], - ) - - if is_in_ci(): - write_github_step_summary( - f"### test_moe_offline_throughput_default\n" - f"Output throughput: {res['output_throughput']:.2f} token/s\n" - ) - if is_in_amd_ci(): - self.assertGreater(res["output_throughput"], 2100) - else: - self.assertGreater(res["output_throughput"], 2200) - - def test_moe_offline_throughput_without_radix_cache(self): - res = run_bench_serving( - model=DEFAULT_MOE_MODEL_NAME_FOR_TEST, - num_prompts=300, - request_rate=float("inf"), - other_server_args=["--tp", "2", "--disable-radix-cache"], - ) - - if is_in_ci(): - write_github_step_summary( - f"### test_moe_offline_throughput_without_radix_cache\n" - f"Output throughput: {res['output_throughput']:.2f} token/s\n" - ) - if is_in_amd_ci(): - self.assertGreater(res["output_throughput"], 2100) - else: - self.assertGreater(res["output_throughput"], 2200) - - def test_pp_offline_throughput_default_decode(self): - res = run_bench_serving( - model=DEFAULT_MOE_MODEL_NAME_FOR_TEST, - num_prompts=1000, - request_rate=float("inf"), - random_input_len=1, - random_output_len=1024, - other_server_args=["--pp-size", "2"], - need_warmup=True, - seed=42, - ) - - if is_in_ci(): - write_github_step_summary( - f"### test_pp_offline_throughput_default_decode\n" - f"Output throughput: {res['output_throughput']:.2f} token/s\n" - ) - self.assertGreater(res["output_throughput"], 6700) - - def test_pp_long_context_prefill(self): - res = run_bench_serving( - model="meta-llama/Llama-3.3-70B-Instruct", - num_prompts=4, - request_rate=float("inf"), - random_input_len=128000, - random_output_len=1, - dataset_name="random", - other_server_args=[ - "--quantization", - "fp8", - "--pp-size", - "2", - ] - + (["--mem-fraction-static", "0.7"] if is_in_amd_ci() else []), - need_warmup=False, - seed=42, - ) - - if is_in_ci(): - write_github_step_summary( - f"### test_pp_long_context_latency_prefill\n" - f"input_throughput: {res['input_throughput']:.2f} ms\n" - ) - if is_in_amd_ci(): - self.assertGreater(res["input_throughput"], 3000) - else: - self.assertGreater(res["input_throughput"], 4000) - - def test_score_api_latency_throughput(self): - """Test score API latency and throughput performance""" - res = run_score_benchmark( - model=DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE, - num_requests=1000, - batch_size=10, - other_server_args=[], - need_warmup=True, - ) - - if is_in_ci(): - write_github_step_summary( - f"### test_score_api_throughput\n" - f"Average latency: {res['avg_latency_ms']:.2f} ms\n" - f"P95 latency: {res['p95_latency_ms']:.2f} ms\n" - f"Score API throughput: {res['throughput']:.2f} req/s\n" - f"Successful requests: {res['successful_requests']}/{res['total_requests']}\n" - ) - - self.assertEqual(res["successful_requests"], res["total_requests"]) - self.assertLess(res["avg_latency_ms"], 48) - self.assertLess(res["p95_latency_ms"], 50) - self.assertGreater(res["throughput"], 20) - - def test_score_api_batch_scaling(self): - """Test score API performance with different batch sizes""" - batch_sizes = [10, 25, 50] - - for batch_size in batch_sizes: - res = run_score_benchmark( - model=DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE, - num_requests=500, - batch_size=batch_size, - ) - - if is_in_ci(): - write_github_step_summary( - f"### test_score_api_batch_scaling_size_{batch_size}\n" - f"Batch size: {batch_size}\n" - f"Average latency: {res['avg_latency_ms']:.2f} ms\n" - f"P95 latency: {res['p95_latency_ms']:.2f} ms\n" - f"Throughput: {res['throughput']:.2f} req/s\n" - f"Successful requests: {res['successful_requests']}/{res['total_requests']}\n" - ) - - self.assertEqual(res["successful_requests"], res["total_requests"]) - bounds = { - 10: (45, 50), - 25: (50, 60), - 50: (60, 65), - } - avg_latency_bound, p95_latency_bound = bounds.get(batch_size, (60, 65)) - self.assertLess(res["avg_latency_ms"], avg_latency_bound) - self.assertLess(res["p95_latency_ms"], p95_latency_bound) - - def test_embeddings_api_latency_throughput(self): - """Test embeddings API latency and throughput performance""" - res = run_embeddings_benchmark( - model=DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST, - num_requests=1000, - batch_size=1, - input_tokens=500, - other_server_args=[], - need_warmup=True, - ) - - if is_in_ci(): - write_github_step_summary( - f"### test_embeddings_api_throughput\n" - f"Average latency: {res['avg_latency_ms']:.2f} ms\n" - f"P95 latency: {res['p95_latency_ms']:.2f} ms\n" - f"Embeddings API throughput: {res['throughput']:.2f} req/s\n" - f"Successful requests: {res['successful_requests']}/{res['total_requests']}\n" - ) - - self.assertEqual(res["successful_requests"], res["total_requests"]) - # Bounds based on actual performance on 1xH100: avg=15ms, p95=15ms, throughput=67req/s - self.assertLess(res["avg_latency_ms"], 20) - self.assertLess(res["p95_latency_ms"], 25) - self.assertGreater(res["throughput"], 60) - - def test_embeddings_api_batch_scaling(self): - """Test embeddings API performance with different batch sizes""" - batch_sizes = [10, 25, 50] - - for batch_size in batch_sizes: - res = run_embeddings_benchmark( - model=DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST, - num_requests=500, - batch_size=batch_size, - input_tokens=500, - ) - - if is_in_ci(): - write_github_step_summary( - f"### test_embeddings_api_batch_scaling_size_{batch_size}\n" - f"Batch size: {batch_size}\n" - f"Average latency: {res['avg_latency_ms']:.2f} ms\n" - f"P95 latency: {res['p95_latency_ms']:.2f} ms\n" - f"Throughput: {res['throughput']:.2f} req/s\n" - f"Successful requests: {res['successful_requests']}/{res['total_requests']}\n" - ) - - self.assertEqual(res["successful_requests"], res["total_requests"]) - bounds = { - 10: (60, 65), - 25: (115, 120), - 50: (190, 195), - } - avg_latency_bound, p95_latency_bound = bounds.get(batch_size, (250, 250)) - self.assertLess(res["avg_latency_ms"], avg_latency_bound) - self.assertLess(res["p95_latency_ms"], p95_latency_bound) - - -if __name__ == "__main__": - unittest.main()