diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index a3f5e78de3cc..ef81c713ec65 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -274,19 +274,14 @@ jobs: const pollIntervalSeconds = 120; // 2 minutes to reduce GH API calls const maxAttempts = (maxWaitMinutes * 60) / pollIntervalSeconds; - // Stage-b jobs to wait for (all stage-b tests including performance and accuracy) + // Stage-b jobs to wait for const stageBJobs = [ { prefix: 'stage-b-test-small-1-gpu', expectedCount: 8 }, // partitions 0-7 - { prefix: 'stage-b-test-large-1-gpu', expectedCount: 12 }, // partitions 0-11 - { prefix: 'stage-b-test-large-2-gpu', expectedCount: 2 }, // partitions 0-1 + { prefix: 'stage-b-test-large-1-gpu', expectedCount: 14 }, // partitions 0-13 + { prefix: 'stage-b-test-large-2-gpu', expectedCount: 4 }, // partitions 0-3 { prefix: 'stage-b-test-4-gpu-b200', expectedCount: 1 }, - { prefix: 'stage-b-test-small-1-gpu-performance', expectedCount: 1 }, - { prefix: 'stage-b-test-large-1-gpu-performance', expectedCount: 2 }, // partitions 0-1 - { prefix: 'stage-b-test-large-2-gpu-performance', expectedCount: 1 }, - { prefix: 'stage-b-test-small-1-gpu-accuracy', expectedCount: 1 }, - { prefix: 'stage-b-test-large-2-gpu-accuracy', expectedCount: 1 } ]; - const totalExpectedJobs = stageBJobs.reduce((sum, j) => sum + j.expectedCount, 0); // 29 total + const totalExpectedJobs = stageBJobs.reduce((sum, j) => sum + j.expectedCount, 0); // 27 total // Helper to match job names exactly (prefix alone or prefix + " (N)" for matrix jobs) const matchesPrefix = (jobName, prefix) => { @@ -841,6 +836,9 @@ jobs: run: | source /etc/profile.d/sglang-ci.sh CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/cuda/ci_install_dependency.sh + git clone https://github.com/merrymercy/human-eval.git + cd human-eval + pip install -e . - name: Run test timeout-minutes: 30 @@ -874,7 +872,7 @@ jobs: fail-fast: false max-parallel: ${{ fromJson(needs.check-changes.outputs.max_parallel) }} matrix: - partition: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] + partition: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13] steps: - name: Checkout code uses: actions/checkout@v4 @@ -902,7 +900,7 @@ jobs: if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then CONTINUE_ON_ERROR_FLAG="--continue-on-error" fi - python3 run_suite.py --hw cuda --suite stage-b-test-large-1-gpu --auto-partition-id ${{ matrix.partition }} --auto-partition-size 12 $CONTINUE_ON_ERROR_FLAG + python3 run_suite.py --hw cuda --suite stage-b-test-large-1-gpu --auto-partition-id ${{ matrix.partition }} --auto-partition-size 14 --timeout-per-file 1800 $CONTINUE_ON_ERROR_FLAG stage-b-test-large-2-gpu: needs: [check-changes, call-gate, wait-for-stage-a, sgl-kernel-build-wheels] @@ -923,245 +921,7 @@ jobs: strategy: fail-fast: false matrix: - partition: [0, 1] - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} - - - name: Download artifacts - if: needs.check-changes.outputs.sgl_kernel == 'true' - uses: actions/download-artifact@v4 - with: - path: sgl-kernel/dist/ - merge-multiple: true - pattern: wheel-python3.10-cuda12.9 - - - name: Install dependencies - timeout-minutes: 10 - run: | - CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/cuda/ci_install_dependency.sh - - - name: Run test - timeout-minutes: 30 - run: | - cd test/ - CONTINUE_ON_ERROR_FLAG="" - if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then - CONTINUE_ON_ERROR_FLAG="--continue-on-error" - fi - python3 run_suite.py --hw cuda --suite stage-b-test-large-2-gpu --auto-partition-id ${{ matrix.partition }} --auto-partition-size 2 $CONTINUE_ON_ERROR_FLAG - - stage-b-test-small-1-gpu-performance: - needs: [check-changes, call-gate, wait-for-stage-a, sgl-kernel-build-wheels] - if: | - always() && - ( - (inputs.target_stage == 'stage-b-test-small-1-gpu-performance') || - ( - !inputs.target_stage && - ((github.event_name == 'schedule' || inputs.test_parallel_dispatch == true) || (!failure() && !cancelled())) && - ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) - ) - ) - runs-on: 1-gpu-5090 - timeout-minutes: 240 - env: - RUNNER_LABELS: 1-gpu-5090 - IS_BLACKWELL: "1" - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} - - - name: Download artifacts - if: needs.check-changes.outputs.sgl_kernel == 'true' - uses: actions/download-artifact@v4 - with: - path: sgl-kernel/dist/ - merge-multiple: true - pattern: wheel-python3.10-cuda12.9 - - - name: Install dependencies - timeout-minutes: 10 - run: | - source /etc/profile.d/sglang-ci.sh - CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/cuda/ci_install_dependency.sh - - - name: Run test - timeout-minutes: 30 - run: | - source /etc/profile.d/sglang-ci.sh - cd test/ - CONTINUE_ON_ERROR_FLAG="" - if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then - CONTINUE_ON_ERROR_FLAG="--continue-on-error" - fi - python3 run_suite.py --hw cuda --suite stage-b-test-small-1-gpu-performance $CONTINUE_ON_ERROR_FLAG - - stage-b-test-large-1-gpu-performance: - needs: [check-changes, call-gate, wait-for-stage-a, sgl-kernel-build-wheels] - if: | - always() && - ( - (inputs.target_stage == 'stage-b-test-large-1-gpu-performance') || - ( - !inputs.target_stage && - ((github.event_name == 'schedule' || inputs.test_parallel_dispatch == true) || (!failure() && !cancelled())) && - ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) - ) - ) - runs-on: 1-gpu-runner - timeout-minutes: 240 - env: - RUNNER_LABELS: 1-gpu-runner - strategy: - fail-fast: false - matrix: - partition: [0, 1] - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} - - - name: Download artifacts - if: needs.check-changes.outputs.sgl_kernel == 'true' - uses: actions/download-artifact@v4 - with: - path: sgl-kernel/dist/ - merge-multiple: true - pattern: wheel-python3.10-cuda12.9 - - - name: Install dependencies - timeout-minutes: 10 - run: | - CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/cuda/ci_install_dependency.sh - - - name: Run test - timeout-minutes: 40 - run: | - cd test/ - CONTINUE_ON_ERROR_FLAG="" - if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then - CONTINUE_ON_ERROR_FLAG="--continue-on-error" - fi - python3 run_suite.py --hw cuda --suite stage-b-test-large-1-gpu-performance --auto-partition-id ${{ matrix.partition }} --auto-partition-size 2 --timeout-per-file 1800 $CONTINUE_ON_ERROR_FLAG - - stage-b-test-large-2-gpu-performance: - needs: [check-changes, call-gate, wait-for-stage-a, sgl-kernel-build-wheels] - if: | - always() && - ( - (inputs.target_stage == 'stage-b-test-large-2-gpu-performance') || - ( - !inputs.target_stage && - ((github.event_name == 'schedule' || inputs.test_parallel_dispatch == true) || (!failure() && !cancelled())) && - ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) - ) - ) - runs-on: 2-gpu-runner - timeout-minutes: 240 - env: - RUNNER_LABELS: 2-gpu-runner - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} - - - name: Download artifacts - if: needs.check-changes.outputs.sgl_kernel == 'true' - uses: actions/download-artifact@v4 - with: - path: sgl-kernel/dist/ - merge-multiple: true - pattern: wheel-python3.10-cuda12.9 - - - name: Install dependencies - timeout-minutes: 10 - run: | - CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/cuda/ci_install_dependency.sh - - - name: Run test - timeout-minutes: 30 - run: | - cd test/ - CONTINUE_ON_ERROR_FLAG="" - if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then - CONTINUE_ON_ERROR_FLAG="--continue-on-error" - fi - python3 run_suite.py --hw cuda --suite stage-b-test-large-2-gpu-performance $CONTINUE_ON_ERROR_FLAG - - stage-b-test-small-1-gpu-accuracy: - needs: [check-changes, call-gate, wait-for-stage-a, sgl-kernel-build-wheels] - if: | - always() && - ( - (inputs.target_stage == 'stage-b-test-small-1-gpu-accuracy') || - ( - !inputs.target_stage && - ((github.event_name == 'schedule' || inputs.test_parallel_dispatch == true) || (!failure() && !cancelled())) && - ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) - ) - ) - runs-on: 1-gpu-5090 - timeout-minutes: 240 - env: - RUNNER_LABELS: 1-gpu-5090 - IS_BLACKWELL: "1" - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} - - - name: Download artifacts - if: needs.check-changes.outputs.sgl_kernel == 'true' - uses: actions/download-artifact@v4 - with: - path: sgl-kernel/dist/ - merge-multiple: true - pattern: wheel-python3.10-cuda12.9 - - - name: Install dependencies - timeout-minutes: 10 - run: | - source /etc/profile.d/sglang-ci.sh - CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/cuda/ci_install_dependency.sh - git clone https://github.com/merrymercy/human-eval.git - cd human-eval - pip install -e . - - - name: Run test - timeout-minutes: 25 - run: | - source /etc/profile.d/sglang-ci.sh - cd test/ - CONTINUE_ON_ERROR_FLAG="" - if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then - CONTINUE_ON_ERROR_FLAG="--continue-on-error" - fi - python3 run_suite.py --hw cuda --suite stage-b-test-small-1-gpu-accuracy $CONTINUE_ON_ERROR_FLAG - - stage-b-test-large-2-gpu-accuracy: - needs: [check-changes, call-gate, wait-for-stage-a, sgl-kernel-build-wheels] - if: | - always() && - ( - (inputs.target_stage == 'stage-b-test-large-2-gpu-accuracy') || - ( - !inputs.target_stage && - ((github.event_name == 'schedule' || inputs.test_parallel_dispatch == true) || (!failure() && !cancelled())) && - ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) - ) - ) - runs-on: 2-gpu-runner - timeout-minutes: 240 - env: - RUNNER_LABELS: 2-gpu-runner + partition: [0, 1, 2, 3] steps: - name: Checkout code uses: actions/checkout@v4 @@ -1185,14 +945,14 @@ jobs: pip install -e . - name: Run test - timeout-minutes: 25 + timeout-minutes: 30 run: | cd test/ CONTINUE_ON_ERROR_FLAG="" if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then CONTINUE_ON_ERROR_FLAG="--continue-on-error" fi - python3 run_suite.py --hw cuda --suite stage-b-test-large-2-gpu-accuracy $CONTINUE_ON_ERROR_FLAG + python3 run_suite.py --hw cuda --suite stage-b-test-large-2-gpu --auto-partition-id ${{ matrix.partition }} --auto-partition-size 4 $CONTINUE_ON_ERROR_FLAG stage-b-test-4-gpu-b200: needs: [check-changes, call-gate, wait-for-stage-a, sgl-kernel-build-wheels] @@ -1829,11 +1589,6 @@ jobs: stage-b-test-small-1-gpu, stage-b-test-large-1-gpu, stage-b-test-large-2-gpu, - stage-b-test-small-1-gpu-performance, - stage-b-test-large-1-gpu-performance, - stage-b-test-large-2-gpu-performance, - stage-b-test-small-1-gpu-accuracy, - stage-b-test-large-2-gpu-accuracy, stage-c-test-large-4-gpu, stage-b-test-4-gpu-b200, unit-test-backend-4-gpu, diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index 2f4dac608f7d..fe8c3f9c36de 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -1495,6 +1495,10 @@ def run_bench_one_batch(model, other_args): command += ["--model-path", model] process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + prefill_latency = None + decode_throughput = None + decode_latency = None + try: stdout, stderr = process.communicate() output = stdout.decode(errors="backslashreplace") @@ -1517,6 +1521,12 @@ def run_bench_one_batch(model, other_args): finally: kill_process_tree(process.pid) + if prefill_latency is None or decode_throughput is None or decode_latency is None: + raise RuntimeError( + f"Failed to parse benchmark output. " + f"prefill_latency={prefill_latency}, decode_throughput={decode_throughput}, decode_latency={decode_latency}" + ) + return prefill_latency, decode_throughput, decode_latency diff --git a/test/registered/eval/test_eval_accuracy_large.py b/test/registered/eval/test_eval_accuracy_large.py index 901280b5047e..486e45afaab4 100644 --- a/test/registered/eval/test_eval_accuracy_large.py +++ b/test/registered/eval/test_eval_accuracy_large.py @@ -19,8 +19,8 @@ write_github_step_summary, ) -register_cuda_ci(est_time=300, suite="stage-b-test-small-1-gpu-accuracy") -register_amd_ci(est_time=300, suite="stage-b-test-small-1-gpu-accuracy-amd") +register_cuda_ci(est_time=300, suite="stage-b-test-small-1-gpu") +register_amd_ci(est_time=300, suite="stage-b-test-small-1-gpu-amd") class TestEvalAccuracyLarge(CustomTestCase): diff --git a/test/registered/eval/test_moe_eval_accuracy_large.py b/test/registered/eval/test_moe_eval_accuracy_large.py index f6afaf090559..ae90f768e6c8 100644 --- a/test/registered/eval/test_moe_eval_accuracy_large.py +++ b/test/registered/eval/test_moe_eval_accuracy_large.py @@ -19,8 +19,8 @@ write_github_step_summary, ) -register_cuda_ci(est_time=500, suite="stage-b-test-large-2-gpu-accuracy") -register_amd_ci(est_time=500, suite="stage-b-test-large-2-gpu-accuracy-amd") +register_cuda_ci(est_time=500, suite="stage-b-test-large-2-gpu") +register_amd_ci(est_time=500, suite="stage-b-test-large-2-gpu-amd") class TestMoEEvalAccuracyLarge(CustomTestCase): diff --git a/test/registered/perf/test_bench_one_batch_1gpu.py b/test/registered/perf/test_bench_one_batch_1gpu.py index cdb0d6a3460d..5ab4b202c19c 100644 --- a/test/registered/perf/test_bench_one_batch_1gpu.py +++ b/test/registered/perf/test_bench_one_batch_1gpu.py @@ -11,8 +11,8 @@ write_github_step_summary, ) -register_cuda_ci(est_time=120, suite="stage-b-test-large-1-gpu-performance") -register_amd_ci(est_time=120, suite="stage-b-test-large-1-gpu-performance-amd") +register_cuda_ci(est_time=120, suite="stage-b-test-large-1-gpu") +register_amd_ci(est_time=120, suite="stage-b-test-large-1-gpu-amd") class TestBenchOneBatch1GPU(CustomTestCase): diff --git a/test/registered/perf/test_bench_one_batch_2gpu.py b/test/registered/perf/test_bench_one_batch_2gpu.py index d6f1c2689972..b36e775dbf93 100644 --- a/test/registered/perf/test_bench_one_batch_2gpu.py +++ b/test/registered/perf/test_bench_one_batch_2gpu.py @@ -11,8 +11,8 @@ write_github_step_summary, ) -register_cuda_ci(est_time=180, suite="stage-b-test-large-2-gpu-performance") -register_amd_ci(est_time=630, suite="stage-b-test-large-2-gpu-performance-amd") +register_cuda_ci(est_time=180, suite="stage-b-test-large-2-gpu") +register_amd_ci(est_time=630, suite="stage-b-test-large-2-gpu-amd") class TestBenchOneBatch2GPU(CustomTestCase): diff --git a/test/registered/perf/test_bench_serving_1gpu_large.py b/test/registered/perf/test_bench_serving_1gpu_large.py index 2671bde7e26b..6dd8c42498bc 100644 --- a/test/registered/perf/test_bench_serving_1gpu_large.py +++ b/test/registered/perf/test_bench_serving_1gpu_large.py @@ -17,8 +17,8 @@ write_github_step_summary, ) -register_cuda_ci(est_time=300, suite="stage-b-test-large-1-gpu-performance") -register_amd_ci(est_time=300, suite="stage-b-test-large-1-gpu-performance-amd") +register_cuda_ci(est_time=300, suite="stage-b-test-large-1-gpu") +register_amd_ci(est_time=300, suite="stage-b-test-large-1-gpu-amd") class TestBenchServing1GPULarge(CustomTestCase): diff --git a/test/registered/perf/test_bench_serving_1gpu_part1.py b/test/registered/perf/test_bench_serving_1gpu_part1.py index aee04425f05c..12629c9d1c5a 100644 --- a/test/registered/perf/test_bench_serving_1gpu_part1.py +++ b/test/registered/perf/test_bench_serving_1gpu_part1.py @@ -19,8 +19,8 @@ write_github_step_summary, ) -register_cuda_ci(est_time=1000, suite="stage-b-test-large-1-gpu-performance") -register_amd_ci(est_time=1100, suite="stage-b-test-large-1-gpu-performance-amd") +register_cuda_ci(est_time=1000, suite="stage-b-test-large-1-gpu") +register_amd_ci(est_time=1100, suite="stage-b-test-large-1-gpu-amd") class TestBenchServing1GPUPart1(CustomTestCase): diff --git a/test/registered/perf/test_bench_serving_1gpu_part2.py b/test/registered/perf/test_bench_serving_1gpu_part2.py index 24a02b3fde57..6730e2e6733d 100644 --- a/test/registered/perf/test_bench_serving_1gpu_part2.py +++ b/test/registered/perf/test_bench_serving_1gpu_part2.py @@ -19,8 +19,8 @@ write_github_step_summary, ) -register_cuda_ci(est_time=900, suite="stage-b-test-large-1-gpu-performance") -register_amd_ci(est_time=900, suite="stage-b-test-large-1-gpu-performance-amd") +register_cuda_ci(est_time=900, suite="stage-b-test-large-1-gpu") +register_amd_ci(est_time=900, suite="stage-b-test-large-1-gpu-amd") class TestBenchServing1GPUPart2(CustomTestCase): diff --git a/test/registered/perf/test_bench_serving_2gpu.py b/test/registered/perf/test_bench_serving_2gpu.py index 4c4bb8214b90..3c8cc216aacb 100644 --- a/test/registered/perf/test_bench_serving_2gpu.py +++ b/test/registered/perf/test_bench_serving_2gpu.py @@ -14,8 +14,8 @@ write_github_step_summary, ) -register_cuda_ci(est_time=600, suite="stage-b-test-large-2-gpu-performance") -register_amd_ci(est_time=600, suite="stage-b-test-large-2-gpu-performance-amd") +register_cuda_ci(est_time=600, suite="stage-b-test-large-2-gpu") +register_amd_ci(est_time=600, suite="stage-b-test-large-2-gpu-amd") class TestBenchServing2GPU(CustomTestCase): diff --git a/test/registered/perf/test_vlm_perf_5090.py b/test/registered/perf/test_vlm_perf_5090.py index 414329c9562c..389e4dc85ad4 100644 --- a/test/registered/perf/test_vlm_perf_5090.py +++ b/test/registered/perf/test_vlm_perf_5090.py @@ -13,8 +13,8 @@ write_github_step_summary, ) -register_cuda_ci(est_time=600, suite="stage-b-test-small-1-gpu-performance") -register_amd_ci(est_time=500, suite="stage-b-test-small-1-gpu-performance-amd") +register_cuda_ci(est_time=600, suite="stage-b-test-small-1-gpu") +register_amd_ci(est_time=500, suite="stage-b-test-small-1-gpu-amd") class TestVLMPerf5090(CustomTestCase): diff --git a/test/run_suite.py b/test/run_suite.py index 6c08f9ca99a9..45cd6299f1c6 100644 --- a/test/run_suite.py +++ b/test/run_suite.py @@ -22,6 +22,7 @@ "stage-a-test-1-amd", "stage-b-test-small-1-gpu-amd", "stage-b-test-small-1-gpu-amd-mi35x", + "stage-b-test-large-1-gpu-amd", "stage-b-test-large-2-gpu-amd", "stage-b-test-small-1-gpu-performance-amd", "stage-b-test-large-1-gpu-performance-amd", @@ -33,12 +34,8 @@ HWBackend.CUDA: [ "stage-a-test-1", "stage-b-test-small-1-gpu", - "stage-b-test-small-1-gpu-performance", - "stage-b-test-small-1-gpu-accuracy", "stage-b-test-large-1-gpu", - "stage-b-test-large-1-gpu-performance", "stage-b-test-large-2-gpu", - "stage-b-test-large-2-gpu-performance", "stage-c-test-large-4-gpu", "stage-b-test-4-gpu-b200", "stage-c-test-large-4-gpu-b200",