diff --git a/.github/workflows/pr-test-amd.yml b/.github/workflows/pr-test-amd.yml index 24fbdcd2b8dd..47db7182f9be 100644 --- a/.github/workflows/pr-test-amd.yml +++ b/.github/workflows/pr-test-amd.yml @@ -224,6 +224,45 @@ jobs: run: | bash scripts/ci/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu --auto-partition-id ${{ matrix.part }} --auto-partition-size 4 + stage-b-test-large-2-gpu-amd: + needs: [check-changes, stage-a-test-1-amd] + if: | + always() && + ( + (inputs.target_stage == 'stage-b-test-large-2-gpu-amd') || + ( + !inputs.target_stage && + (!failure() && !cancelled()) && + ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) + ) + ) + strategy: + fail-fast: false + matrix: + runner: [linux-mi325-gpu-2] + runs-on: ${{matrix.runner}} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} + + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + + - name: Start CI container + run: bash scripts/ci/amd_ci_start_container.sh + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: bash scripts/ci/amd_ci_install_dependency.sh + + - name: Run test + timeout-minutes: 30 + run: | + bash scripts/ci/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-2-gpu-amd + multimodal-gen-test-1-gpu-amd: needs: [check-changes] if: needs.check-changes.outputs.multimodal_gen == 'true' @@ -884,6 +923,7 @@ jobs: stage-a-test-1-amd, stage-b-test-small-1-gpu-amd, + stage-b-test-large-2-gpu-amd, unit-test-backend-1-gpu-amd, unit-test-backend-2-gpu-amd, unit-test-backend-8-gpu-amd, diff --git a/scripts/ci/slash_command_handler.py b/scripts/ci/slash_command_handler.py index 6d4558dd43b6..14ab0c245d03 100644 --- a/scripts/ci/slash_command_handler.py +++ b/scripts/ci/slash_command_handler.py @@ -178,6 +178,7 @@ def handle_rerun_stage( "sgl-kernel-unit-test-amd", "stage-a-test-1-amd", "stage-b-test-small-1-gpu-amd", + "stage-b-test-large-2-gpu-amd", "unit-test-backend-1-gpu-amd", "unit-test-backend-2-gpu-amd", "unit-test-backend-8-gpu-amd", diff --git a/test/srt/rl/test_fp32_lm_head.py b/test/registered/rl/test_fp32_lm_head.py similarity index 95% rename from test/srt/rl/test_fp32_lm_head.py rename to test/registered/rl/test_fp32_lm_head.py index cf6dd28398f1..44740eda998b 100644 --- a/test/srt/rl/test_fp32_lm_head.py +++ b/test/registered/rl/test_fp32_lm_head.py @@ -1,3 +1,8 @@ +from sglang.test.ci.ci_register import register_amd_ci, register_cuda_ci + +register_cuda_ci(est_time=9, suite="stage-b-test-small-1-gpu") +register_amd_ci(est_time=15, suite="stage-b-test-small-1-gpu") + import unittest from types import SimpleNamespace from unittest.mock import patch diff --git a/test/srt/test_patch_torch.py b/test/registered/rl/test_patch_torch.py similarity index 94% rename from test/srt/test_patch_torch.py rename to test/registered/rl/test_patch_torch.py index c1319dacb7ce..06bc16f15e3b 100644 --- a/test/srt/test_patch_torch.py +++ b/test/registered/rl/test_patch_torch.py @@ -1,3 +1,10 @@ +from sglang.test.ci.ci_register import register_amd_ci, register_cuda_ci + +register_cuda_ci(est_time=19, suite="stage-b-test-large-2-gpu") +register_amd_ci( + est_time=19, suite="stage-b-test-large-2-gpu-amd", disabled="see #11127" +) + import os import traceback import unittest diff --git a/test/srt/rl/test_return_routed_experts.py b/test/registered/rl/test_return_routed_experts.py similarity index 98% rename from test/srt/rl/test_return_routed_experts.py rename to test/registered/rl/test_return_routed_experts.py index da995cb17b0e..b2b7c26fc7d9 100644 --- a/test/srt/rl/test_return_routed_experts.py +++ b/test/registered/rl/test_return_routed_experts.py @@ -1,3 +1,7 @@ +from sglang.test.ci.ci_register import register_cuda_ci + +register_cuda_ci(est_time=180, suite="stage-c-test-large-4-gpu") + import asyncio import logging import unittest diff --git a/test/srt/rl/test_update_weights_from_disk.py b/test/registered/rl/test_update_weights_from_disk.py similarity index 98% rename from test/srt/rl/test_update_weights_from_disk.py rename to test/registered/rl/test_update_weights_from_disk.py index 0127b98dc0f3..dc9535525675 100644 --- a/test/srt/rl/test_update_weights_from_disk.py +++ b/test/registered/rl/test_update_weights_from_disk.py @@ -1,3 +1,8 @@ +from sglang.test.ci.ci_register import register_amd_ci, register_cuda_ci + +register_cuda_ci(est_time=210, suite="stage-b-test-small-1-gpu", disabled="see #14021") +register_amd_ci(est_time=210, suite="stage-b-test-small-1-gpu", disabled="see #14021") + import json import random import time diff --git a/test/srt/rl/test_update_weights_from_distributed.py b/test/registered/rl/test_update_weights_from_distributed.py similarity index 99% rename from test/srt/rl/test_update_weights_from_distributed.py rename to test/registered/rl/test_update_weights_from_distributed.py index e4834266006f..2ca428338a43 100644 --- a/test/srt/rl/test_update_weights_from_distributed.py +++ b/test/registered/rl/test_update_weights_from_distributed.py @@ -1,3 +1,8 @@ +from sglang.test.ci.ci_register import register_amd_ci, register_cuda_ci + +register_cuda_ci(est_time=103, suite="stage-b-test-large-2-gpu") +register_amd_ci(est_time=103, suite="stage-b-test-large-2-gpu-amd") + """Test distributed weight updates. This test suite simulates a distributed training environment to ensure diff --git a/test/srt/rl/test_update_weights_from_tensor.py b/test/registered/rl/test_update_weights_from_tensor.py similarity index 98% rename from test/srt/rl/test_update_weights_from_tensor.py rename to test/registered/rl/test_update_weights_from_tensor.py index 04dd01d96c4a..9faff8e26870 100644 --- a/test/srt/rl/test_update_weights_from_tensor.py +++ b/test/registered/rl/test_update_weights_from_tensor.py @@ -1,3 +1,7 @@ +from sglang.test.ci.ci_register import register_cuda_ci + +register_cuda_ci(est_time=195, suite="stage-b-test-small-1-gpu") + import gc import json import random diff --git a/test/run_suite.py b/test/run_suite.py index a4c73a011895..7d465f326130 100644 --- a/test/run_suite.py +++ b/test/run_suite.py @@ -18,7 +18,11 @@ # Per-commit test suites (run on every PR) PER_COMMIT_SUITES = { HWBackend.CPU: ["default", "stage-a-cpu-only"], - HWBackend.AMD: ["stage-a-test-1", "stage-b-test-small-1-gpu"], + HWBackend.AMD: [ + "stage-a-test-1", + "stage-b-test-small-1-gpu", + "stage-b-test-large-2-gpu-amd", + ], HWBackend.CUDA: [ "stage-a-test-1", "stage-b-test-small-1-gpu", diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index c82a53f98bc1..3fd61fff69a5 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -28,10 +28,6 @@ TestFile("openai_server/validation/test_openai_server_ignore_eos.py", 6), TestFile("openai_server/validation/test_request_length_validation.py", 38), TestFile("ops/test_repeat_interleave.py", 60), - # quant tests moved to test/registered/quant/ - TestFile("rl/test_fp32_lm_head.py", 9), - # TestFile("rl/test_update_weights_from_disk.py", 210), # Temporarily disabled, see https://github.com/sgl-project/sglang/pull/13998 - TestFile("rl/test_update_weights_from_tensor.py", 195), TestFile("dllm/test_llada2_mini.py", 520), TestFile("test_abort.py", 131), TestFile("test_chunked_prefill.py", 312), @@ -87,12 +83,10 @@ TestFile("hicache/test_hicache_storage_mooncake_backend.py", 300), TestFile("models/test_kimi_linear_models.py", 90), TestFile("models/test_nvidia_nemotron_nano_v2.py", 132), - TestFile("rl/test_update_weights_from_distributed.py", 103), TestFile("test_data_parallelism.py", 73), TestFile("test_disaggregation_basic.py", 400), TestFile("test_dp_attention.py", 350), TestFile("test_load_weights_from_remote_instance.py", 72), - TestFile("test_patch_torch.py", 19), ], "per-commit-4-gpu": [ TestFile("models/test_qwen3_next_models.py", 650), @@ -100,7 +94,6 @@ TestFile("test_multi_instance_release_memory_occupation.py", 64), TestFile("test_pp_single_node.py", 500), TestFile("test_epd_disaggregation.py", 150), - TestFile("rl/test_return_routed_experts.py", 300), ], "per-commit-8-gpu-h200": [ TestFile("test_deepseek_v3_basic.py", 275), @@ -149,9 +142,6 @@ "__not_in_ci__": [ TestFile("test_release_memory_occupation.py", 200), # Temporarily disabled TestFile("models/test_dummy_grok_models.py"), - TestFile( - "rl/test_update_weights_from_disk.py" - ), # Temporarily disabled, see https://github.com/sgl-project/sglang/pull/13998 TestFile("test_bench_one_batch.py"), TestFile("test_bench_serving.py"), TestFile("test_eval_accuracy_large.py"), @@ -195,9 +185,6 @@ TestFile("openai_server/validation/test_openai_server_ignore_eos.py", 85), TestFile("openai_server/validation/test_request_length_validation.py", 31), TestFile("ops/test_repeat_interleave.py", 75), - # quant tests moved to test/registered/quant/ - TestFile("rl/test_fp32_lm_head.py", 15), - # TestFile("rl/test_update_weights_from_disk.py", 210), # Temporarily disabled, see https://github.com/sgl-project/sglang/pull/13998 TestFile("rotary_embedding/test_mrope.py", 15), TestFile("test_abort.py", 51), TestFile("test_bench_typebaseddispatcher.py", 10), @@ -246,11 +233,8 @@ TestFile("test_gpt_oss_1gpu.py", 750), ], "per-commit-2-gpu-amd": [ - # TestFile("lora/test_lora_tp.py", 116), # Disabled temporarily, see https://github.com/sgl-project/sglang/issues/13107. Moved to test/registered/lora/ - TestFile("rl/test_update_weights_from_distributed.py", 103), TestFile("test_data_parallelism.py", 73), TestFile("test_load_weights_from_remote_instance.py", 72), - # TestFile("test_patch_torch.py", 19), # Disabled temporarily, see https://github.com/sgl-project/sglang/issues/11127 ], "per-commit-4-gpu-amd": [ TestFile("test_pp_single_node.py", 150),