diff --git a/.github/workflows/pr-test-amd.yml b/.github/workflows/pr-test-amd.yml index e1b7bc2e038a..5dd390059ea8 100644 --- a/.github/workflows/pr-test-amd.yml +++ b/.github/workflows/pr-test-amd.yml @@ -224,7 +224,7 @@ jobs: - name: Run test timeout-minutes: 30 run: | - bash scripts/ci/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 12 + bash scripts/ci/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 12 --timeout-per-file 1800 stage-b-test-small-1-gpu-amd-mi35x: needs: [check-changes, stage-a-test-1-amd] diff --git a/test/registered/attention/test_triton_attention_kernels.py b/test/registered/attention/test_triton_attention_kernels.py index fede8457b44e..9b73e84ce09d 100644 --- a/test/registered/attention/test_triton_attention_kernels.py +++ b/test/registered/attention/test_triton_attention_kernels.py @@ -20,15 +20,11 @@ ) from sglang.srt.utils import get_device from sglang.test.ci.ci_register import register_amd_ci, register_cuda_ci -from sglang.test.test_utils import CustomTestCase +from sglang.test.test_utils import CustomTestCase, is_in_amd_ci # Triton attention kernel unit tests (decode, extend, prefill) register_cuda_ci(est_time=30, suite="stage-b-test-small-1-gpu") -register_amd_ci( - est_time=30, - suite="stage-b-test-small-1-gpu-amd", - disabled="test was never enabled for AMD CI, needs validation", -) +register_amd_ci(est_time=30, suite="stage-b-test-small-1-gpu-amd") def extend_attention_fwd_torch( @@ -627,7 +623,10 @@ def _test_grouped_decode_attention_once(self, B, S, H_Q, H_KV, D, D_V): ) print(cos_sim.item()) self.assertTrue(cos_sim.item() > 0.99) - self.assertTrue(torch.allclose(o, o_grouped, atol=3e-2)) + if is_in_amd_ci(): + self.assertTrue(torch.allclose(o, o_grouped, atol=5e-2)) + else: + self.assertTrue(torch.allclose(o, o_grouped, atol=3e-2)) def test_grouped_decode_attention(self): seq_lens = [5, 100, 128, 500] @@ -764,11 +763,18 @@ def _test_extend_attention_unified_vs_regular_once(self, B, N_CTX, H_Q, H_KV, D) ) # Compare results - self.assertTrue( - torch.allclose(o_regular, o_unified, rtol=0.15, atol=0.15), - f"Unified kernel output differs from 2-stage kernel. " - f"Max diff: {(o_regular - o_unified).abs().max()}", - ) + if is_in_amd_ci(): + self.assertTrue( + torch.allclose(o_regular, o_unified, rtol=0.15, atol=0.17), + f"Unified kernel output differs from 2-stage kernel. " + f"Max diff: {(o_regular - o_unified).abs().max()}", + ) + else: + self.assertTrue( + torch.allclose(o_regular, o_unified, rtol=0.15, atol=0.15), + f"Unified kernel output differs from 2-stage kernel. " + f"Max diff: {(o_regular - o_unified).abs().max()}", + ) def test_extend_attention_unified_vs_regular(self): """Test unified kernel matches 2-stage kernel across different configs.""" diff --git a/test/registered/lora/test_lora_qwen3.py b/test/registered/lora/test_lora_qwen3.py index 912c60cf152b..98a82ce74e38 100644 --- a/test/registered/lora/test_lora_qwen3.py +++ b/test/registered/lora/test_lora_qwen3.py @@ -15,13 +15,18 @@ import multiprocessing as mp import unittest -from sglang.test.ci.ci_register import register_cuda_ci +from sglang.test.ci.ci_register import register_amd_ci, register_cuda_ci from sglang.test.lora_utils import ( LORA_MODELS_QWEN3, run_lora_multiple_batch_on_model_cases, ) register_cuda_ci(est_time=97, suite="nightly-1-gpu", nightly=True) +register_amd_ci( + est_time=30, + suite="stage-b-test-small-1-gpu-amd", + disabled="see https://github.com/sgl-project/sglang/issues/13107", +) from sglang.test.test_utils import CustomTestCase diff --git a/test/registered/moe/test_torch_compile_moe.py b/test/registered/moe/test_torch_compile_moe.py index bcfac3270377..d7ba691a80cf 100644 --- a/test/registered/moe/test_torch_compile_moe.py +++ b/test/registered/moe/test_torch_compile_moe.py @@ -1,6 +1,7 @@ -from sglang.test.ci.ci_register import register_cuda_ci +from sglang.test.ci.ci_register import register_amd_ci, register_cuda_ci register_cuda_ci(est_time=210, suite="stage-b-test-small-1-gpu") +register_amd_ci(est_time=1400, suite="stage-b-test-small-1-gpu-amd") import time import unittest diff --git a/test/srt/test_deepseek_r1_mxfp4_8gpu.py b/test/srt/test_deepseek_r1_mxfp4_8gpu.py index fc825b9b4cc4..986c5047648d 100644 --- a/test/srt/test_deepseek_r1_mxfp4_8gpu.py +++ b/test/srt/test_deepseek_r1_mxfp4_8gpu.py @@ -16,7 +16,7 @@ ) DEEPSEEK_R1_MODEL_PATH = "amd/DeepSeek-R1-MXFP4-Preview" -SERVER_LAUNCH_TIMEOUT = 1000 +SERVER_LAUNCH_TIMEOUT = 1200 class TestDeepseekR1MXFP4(CustomTestCase):