sgl-project · HaiShaw · Jan 12, 2026 · Jan 6, 2026 · Jan 11, 2026 · Jan 11, 2026
@@ -224,7 +224,7 @@ jobs:
       - name: Run test
         timeout-minutes: 30
         run: |
-          bash scripts/ci/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 12
+          bash scripts/ci/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 12 --timeout-per-file 1800
 
   stage-b-test-small-1-gpu-amd-mi35x:
     needs: [check-changes, stage-a-test-1-amd]

diff --git a/test/registered/attention/test_triton_attention_kernels.py b/test/registered/attention/test_triton_attention_kernels.py
@@ -20,15 +20,11 @@
 )
 from sglang.srt.utils import get_device
 from sglang.test.ci.ci_register import register_amd_ci, register_cuda_ci
-from sglang.test.test_utils import CustomTestCase
+from sglang.test.test_utils import CustomTestCase, is_in_amd_ci
 
 # Triton attention kernel unit tests (decode, extend, prefill)
 register_cuda_ci(est_time=30, suite="stage-b-test-small-1-gpu")
-register_amd_ci(
-    est_time=30,
-    suite="stage-b-test-small-1-gpu-amd",
-    disabled="test was never enabled for AMD CI, needs validation",
-)
+register_amd_ci(est_time=30, suite="stage-b-test-small-1-gpu-amd")
 
 
 def extend_attention_fwd_torch(
@@ -627,7 +623,10 @@ def _test_grouped_decode_attention_once(self, B, S, H_Q, H_KV, D, D_V):
         )
         print(cos_sim.item())
         self.assertTrue(cos_sim.item() > 0.99)
-        self.assertTrue(torch.allclose(o, o_grouped, atol=3e-2))
+        if is_in_amd_ci():
+            self.assertTrue(torch.allclose(o, o_grouped, atol=5e-2))
+        else:
+            self.assertTrue(torch.allclose(o, o_grouped, atol=3e-2))
 
     def test_grouped_decode_attention(self):
         seq_lens = [5, 100, 128, 500]
@@ -764,11 +763,18 @@ def _test_extend_attention_unified_vs_regular_once(self, B, N_CTX, H_Q, H_KV, D)
         )
 
         # Compare results
-        self.assertTrue(
-            torch.allclose(o_regular, o_unified, rtol=0.15, atol=0.15),
-            f"Unified kernel output differs from 2-stage kernel. "
-            f"Max diff: {(o_regular - o_unified).abs().max()}",
-        )
+        if is_in_amd_ci():
+            self.assertTrue(
+                torch.allclose(o_regular, o_unified, rtol=0.15, atol=0.17),
+                f"Unified kernel output differs from 2-stage kernel. "
+                f"Max diff: {(o_regular - o_unified).abs().max()}",
+            )
+        else:
+            self.assertTrue(
+                torch.allclose(o_regular, o_unified, rtol=0.15, atol=0.15),
+                f"Unified kernel output differs from 2-stage kernel. "
+                f"Max diff: {(o_regular - o_unified).abs().max()}",
+            )
 
     def test_extend_attention_unified_vs_regular(self):
         """Test unified kernel matches 2-stage kernel across different configs."""

diff --git a/test/registered/lora/test_lora_qwen3.py b/test/registered/lora/test_lora_qwen3.py
@@ -15,13 +15,18 @@
 import multiprocessing as mp
 import unittest
 
-from sglang.test.ci.ci_register import register_cuda_ci
+from sglang.test.ci.ci_register import register_amd_ci, register_cuda_ci
 from sglang.test.lora_utils import (
     LORA_MODELS_QWEN3,
     run_lora_multiple_batch_on_model_cases,
 )
 
 register_cuda_ci(est_time=97, suite="nightly-1-gpu", nightly=True)
+register_amd_ci(
+    est_time=30,
+    suite="stage-b-test-small-1-gpu-amd",
+    disabled="see https://github.com/sgl-project/sglang/issues/13107",
+)
 
 from sglang.test.test_utils import CustomTestCase
 

diff --git a/test/registered/moe/test_torch_compile_moe.py b/test/registered/moe/test_torch_compile_moe.py
@@ -1,6 +1,7 @@
-from sglang.test.ci.ci_register import register_cuda_ci
+from sglang.test.ci.ci_register import register_amd_ci, register_cuda_ci
 
 register_cuda_ci(est_time=210, suite="stage-b-test-small-1-gpu")
+register_amd_ci(est_time=1400, suite="stage-b-test-small-1-gpu-amd")
 
 import time
 import unittest

diff --git a/test/srt/test_deepseek_r1_mxfp4_8gpu.py b/test/srt/test_deepseek_r1_mxfp4_8gpu.py
@@ -16,7 +16,7 @@
 )
 
 DEEPSEEK_R1_MODEL_PATH = "amd/DeepSeek-R1-MXFP4-Preview"
-SERVER_LAUNCH_TIMEOUT = 1000
+SERVER_LAUNCH_TIMEOUT = 1200
 
 
 class TestDeepseekR1MXFP4(CustomTestCase):