diff --git a/.github/workflows/pr-test-amd.yml b/.github/workflows/pr-test-amd.yml
index 52499eeaab7..75f2fcbd28f 100644
--- a/.github/workflows/pr-test-amd.yml
+++ b/.github/workflows/pr-test-amd.yml
@@ -290,12 +290,46 @@ jobs:
         run: |
           bash scripts/amd_ci_exec.sh python3 run_suite.py --suite per-commit-8-gpu-amd --timeout-per-file 3600
 
+      - name: Run CustomAllReduce test
+        timeout-minutes: 10
+        run: |
+          CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/amd_ci_exec.sh python3 -m unittest test_custom_allreduce.TestCustomAllReduce
+
+  unit-test-sgl-kernel-amd:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
+      github.event.pull_request.draft == false
+    strategy:
+      fail-fast: false
+      matrix:
+        runner: [linux-mi300-gpu-1, linux-mi325-gpu-1]
+    runs-on: ${{matrix.runner}}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Start CI container
+        run: bash scripts/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: |
+          bash scripts/amd_ci_install_dependency.sh
+
+      - name: Run test
+        timeout-minutes: 10
+        run: |
+          docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_moe_align.py
+          docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_moe_topk_softmax.py
+          docker exec -w /sglang-checkout/sgl-kernel/tests/speculative ci_sglang python3 -m pytest test_eagle_utils.py
+
   finish:
     if: always()
     needs: [
       accuracy-test-1-gpu-amd, mla-test-1-gpu-amd, bench-test-2-gpu-amd,
       accuracy-test-2-gpu-amd, performance-test-1-gpu-part-1-amd, performance-test-1-gpu-part-2-amd,
-      unit-test-backend-1-gpu-amd, unit-test-backend-2-gpu-amd, unit-test-backend-8-gpu-amd
+      unit-test-backend-1-gpu-amd, unit-test-backend-2-gpu-amd, unit-test-backend-8-gpu-amd,
+      unit-test-sgl-kernel-amd
     ]
     runs-on: ubuntu-latest
     steps:
diff --git a/scripts/amd_ci_install_dependency.sh b/scripts/amd_ci_install_dependency.sh
index 24ab5b77a8c..3c8061351b3 100755
--- a/scripts/amd_ci_install_dependency.sh
+++ b/scripts/amd_ci_install_dependency.sh
@@ -19,3 +19,4 @@ mkdir -p dummy-grok && wget https://sharkpublic.blob.core.windows.net/sharkpubli
 docker cp ./dummy-grok ci_sglang:/
 
 docker exec ci_sglang pip install huggingface_hub[hf_xet]
+docker exec ci_sglang pip install pytest
diff --git a/test/srt/test_custom_allreduce.py b/test/srt/test_custom_allreduce.py
index 38600aeabd8..462ac578e0e 100644
--- a/test/srt/test_custom_allreduce.py
+++ b/test/srt/test_custom_allreduce.py
@@ -56,22 +56,30 @@ def multi_process_parallel(
 
 
 class TestCustomAllReduce(CustomTestCase):
+    TEST_SIZES = [
+        512,
+        4096,
+        32768,
+        262144,
+        2097152,
+        16777216,
+        33554432,
+    ]  # 512B...32MB
+    WORLD_SIZES = [2, 4, 6, 8]
+    TEST_LOOP = 10
+
     @classmethod
     def setUpClass(cls):
-        random.seed(42)
-        # 512B to 32MB
-        cls.test_sizes = [512, 4096, 32768, 262144, 2097152, 16777216, 33554432]
-        cls.world_sizes = [2, 4, 6, 8]
-        cls.test_loop = 10
+        random.seed(42)  # keep the deterministic seed
 
     def test_graph_allreduce(self):
-        for world_size in self.world_sizes:
+        for world_size in self.WORLD_SIZES:
             if world_size > torch.cuda.device_count():
                 continue
             multi_process_parallel(world_size, self, self.graph_allreduce)
 
     def test_eager_allreduce(self):
-        for world_size in self.world_sizes:
+        for world_size in self.WORLD_SIZES:
             if world_size > torch.cuda.device_count():
                 continue
             multi_process_parallel(world_size, self, self.eager_allreduce)
@@ -102,9 +110,9 @@ def graph_allreduce(self, world_size, rank, distributed_init_port):
         torch.cuda.synchronize()
         del data
 
-        for sz in self.test_sizes:
+        for sz in self.TEST_SIZES:
             for dtype in [torch.float32, torch.float16, torch.bfloat16]:
-                for _ in range(self.test_loop):
+                for _ in range(self.TEST_LOOP):
                     with graph_capture() as graph_capture_context:
                         # use integers so result matches NCCL exactly
                         inp1 = torch.randint(
@@ -151,9 +159,9 @@ def eager_allreduce(self, world_size, rank, distributed_init_port):
         initialize_model_parallel(tensor_model_parallel_size=world_size)
         group = get_tensor_model_parallel_group().device_group
 
-        for sz in self.test_sizes:
+        for sz in self.TEST_SIZES:
             for dtype in [torch.float32, torch.float16, torch.bfloat16]:
-                for _ in range(self.test_loop):
+                for _ in range(self.TEST_LOOP):
                     inp1 = torch.randint(
                         1, 16, (sz,), dtype=dtype, device=torch.cuda.current_device()
                     )
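
A minimal local-reproduction sketch, assuming the ci_sglang container has already been started via scripts/amd_ci_start_container.sh and provisioned via scripts/amd_ci_install_dependency.sh (which now installs pytest); the commands mirror the new CI steps above:

    # sgl-kernel unit tests, as run by the new unit-test-sgl-kernel-amd job
    docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_moe_align.py
    docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_moe_topk_softmax.py
    docker exec -w /sglang-checkout/sgl-kernel/tests/speculative ci_sglang python3 -m pytest test_eagle_utils.py

    # CustomAllReduce test across all eight GPUs, as run by the 8-GPU backend job
    CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/amd_ci_exec.sh python3 -m unittest test_custom_allreduce.TestCustomAllReduce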