diff --git a/.github/workflows/pr-test-amd-rocm720.yml b/.github/workflows/pr-test-amd-rocm720.yml
index 42bcad325a07..1c032c6d8712 100644
--- a/.github/workflows/pr-test-amd-rocm720.yml
+++ b/.github/workflows/pr-test-amd-rocm720.yml
@@ -321,6 +321,45 @@ jobs:
         run: |
           bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 13 --timeout-per-file 1800 --continue-on-error
 
+  stage-b-test-small-1-gpu-amd-nondeterministic:  # runs when explicitly targeted, or when main_package/sgl_kernel changed
+    needs: [check-changes]  # the if-condition below reads check-changes outputs
+    if: |
+      always() &&
+      (
+        (inputs.target_stage == 'stage-b-test-small-1-gpu-amd-nondeterministic') ||
+        (
+          !inputs.target_stage &&
+          (!failure() && !cancelled()) &&
+          ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
+        )
+      )
+    strategy:
+      fail-fast: false
+      matrix:
+        runner: [linux-mi325-gpu-1]  # one runner label; this suite is not partitioned
+    runs-on: ${{matrix.runner}}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}  # first non-empty value wins
+
+      - name: Ensure VRAM is clear
+        run: bash scripts/ensure_vram_clear.sh rocm
+
+      - name: Start CI container
+        run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build
+
+      - name: Run test
+        timeout-minutes: 30  # step limit; NOTE(review): --timeout-per-file 1800 may equal this — confirm unit
+        run: |
+          bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd-nondeterministic --timeout-per-file 1800 --continue-on-error
+
   stage-b-test-small-1-gpu-amd-mi35x:
     needs: [check-changes]
     if: |
@@ -801,6 +840,7 @@ jobs:
         stage-a-test-1-amd,
         jit-kernel-unit-test-amd,
         stage-b-test-small-1-gpu-amd,
+        stage-b-test-small-1-gpu-amd-nondeterministic,
         stage-b-test-small-1-gpu-amd-mi35x,
         stage-b-test-large-1-gpu-amd,
         stage-b-test-large-2-gpu-amd,
diff --git a/.github/workflows/pr-test-amd.yml b/.github/workflows/pr-test-amd.yml
index 454397c783b8..c19985433d47 100644
--- a/.github/workflows/pr-test-amd.yml
+++ b/.github/workflows/pr-test-amd.yml
@@ -318,6 +318,45 @@ jobs:
         run: |
           bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 14 --timeout-per-file 1800
 
+  stage-b-test-small-1-gpu-amd-nondeterministic:  # runs when explicitly targeted, or when main_package/sgl_kernel changed
+    needs: [check-changes, stage-a-test-1-amd]  # check-changes outputs feed the if-condition; also ordered after stage A
+    if: |
+      always() &&
+      (
+        (inputs.target_stage == 'stage-b-test-small-1-gpu-amd-nondeterministic') ||
+        (
+          !inputs.target_stage &&
+          (!failure() && !cancelled()) &&
+          ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
+        )
+      )
+    strategy:
+      fail-fast: false
+      matrix:
+        runner: [linux-mi325-gpu-1]  # one runner label; this suite is not partitioned
+    runs-on: ${{matrix.runner}}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}  # first non-empty value wins
+
+      - name: Ensure VRAM is clear
+        run: bash scripts/ensure_vram_clear.sh rocm
+
+      - name: Start CI container
+        run: bash scripts/ci/amd/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: bash scripts/ci/amd/amd_ci_install_dependency.sh
+
+      - name: Run test
+        timeout-minutes: 30  # step limit; NOTE(review): --timeout-per-file 1800 may equal this — confirm unit
+        run: |
+          bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd-nondeterministic --timeout-per-file 1800 --continue-on-error
+
   stage-b-test-small-1-gpu-amd-mi35x:
     needs: [check-changes, stage-a-test-1-amd]
     if: |
@@ -890,6 +929,7 @@ jobs:
         stage-a-test-1-amd,
         jit-kernel-unit-test-amd,
         stage-b-test-small-1-gpu-amd,
+        stage-b-test-small-1-gpu-amd-nondeterministic,
         stage-b-test-small-1-gpu-amd-mi35x,
         stage-b-test-large-1-gpu-amd,
         stage-b-test-large-2-gpu-amd,
diff --git a/python/sglang/srt/layers/attention/aiter_backend.py b/python/sglang/srt/layers/attention/aiter_backend.py
index 0062a55737fa..e86a178b1c90 100755
--- a/python/sglang/srt/layers/attention/aiter_backend.py
+++ b/python/sglang/srt/layers/attention/aiter_backend.py
@@ -139,11 +139,9 @@ def __init__(
         if self.use_mla:
             # For MLA models, get v_head_dim from model config
             self.v_head_dim = model_runner.model_config.v_head_dim
-        elif (
-            model_runner.hybrid_gdn_config is not None
-            or model_runner.kimi_linear_config is not None
-        ):
-            # For hybrid linear models, layer_id = 0 may not be full attention
+        elif hasattr(model_runner.token_to_kv_pool, "get_v_head_dim"):
+            # For hybrid models (Mamba+attention, GDN, Kimi linear),
+            # layer_id=0 may not be a full attention layer
             self.v_head_dim = model_runner.token_to_kv_pool.get_v_head_dim()
         else:
             self.v_head_dim = model_runner.token_to_kv_pool.get_value_buffer(0).shape[
diff --git a/python/sglang/srt/layers/attention/mamba/causal_conv1d.py b/python/sglang/srt/layers/attention/mamba/causal_conv1d.py
index 071a0ee6f749..e7beb75b0051 100644
--- a/python/sglang/srt/layers/attention/mamba/causal_conv1d.py
+++ b/python/sglang/srt/layers/attention/mamba/causal_conv1d.py
@@ -7,11 +7,23 @@
 from typing import Optional
 
 import torch
-from sgl_kernel import causal_conv1d_fwd
-from sgl_kernel import causal_conv1d_update as causal_conv1d_update_kernel
 
 from .causal_conv1d_triton import PAD_SLOT_ID
 
+try:
+    from sgl_kernel import causal_conv1d_fwd
+    from sgl_kernel import causal_conv1d_update as causal_conv1d_update_kernel
+
+    torch.ops.sgl_kernel.causal_conv1d_update  # probe; AttributeError -> Triton path
+    _USE_TRITON = False
+except (ImportError, AttributeError):  # sgl_kernel import or op lookup failed
+    from .causal_conv1d_triton import causal_conv1d_fn as _causal_conv1d_fn_triton
+    from .causal_conv1d_triton import (
+        causal_conv1d_update as _causal_conv1d_update_triton,
+    )
+
+    _USE_TRITON = True  # checked first in causal_conv1d_fn / causal_conv1d_update
+
 
 def causal_conv1d_fn(
     x: torch.Tensor,
@@ -54,6 +66,25 @@ def causal_conv1d_fn(
 
     out: (batch, dim, seqlen)
     """
+    if _USE_TRITON:  # sgl_kernel op unavailable: delegate to the Triton kernel
+        seq_lens_cpu = (
+            (query_start_loc[1:] - query_start_loc[:-1]).cpu().tolist()  # adjacent offset diffs, copied to host
+            if query_start_loc is not None
+            else [x.shape[-1]]  # no offsets given: single entry, the last dim of x
+        )
+        return _causal_conv1d_fn_triton(
+            x,
+            weight,
+            bias,
+            conv_states=conv_states,
+            query_start_loc=query_start_loc,
+            seq_lens_cpu=seq_lens_cpu,  # NOTE(review): TypeError if kwargs also has seq_lens_cpu — confirm callers
+            cache_indices=cache_indices,
+            has_initial_state=has_initial_state,
+            activation=activation,
+            pad_slot_id=pad_slot_id,
+            **kwargs,
+        )
     if activation not in [None, "silu", "swish"]:
         raise NotImplementedError("activation must be None, silu, or swish")
     if x.stride(-1) != 1:
@@ -106,6 +137,17 @@ def causal_conv1d_update(
             indices 0 and 3
     out: (batch, dim) or (batch, dim, seqlen)
     """
+    if _USE_TRITON:  # sgl_kernel op unavailable: delegate to the Triton kernel
+        return _causal_conv1d_update_triton(
+            x,
+            conv_state,
+            weight,
+            bias=bias,
+            activation=activation,
+            cache_seqlens=cache_seqlens,
+            conv_state_indices=conv_state_indices,
+            pad_slot_id=pad_slot_id,
+        )
     if activation not in [None, "silu", "swish"]:
         raise NotImplementedError(
             f"activation must be None, silu, or swish, actual: {activation}"
diff --git a/scripts/ci/utils/slash_command_handler.py b/scripts/ci/utils/slash_command_handler.py
index 9e7f98f87c7c..6d2973f066ee 100644
--- a/scripts/ci/utils/slash_command_handler.py
+++ b/scripts/ci/utils/slash_command_handler.py
@@ -274,6 +274,7 @@ def handle_rerun_stage(
         "sgl-kernel-unit-test-2-gpu-amd",
         "stage-a-test-1-amd",
         "stage-b-test-small-1-gpu-amd",
+        "stage-b-test-small-1-gpu-amd-nondeterministic",
         "stage-b-test-small-1-gpu-amd-mi35x",
         "stage-b-test-large-1-gpu-amd",
         "stage-b-test-large-2-gpu-amd",
diff --git a/test/registered/lora/test_multi_lora_backend.py b/test/registered/lora/test_multi_lora_backend.py
index f34b9e622aa6..9a7465d45dbe 100644
--- a/test/registered/lora/test_multi_lora_backend.py
+++ b/test/registered/lora/test_multi_lora_backend.py
@@ -26,7 +26,7 @@
 from sglang.test.test_utils import CustomTestCase, is_in_ci
 
 register_cuda_ci(est_time=100, suite="stage-b-test-large-1-gpu")
-register_amd_ci(est_time=100, suite="stage-b-test-small-1-gpu-amd")
+register_amd_ci(est_time=100, suite="stage-b-test-small-1-gpu-amd-nondeterministic")
 
 
 class TestMultiLoRABackend(CustomTestCase):
diff --git a/test/registered/openai_server/function_call/test_tool_choice.py b/test/registered/openai_server/function_call/test_tool_choice.py
index fd6039b3dcc9..d463e8bbf076 100644
--- a/test/registered/openai_server/function_call/test_tool_choice.py
+++ b/test/registered/openai_server/function_call/test_tool_choice.py
@@ -12,7 +12,7 @@
 
 import openai
 
-from sglang.srt.utils import is_hip, kill_process_tree
+from sglang.srt.utils import kill_process_tree
 from sglang.srt.utils.hf_transformers_utils import get_tokenizer
 from sglang.test.ci.ci_register import register_amd_ci, register_cuda_ci
 from sglang.test.test_utils import (
@@ -860,7 +860,6 @@ def test_complex_parameters_required_non_streaming(self):
 #         cls.tokenizer = get_tokenizer(cls.model)
 
 
-@unittest.skipIf(is_hip(), "Disabled for AMD")
 class TestToolChoiceLfm2(TestToolChoiceLlama32):
     """Test tool_choice functionality with LiquidAI LFM2 model"""
 
diff --git a/test/run_suite.py b/test/run_suite.py
index 3f2dcc44bb75..2f45522aa9b0 100644
--- a/test/run_suite.py
+++ b/test/run_suite.py
@@ -21,6 +21,7 @@
     HWBackend.AMD: [
         "stage-a-test-1-amd",
         "stage-b-test-small-1-gpu-amd",
+        "stage-b-test-small-1-gpu-amd-nondeterministic",
         "stage-b-test-small-1-gpu-amd-mi35x",
         "stage-b-test-large-8-gpu-35x-disaggregation-amd",
         "stage-b-test-large-1-gpu-amd",