diff --git a/.github/workflows/pr-test-amd-rocm720.yml b/.github/workflows/pr-test-amd-rocm720.yml
index 42bcad325a07..1c032c6d8712 100644
--- a/.github/workflows/pr-test-amd-rocm720.yml
+++ b/.github/workflows/pr-test-amd-rocm720.yml
@@ -321,6 +321,45 @@ jobs:
         run: |
           bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 13 --timeout-per-file 1800 --continue-on-error
 
+  stage-b-test-small-1-gpu-amd-nondeterministic:
+    needs: [check-changes]
+    if: |
+      always() &&
+      (
+        (inputs.target_stage == 'stage-b-test-small-1-gpu-amd-nondeterministic') ||
+        (
+          !inputs.target_stage &&
+          (!failure() && !cancelled()) &&
+          ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
+        )
+      )
+    strategy:
+      fail-fast: false
+      matrix:
+        runner: [linux-mi325-gpu-1]
+    runs-on: ${{matrix.runner}}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
+
+      - name: Ensure VRAM is clear
+        run: bash scripts/ensure_vram_clear.sh rocm
+
+      - name: Start CI container
+        run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build
+
+      - name: Run test
+        timeout-minutes: 30
+        run: |
+          bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd-nondeterministic --timeout-per-file 1800 --continue-on-error
+
   stage-b-test-small-1-gpu-amd-mi35x:
     needs: [check-changes]
     if: |
@@ -801,6 +840,7 @@ jobs:
         stage-a-test-1-amd,
         jit-kernel-unit-test-amd,
         stage-b-test-small-1-gpu-amd,
+        stage-b-test-small-1-gpu-amd-nondeterministic,
         stage-b-test-small-1-gpu-amd-mi35x,
         stage-b-test-large-1-gpu-amd,
         stage-b-test-large-2-gpu-amd,
diff --git a/.github/workflows/pr-test-amd.yml b/.github/workflows/pr-test-amd.yml
index 454397c783b8..c19985433d47 100644
--- a/.github/workflows/pr-test-amd.yml
+++ b/.github/workflows/pr-test-amd.yml
@@ -318,6 +318,45 @@ jobs:
         run: |
           bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 14 --timeout-per-file 1800
 
+  stage-b-test-small-1-gpu-amd-nondeterministic:
+    needs: [check-changes, stage-a-test-1-amd]
+    if: |
+      always() &&
+      (
+        (inputs.target_stage == 'stage-b-test-small-1-gpu-amd-nondeterministic') ||
+        (
+          !inputs.target_stage &&
+          (!failure() && !cancelled()) &&
+          ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
+        )
+      )
+    strategy:
+      fail-fast: false
+      matrix:
+        runner: [linux-mi325-gpu-1]
+    runs-on: ${{matrix.runner}}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
+
+      - name: Ensure VRAM is clear
+        run: bash scripts/ensure_vram_clear.sh rocm
+
+      - name: Start CI container
+        run: bash scripts/ci/amd/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: bash scripts/ci/amd/amd_ci_install_dependency.sh
+
+      - name: Run test
+        timeout-minutes: 30
+        run: |
+          bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd-nondeterministic --timeout-per-file 1800 --continue-on-error
+
   stage-b-test-small-1-gpu-amd-mi35x:
     needs: [check-changes, stage-a-test-1-amd]
     if: |
@@ -890,6 +929,7 @@ jobs:
         stage-a-test-1-amd,
         jit-kernel-unit-test-amd,
         stage-b-test-small-1-gpu-amd,
+        stage-b-test-small-1-gpu-amd-nondeterministic,
         stage-b-test-small-1-gpu-amd-mi35x,
         stage-b-test-large-1-gpu-amd,
         stage-b-test-large-2-gpu-amd,
diff --git a/python/sglang/srt/layers/attention/aiter_backend.py b/python/sglang/srt/layers/attention/aiter_backend.py
index 0062a55737fa..e86a178b1c90 100755
--- a/python/sglang/srt/layers/attention/aiter_backend.py
+++ b/python/sglang/srt/layers/attention/aiter_backend.py
@@ -139,11 +139,9 @@ def __init__(
         if self.use_mla:
             # For MLA models, get v_head_dim from model config
             self.v_head_dim = model_runner.model_config.v_head_dim
-        elif (
-            model_runner.hybrid_gdn_config is not None
-            or model_runner.kimi_linear_config is not None
-        ):
-            # For hybrid linear models, layer_id = 0 may not be full attention
+        elif hasattr(model_runner.token_to_kv_pool, "get_v_head_dim"):
+            # For hybrid models (Mamba+attention, GDN, Kimi linear),
+            # layer_id=0 may not be a full attention layer
             self.v_head_dim = model_runner.token_to_kv_pool.get_v_head_dim()
         else:
             self.v_head_dim = model_runner.token_to_kv_pool.get_value_buffer(0).shape[
diff --git a/python/sglang/srt/layers/attention/mamba/causal_conv1d.py b/python/sglang/srt/layers/attention/mamba/causal_conv1d.py
index 071a0ee6f749..e7beb75b0051 100644
--- a/python/sglang/srt/layers/attention/mamba/causal_conv1d.py
+++ b/python/sglang/srt/layers/attention/mamba/causal_conv1d.py
@@ -7,11 +7,23 @@
 from typing import Optional
 
 import torch
-from sgl_kernel import causal_conv1d_fwd
-from sgl_kernel import causal_conv1d_update as causal_conv1d_update_kernel
 
 from .causal_conv1d_triton import PAD_SLOT_ID
 
+try:
+    from sgl_kernel import causal_conv1d_fwd
+    from sgl_kernel import causal_conv1d_update as causal_conv1d_update_kernel
+
+    torch.ops.sgl_kernel.causal_conv1d_update
+    _USE_TRITON = False
+except (ImportError, AttributeError):
+    from .causal_conv1d_triton import causal_conv1d_fn as _causal_conv1d_fn_triton
+    from .causal_conv1d_triton import (
+        causal_conv1d_update as _causal_conv1d_update_triton,
+    )
+
+    _USE_TRITON = True
+
 
 def causal_conv1d_fn(
     x: torch.Tensor,
@@ -54,6 +66,25 @@ def causal_conv1d_fn(
 
     out: (batch, dim, seqlen)
     """
+    if _USE_TRITON:
+        seq_lens_cpu = (
+            (query_start_loc[1:] - query_start_loc[:-1]).cpu().tolist()
+            if query_start_loc is not None
+            else [x.shape[-1]]
+        )
+        return _causal_conv1d_fn_triton(
+            x,
+            weight,
+            bias,
+            conv_states=conv_states,
+            query_start_loc=query_start_loc,
+            seq_lens_cpu=seq_lens_cpu,
+            cache_indices=cache_indices,
+            has_initial_state=has_initial_state,
+            activation=activation,
+            pad_slot_id=pad_slot_id,
+            **kwargs,
+        )
     if activation not in [None, "silu", "swish"]:
         raise NotImplementedError("activation must be None, silu, or swish")
     if x.stride(-1) != 1:
@@ -106,6 +137,17 @@ def causal_conv1d_update(
             indices 0 and 3
     out: (batch, dim) or (batch, dim, seqlen)
     """
+    if _USE_TRITON:
+        return _causal_conv1d_update_triton(
+            x,
+            conv_state,
+            weight,
+            bias=bias,
+            activation=activation,
+            cache_seqlens=cache_seqlens,
+            conv_state_indices=conv_state_indices,
+            pad_slot_id=pad_slot_id,
+        )
     if activation not in [None, "silu", "swish"]:
         raise NotImplementedError(
             f"activation must be None, silu, or swish, actual: {activation}"
diff --git a/scripts/ci/utils/slash_command_handler.py b/scripts/ci/utils/slash_command_handler.py
index 9e7f98f87c7c..6d2973f066ee 100644
--- a/scripts/ci/utils/slash_command_handler.py
+++ b/scripts/ci/utils/slash_command_handler.py
@@ -274,6 +274,7 @@ def handle_rerun_stage(
             "sgl-kernel-unit-test-2-gpu-amd",
             "stage-a-test-1-amd",
             "stage-b-test-small-1-gpu-amd",
+            "stage-b-test-small-1-gpu-amd-nondeterministic",
             "stage-b-test-small-1-gpu-amd-mi35x",
             "stage-b-test-large-1-gpu-amd",
             "stage-b-test-large-2-gpu-amd",
diff --git a/test/registered/lora/test_multi_lora_backend.py b/test/registered/lora/test_multi_lora_backend.py
index f34b9e622aa6..9a7465d45dbe 100644
--- a/test/registered/lora/test_multi_lora_backend.py
+++ b/test/registered/lora/test_multi_lora_backend.py
@@ -26,7 +26,7 @@
 from sglang.test.test_utils import CustomTestCase, is_in_ci
 
 register_cuda_ci(est_time=100, suite="stage-b-test-large-1-gpu")
-register_amd_ci(est_time=100, suite="stage-b-test-small-1-gpu-amd")
+register_amd_ci(est_time=100, suite="stage-b-test-small-1-gpu-amd-nondeterministic")
 
 
 class TestMultiLoRABackend(CustomTestCase):
diff --git a/test/registered/openai_server/function_call/test_tool_choice.py b/test/registered/openai_server/function_call/test_tool_choice.py
index fd6039b3dcc9..d463e8bbf076 100644
--- a/test/registered/openai_server/function_call/test_tool_choice.py
+++ b/test/registered/openai_server/function_call/test_tool_choice.py
@@ -12,7 +12,7 @@
 
 import openai
 
-from sglang.srt.utils import is_hip, kill_process_tree
+from sglang.srt.utils import kill_process_tree
 from sglang.srt.utils.hf_transformers_utils import get_tokenizer
 from sglang.test.ci.ci_register import register_amd_ci, register_cuda_ci
 from sglang.test.test_utils import (
@@ -860,7 +860,6 @@ def test_complex_parameters_required_non_streaming(self):
 #         cls.tokenizer = get_tokenizer(cls.model)
 
 
-@unittest.skipIf(is_hip(), "Disabled for AMD")
 class TestToolChoiceLfm2(TestToolChoiceLlama32):
     """Test tool_choice functionality with LiquidAI LFM2 model"""
 
diff --git a/test/run_suite.py b/test/run_suite.py
index 3f2dcc44bb75..2f45522aa9b0 100644
--- a/test/run_suite.py
+++ b/test/run_suite.py
@@ -21,6 +21,7 @@
     HWBackend.AMD: [
         "stage-a-test-1-amd",
         "stage-b-test-small-1-gpu-amd",
+        "stage-b-test-small-1-gpu-amd-nondeterministic",
         "stage-b-test-small-1-gpu-amd-mi35x",
         "stage-b-test-large-8-gpu-35x-disaggregation-amd",
         "stage-b-test-large-1-gpu-amd",