From 5bd319985069b9ea4d286db0c54d958e6448c737 Mon Sep 17 00:00:00 2001 From: michaelzhang-ai Date: Sat, 21 Feb 2026 01:41:45 -0600 Subject: [PATCH 01/10] [AMD] Fix pre-existing AMD CI test failures - Relax LoRA multi-batch ROUGE-L tolerance from 1.0 to 0.95 to account for minor numerical non-determinism on ROCm - Fix aiter attention backend crashing on hybrid Mamba+attention models (e.g. LFM2-MoE): use get_v_head_dim() for hybrid KV pools instead of hardcoded get_value_buffer(0) which fails when layer 0 is not an attention layer - Skip TestToolChoiceLfm2Moe on AMD: sgl_kernel ROCm build lacks causal_conv1d_update op needed by Mamba layers --- python/sglang/srt/layers/attention/aiter_backend.py | 7 ++++++- python/sglang/test/lora_utils.py | 1 + .../openai_server/function_call/test_tool_choice.py | 3 +++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/python/sglang/srt/layers/attention/aiter_backend.py b/python/sglang/srt/layers/attention/aiter_backend.py index d851040cfc37..dd4879bba3df 100644 --- a/python/sglang/srt/layers/attention/aiter_backend.py +++ b/python/sglang/srt/layers/attention/aiter_backend.py @@ -123,7 +123,12 @@ def __init__( model_runner.model_config.num_attention_heads // get_attention_tp_size() ) self.head_dim = model_runner.model_config.head_dim - self.v_head_dim = model_runner.token_to_kv_pool.get_value_buffer(0).shape[-1] + if hasattr(model_runner.token_to_kv_pool, "get_v_head_dim"): + self.v_head_dim = model_runner.token_to_kv_pool.get_v_head_dim() + else: + self.v_head_dim = model_runner.token_to_kv_pool.get_value_buffer(0).shape[ + -1 + ] self.num_kv_head = model_runner.model_config.get_num_kv_heads( get_attention_tp_size() ) diff --git a/python/sglang/test/lora_utils.py b/python/sglang/test/lora_utils.py index b1181f248c5b..28eb6fff8068 100644 --- a/python/sglang/test/lora_utils.py +++ b/python/sglang/test/lora_utils.py @@ -95,6 +95,7 @@ def __post_init__(self): prefill_tolerance=3e-1, ), ], + rouge_l_tolerance=0.95, 
max_loras_per_batch=2, max_loaded_loras=4, ), diff --git a/test/registered/openai_server/function_call/test_tool_choice.py b/test/registered/openai_server/function_call/test_tool_choice.py index fd6039b3dcc9..d8aec8349cc2 100644 --- a/test/registered/openai_server/function_call/test_tool_choice.py +++ b/test/registered/openai_server/function_call/test_tool_choice.py @@ -889,6 +889,9 @@ def setUpClass(cls): cls.tokenizer = get_tokenizer(cls.model) +@unittest.skipIf( + is_hip(), "sgl_kernel ROCm build lacks causal_conv1d_update for Mamba layers" +) class TestToolChoiceLfm2Moe(TestToolChoiceLlama32): """Test tool_choice functionality with LiquidAI LFM2-MoE model""" From 7f950076853925136599fbb33d51a1b786d77652 Mon Sep 17 00:00:00 2001 From: bingxche Date: Wed, 25 Feb 2026 08:50:25 +0000 Subject: [PATCH 02/10] Fix LFM2 (Mamba) model on ROCm by falling back to Triton kernels when sgl_kernel causal_conv1d ops are unavailable --- .../layers/attention/mamba/causal_conv1d.py | 46 ++++++++++++++++++- .../function_call/test_tool_choice.py | 6 +-- 2 files changed, 45 insertions(+), 7 deletions(-) diff --git a/python/sglang/srt/layers/attention/mamba/causal_conv1d.py b/python/sglang/srt/layers/attention/mamba/causal_conv1d.py index 071a0ee6f749..e7beb75b0051 100644 --- a/python/sglang/srt/layers/attention/mamba/causal_conv1d.py +++ b/python/sglang/srt/layers/attention/mamba/causal_conv1d.py @@ -7,11 +7,23 @@ from typing import Optional import torch -from sgl_kernel import causal_conv1d_fwd -from sgl_kernel import causal_conv1d_update as causal_conv1d_update_kernel from .causal_conv1d_triton import PAD_SLOT_ID +try: + from sgl_kernel import causal_conv1d_fwd + from sgl_kernel import causal_conv1d_update as causal_conv1d_update_kernel + + torch.ops.sgl_kernel.causal_conv1d_update + _USE_TRITON = False +except (ImportError, AttributeError): + from .causal_conv1d_triton import causal_conv1d_fn as _causal_conv1d_fn_triton + from .causal_conv1d_triton import ( + causal_conv1d_update 
as _causal_conv1d_update_triton, + ) + + _USE_TRITON = True + def causal_conv1d_fn( x: torch.Tensor, @@ -54,6 +66,25 @@ def causal_conv1d_fn( out: (batch, dim, seqlen) """ + if _USE_TRITON: + seq_lens_cpu = ( + (query_start_loc[1:] - query_start_loc[:-1]).cpu().tolist() + if query_start_loc is not None + else [x.shape[-1]] + ) + return _causal_conv1d_fn_triton( + x, + weight, + bias, + conv_states=conv_states, + query_start_loc=query_start_loc, + seq_lens_cpu=seq_lens_cpu, + cache_indices=cache_indices, + has_initial_state=has_initial_state, + activation=activation, + pad_slot_id=pad_slot_id, + **kwargs, + ) if activation not in [None, "silu", "swish"]: raise NotImplementedError("activation must be None, silu, or swish") if x.stride(-1) != 1: @@ -106,6 +137,17 @@ def causal_conv1d_update( indices 0 and 3 out: (batch, dim) or (batch, dim, seqlen) """ + if _USE_TRITON: + return _causal_conv1d_update_triton( + x, + conv_state, + weight, + bias=bias, + activation=activation, + cache_seqlens=cache_seqlens, + conv_state_indices=conv_state_indices, + pad_slot_id=pad_slot_id, + ) if activation not in [None, "silu", "swish"]: raise NotImplementedError( f"activation must be None, silu, or swish, actual: {activation}" diff --git a/test/registered/openai_server/function_call/test_tool_choice.py b/test/registered/openai_server/function_call/test_tool_choice.py index d8aec8349cc2..d463e8bbf076 100644 --- a/test/registered/openai_server/function_call/test_tool_choice.py +++ b/test/registered/openai_server/function_call/test_tool_choice.py @@ -12,7 +12,7 @@ import openai -from sglang.srt.utils import is_hip, kill_process_tree +from sglang.srt.utils import kill_process_tree from sglang.srt.utils.hf_transformers_utils import get_tokenizer from sglang.test.ci.ci_register import register_amd_ci, register_cuda_ci from sglang.test.test_utils import ( @@ -860,7 +860,6 @@ def test_complex_parameters_required_non_streaming(self): # cls.tokenizer = get_tokenizer(cls.model) 
-@unittest.skipIf(is_hip(), "Disabled for AMD") class TestToolChoiceLfm2(TestToolChoiceLlama32): """Test tool_choice functionality with LiquidAI LFM2 model""" @@ -889,9 +888,6 @@ def setUpClass(cls): cls.tokenizer = get_tokenizer(cls.model) -@unittest.skipIf( - is_hip(), "sgl_kernel ROCm build lacks causal_conv1d_update for Mamba layers" -) class TestToolChoiceLfm2Moe(TestToolChoiceLlama32): """Test tool_choice functionality with LiquidAI LFM2-MoE model""" From 2722b9e366ea4f9f6f3153c8df0689ac37fbc39b Mon Sep 17 00:00:00 2001 From: yctseng0211 Date: Wed, 25 Feb 2026 03:59:18 -0600 Subject: [PATCH 03/10] fix(amd-ci): retry near-miss ROUGE-L flaky failures in lora test --- .../lora/test_multi_lora_backend.py | 37 ++++++++++++++++++- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/test/registered/lora/test_multi_lora_backend.py b/test/registered/lora/test_multi_lora_backend.py index f34b9e622aa6..4f7444674457 100644 --- a/test/registered/lora/test_multi_lora_backend.py +++ b/test/registered/lora/test_multi_lora_backend.py @@ -14,6 +14,7 @@ import multiprocessing as mp import os +import re import unittest from sglang.test.ci.ci_register import register_amd_ci, register_cuda_ci @@ -23,13 +24,45 @@ run_lora_batch_splitting_equivalence_test, run_lora_multiple_batch_on_model_cases, ) -from sglang.test.test_utils import CustomTestCase, is_in_ci +from sglang.test.test_utils import CustomTestCase, is_in_amd_ci, is_in_ci register_cuda_ci(est_time=100, suite="stage-b-test-large-1-gpu") -register_amd_ci(est_time=100, suite="stage-b-test-small-1-gpu-amd") +register_amd_ci(est_time=200, suite="stage-b-test-small-1-gpu-amd") class TestMultiLoRABackend(CustomTestCase): + def _callTestMethod(self, method): + from sglang.srt.environ import envs + from sglang.srt.utils.common import retry + + max_retry = envs.SGLANG_TEST_MAX_RETRY.get() + if max_retry is None: + max_retry = 1 if is_in_ci() else 0 + + if is_in_amd_ci(): + attempt_count = [0] + + def should_retry(e): + 
attempt_count[0] += 1 + match = re.search(r"ROUGE-L score ([\d.]+)", str(e)) + if match: + score = float(match.group(1)) + if score < 0.977: + return False + return True + return attempt_count[0] <= max_retry + + retry( + lambda: super(CustomTestCase, self)._callTestMethod(method), + max_retry=max_retry + 2, + should_retry=should_retry, + ) + else: + retry( + lambda: super(CustomTestCase, self)._callTestMethod(method), + max_retry=max_retry, + ) + def test_ci_lora_models_batch_splitting(self): run_lora_batch_splitting_equivalence_test(CI_MULTI_LORA_MODELS) From 0b31772e333120a7160a0ff8e923d0582be33313 Mon Sep 17 00:00:00 2001 From: YC Tseng Date: Wed, 25 Feb 2026 18:01:52 +0800 Subject: [PATCH 04/10] revert the threshold change in lora_utils.py --- python/sglang/test/lora_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/sglang/test/lora_utils.py b/python/sglang/test/lora_utils.py index 28eb6fff8068..b1181f248c5b 100644 --- a/python/sglang/test/lora_utils.py +++ b/python/sglang/test/lora_utils.py @@ -95,7 +95,6 @@ def __post_init__(self): prefill_tolerance=3e-1, ), ], - rouge_l_tolerance=0.95, max_loras_per_batch=2, max_loaded_loras=4, ), From 39d1391e7f2e668389f581651d9d24cbe9844636 Mon Sep 17 00:00:00 2001 From: yctseng0211 Date: Wed, 25 Feb 2026 09:31:03 -0600 Subject: [PATCH 05/10] increase the additional retry limit --- test/registered/lora/test_multi_lora_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/registered/lora/test_multi_lora_backend.py b/test/registered/lora/test_multi_lora_backend.py index 4f7444674457..7de87966a698 100644 --- a/test/registered/lora/test_multi_lora_backend.py +++ b/test/registered/lora/test_multi_lora_backend.py @@ -54,7 +54,7 @@ def should_retry(e): retry( lambda: super(CustomTestCase, self)._callTestMethod(method), - max_retry=max_retry + 2, + max_retry=max_retry + 3, should_retry=should_retry, ) else: From 23f8e58a863266a474a202ebf0727d195880280f Mon Sep 17 00:00:00 2001 From:
michaelzhang-ai Date: Wed, 25 Feb 2026 16:20:48 -0600 Subject: [PATCH 06/10] [AMD] Disable aiter RoPE in LoRA test for deterministic outputs The aiter RoPE backend has lower precision (as warned by apex), causing consistent single-token differences between SRT and HF reference outputs (ROUGE-L 0.9774 vs required 1.0). Disable it for the LoRA multi-batch test to produce exact matches. --- test/registered/lora/test_multi_lora_backend.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/registered/lora/test_multi_lora_backend.py b/test/registered/lora/test_multi_lora_backend.py index 7de87966a698..f6dc0ba5bb17 100644 --- a/test/registered/lora/test_multi_lora_backend.py +++ b/test/registered/lora/test_multi_lora_backend.py @@ -17,7 +17,11 @@ import re import unittest +from sglang.srt.utils import is_hip from sglang.test.ci.ci_register import register_amd_ci, register_cuda_ci + +if is_hip(): + os.environ.setdefault("USE_ROCM_AITER_ROPE_BACKEND", "0") from sglang.test.lora_utils import ( ALL_OTHER_MULTI_LORA_MODELS, CI_MULTI_LORA_MODELS, From 094f05daf32c27401d755fa6dbd39b00a26d5efe Mon Sep 17 00:00:00 2001 From: michaelzhang-ai Date: Wed, 25 Feb 2026 20:20:37 -0600 Subject: [PATCH 07/10] [AMD] Fix aiter backend v_head_dim for hybrid Mamba+attention models The existing check only covers hybrid_gdn_config and kimi_linear_config, but LFM2 models use HybridLinearKVPool without either config. Use hasattr(get_v_head_dim) to cover all hybrid KV pool types, matching triton_backend.py. 
--- python/sglang/srt/layers/attention/aiter_backend.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/python/sglang/srt/layers/attention/aiter_backend.py b/python/sglang/srt/layers/attention/aiter_backend.py index 0062a55737fa..e86a178b1c90 100755 --- a/python/sglang/srt/layers/attention/aiter_backend.py +++ b/python/sglang/srt/layers/attention/aiter_backend.py @@ -139,11 +139,9 @@ def __init__( if self.use_mla: # For MLA models, get v_head_dim from model config self.v_head_dim = model_runner.model_config.v_head_dim - elif ( - model_runner.hybrid_gdn_config is not None - or model_runner.kimi_linear_config is not None - ): - # For hybrid linear models, layer_id = 0 may not be full attention + elif hasattr(model_runner.token_to_kv_pool, "get_v_head_dim"): + # For hybrid models (Mamba+attention, GDN, Kimi linear), + # layer_id=0 may not be a full attention layer self.v_head_dim = model_runner.token_to_kv_pool.get_v_head_dim() else: self.v_head_dim = model_runner.token_to_kv_pool.get_value_buffer(0).shape[ From eba80754e7ef340820d3537bb53a86da684d18c8 Mon Sep 17 00:00:00 2001 From: yctseng0211 Date: Wed, 25 Feb 2026 22:36:42 -0600 Subject: [PATCH 08/10] set SGLANG_USE_AITER = 0 --- test/registered/lora/test_multi_lora_backend.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/registered/lora/test_multi_lora_backend.py b/test/registered/lora/test_multi_lora_backend.py index f6dc0ba5bb17..cc8efde6d953 100644 --- a/test/registered/lora/test_multi_lora_backend.py +++ b/test/registered/lora/test_multi_lora_backend.py @@ -21,7 +21,8 @@ from sglang.test.ci.ci_register import register_amd_ci, register_cuda_ci if is_hip(): - os.environ.setdefault("USE_ROCM_AITER_ROPE_BACKEND", "0") + os.environ.setdefault("SGLANG_USE_AITER", "0") + from sglang.test.lora_utils import ( ALL_OTHER_MULTI_LORA_MODELS, CI_MULTI_LORA_MODELS, From 46de4f4f2b9ed5e64dd23eca9fd3b89eabd9b2af Mon Sep 17 00:00:00 2001 From: yctseng0211 Date: Thu, 26 Feb 
2026 20:58:14 -0600 Subject: [PATCH 09/10] create new amd stage-b job --- .github/workflows/pr-test-amd-rocm720.yml | 40 +++++++++++++++++++ .github/workflows/pr-test-amd.yml | 40 +++++++++++++++++++ scripts/ci/utils/slash_command_handler.py | 1 + .../lora/test_multi_lora_backend.py | 2 +- test/run_suite.py | 1 + 5 files changed, 83 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pr-test-amd-rocm720.yml b/.github/workflows/pr-test-amd-rocm720.yml index 42bcad325a07..1c032c6d8712 100644 --- a/.github/workflows/pr-test-amd-rocm720.yml +++ b/.github/workflows/pr-test-amd-rocm720.yml @@ -321,6 +321,45 @@ jobs: run: | bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 13 --timeout-per-file 1800 --continue-on-error + stage-b-test-small-1-gpu-amd-nondeterministic: + needs: [check-changes] + if: | + always() && + ( + (inputs.target_stage == 'stage-b-test-small-1-gpu-amd-nondeterministic') || + ( + !inputs.target_stage && + (!failure() && !cancelled()) && + ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) + ) + ) + strategy: + fail-fast: false + matrix: + runner: [linux-mi325-gpu-1] + runs-on: ${{matrix.runner}} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} + + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + + - name: Start CI container + run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720 + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-aiter-build + + - name: Run test + timeout-minutes: 30 + run: | + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite 
stage-b-test-small-1-gpu-amd-nondeterministic --timeout-per-file 1800 --continue-on-error + stage-b-test-small-1-gpu-amd-mi35x: needs: [check-changes] if: | @@ -801,6 +840,7 @@ jobs: stage-a-test-1-amd, jit-kernel-unit-test-amd, stage-b-test-small-1-gpu-amd, + stage-b-test-small-1-gpu-amd-nondeterministic, stage-b-test-small-1-gpu-amd-mi35x, stage-b-test-large-1-gpu-amd, stage-b-test-large-2-gpu-amd, diff --git a/.github/workflows/pr-test-amd.yml b/.github/workflows/pr-test-amd.yml index 454397c783b8..c19985433d47 100644 --- a/.github/workflows/pr-test-amd.yml +++ b/.github/workflows/pr-test-amd.yml @@ -318,6 +318,45 @@ jobs: run: | bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 14 --timeout-per-file 1800 + stage-b-test-small-1-gpu-amd-nondeterministic: + needs: [check-changes, stage-a-test-1-amd] + if: | + always() && + ( + (inputs.target_stage == 'stage-b-test-small-1-gpu-amd-nondeterministic') || + ( + !inputs.target_stage && + (!failure() && !cancelled()) && + ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) + ) + ) + strategy: + fail-fast: false + matrix: + runner: [linux-mi325-gpu-1] + runs-on: ${{matrix.runner}} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} + + - name: Ensure VRAM is clear + run: bash scripts/ensure_vram_clear.sh rocm + + - name: Start CI container + run: bash scripts/ci/amd/amd_ci_start_container.sh + env: + GITHUB_WORKSPACE: ${{ github.workspace }} + + - name: Install dependencies + run: bash scripts/ci/amd/amd_ci_install_dependency.sh + + - name: Run test + timeout-minutes: 30 + run: | + bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd-nondeterministic --timeout-per-file 1800 
--continue-on-error + stage-b-test-small-1-gpu-amd-mi35x: needs: [check-changes, stage-a-test-1-amd] if: | @@ -890,6 +929,7 @@ jobs: stage-a-test-1-amd, jit-kernel-unit-test-amd, stage-b-test-small-1-gpu-amd, + stage-b-test-small-1-gpu-amd-nondeterministic, stage-b-test-small-1-gpu-amd-mi35x, stage-b-test-large-1-gpu-amd, stage-b-test-large-2-gpu-amd, diff --git a/scripts/ci/utils/slash_command_handler.py b/scripts/ci/utils/slash_command_handler.py index 9e7f98f87c7c..6d2973f066ee 100644 --- a/scripts/ci/utils/slash_command_handler.py +++ b/scripts/ci/utils/slash_command_handler.py @@ -274,6 +274,7 @@ def handle_rerun_stage( "sgl-kernel-unit-test-2-gpu-amd", "stage-a-test-1-amd", "stage-b-test-small-1-gpu-amd", + "stage-b-test-small-1-gpu-amd-nondeterministic", "stage-b-test-small-1-gpu-amd-mi35x", "stage-b-test-large-1-gpu-amd", "stage-b-test-large-2-gpu-amd", diff --git a/test/registered/lora/test_multi_lora_backend.py b/test/registered/lora/test_multi_lora_backend.py index cc8efde6d953..971b9882578f 100644 --- a/test/registered/lora/test_multi_lora_backend.py +++ b/test/registered/lora/test_multi_lora_backend.py @@ -32,7 +32,7 @@ from sglang.test.test_utils import CustomTestCase, is_in_amd_ci, is_in_ci register_cuda_ci(est_time=100, suite="stage-b-test-large-1-gpu") -register_amd_ci(est_time=200, suite="stage-b-test-small-1-gpu-amd") +register_amd_ci(est_time=200, suite="stage-b-test-small-1-gpu-amd-nondeterministic") class TestMultiLoRABackend(CustomTestCase): diff --git a/test/run_suite.py b/test/run_suite.py index 3f2dcc44bb75..2f45522aa9b0 100644 --- a/test/run_suite.py +++ b/test/run_suite.py @@ -21,6 +21,7 @@ HWBackend.AMD: [ "stage-a-test-1-amd", "stage-b-test-small-1-gpu-amd", + "stage-b-test-small-1-gpu-amd-nondeterministic", "stage-b-test-small-1-gpu-amd-mi35x", "stage-b-test-large-8-gpu-35x-disaggregation-amd", "stage-b-test-large-1-gpu-amd", From c5a0e6364815292646ab28001db747f9c59874a8 Mon Sep 17 00:00:00 2001 From: YC Tseng Date: Fri, 27 Feb 2026 
11:27:44 +0800 Subject: [PATCH 10/10] Update test_multi_lora_backend.py for AMD CI --- .../lora/test_multi_lora_backend.py | 42 +------------------ 1 file changed, 2 insertions(+), 40 deletions(-) diff --git a/test/registered/lora/test_multi_lora_backend.py b/test/registered/lora/test_multi_lora_backend.py index 971b9882578f..9a7465d45dbe 100644 --- a/test/registered/lora/test_multi_lora_backend.py +++ b/test/registered/lora/test_multi_lora_backend.py @@ -14,60 +14,22 @@ import multiprocessing as mp import os -import re import unittest -from sglang.srt.utils import is_hip from sglang.test.ci.ci_register import register_amd_ci, register_cuda_ci - -if is_hip(): - os.environ.setdefault("SGLANG_USE_AITER", "0") - from sglang.test.lora_utils import ( ALL_OTHER_MULTI_LORA_MODELS, CI_MULTI_LORA_MODELS, run_lora_batch_splitting_equivalence_test, run_lora_multiple_batch_on_model_cases, ) -from sglang.test.test_utils import CustomTestCase, is_in_amd_ci, is_in_ci +from sglang.test.test_utils import CustomTestCase, is_in_ci register_cuda_ci(est_time=100, suite="stage-b-test-large-1-gpu") -register_amd_ci(est_time=200, suite="stage-b-test-small-1-gpu-amd-nondeterministic") +register_amd_ci(est_time=100, suite="stage-b-test-small-1-gpu-amd-nondeterministic") class TestMultiLoRABackend(CustomTestCase): - def _callTestMethod(self, method): - from sglang.srt.environ import envs - from sglang.srt.utils.common import retry - - max_retry = envs.SGLANG_TEST_MAX_RETRY.get() - if max_retry is None: - max_retry = 1 if is_in_ci() else 0 - - if is_in_amd_ci(): - attempt_count = [0] - - def should_retry(e): - attempt_count[0] += 1 - match = re.search(r"ROUGE-L score ([\d.]+)", str(e)) - if match: - score = float(match.group(1)) - if score < 0.977: - return False - return True - return attempt_count[0] <= max_retry - - retry( - lambda: super(CustomTestCase, self)._callTestMethod(method), - max_retry=max_retry + 3, - should_retry=should_retry, - ) - else: - retry( - lambda: 
super(CustomTestCase, self)._callTestMethod(method), - max_retry=max_retry, - ) - def test_ci_lora_models_batch_splitting(self): run_lora_batch_splitting_equivalence_test(CI_MULTI_LORA_MODELS)