diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py
index c3e6d7899e2e..2724f612cee0 100644
--- a/tests/models/language/generation/test_hybrid.py
+++ b/tests/models/language/generation/test_hybrid.py
@@ -8,6 +8,7 @@
 from tests.models.registry import HF_EXAMPLE_MODELS
 from tests.utils import multi_gpu_test
 from vllm.engine.arg_utils import EngineArgs
+from vllm.platforms import current_platform
 from vllm.sampling_params import SamplingParams
 from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher

@@ -577,6 +578,10 @@ def test_apc_multiple_prompts_all_cached_outputs(
         model, max_model_len, tensor_parallel_size=tensor_parallel_size
     )
     vllm_runner_kwargs["mamba_ssm_cache_dtype"] = "float32"
+    # Reduce the effects of batch variance on ROCm since batch invariance is not
+    # yet supported. See: https://github.com/vllm-project/vllm/issues/27433
+    if current_platform.is_rocm():
+        vllm_runner_kwargs["max_num_seqs"] = 4
     vllm_outputs_no_cache, _ = _get_vLLM_output(
         vllm_runner, vllm_runner_kwargs, generated_prompts, max_tokens, num_logprobs
     )
diff --git a/vllm/model_executor/layers/mamba/mamba_mixer.py b/vllm/model_executor/layers/mamba/mamba_mixer.py
index 134e1dfd6283..adc643c385e1 100644
--- a/vllm/model_executor/layers/mamba/mamba_mixer.py
+++ b/vllm/model_executor/layers/mamba/mamba_mixer.py
@@ -214,6 +214,12 @@ def _ssm_transform(
             time_step = self.dt_layernorm(time_step.contiguous())
             B = self.b_layernorm(B.contiguous())
             C = self.c_layernorm(C.contiguous())
+
+        # ROCm: tensor from split is non-contiguous, causing incorrect
+        # GEMM results in dt_proj.
+        if current_platform.is_rocm():
+            time_step = time_step.contiguous()
+
         discrete_time_step = self.dt_proj(time_step)[0].transpose(-2, -1)
         return discrete_time_step, B, C
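
Note (reviewer illustration, not part of the diff above): the mamba_mixer.py change works around the fact that torch.split along the last dimension returns strided views rather than dense tensors, which is what the ROCm branch guards against before the dt_proj GEMM. A minimal standalone sketch, with made-up shapes, of that behavior:

# Standalone sketch (not part of the change above): torch.split along the
# last dim returns strided views of the parent tensor, so the chunks are
# generally non-contiguous. Shapes below are hypothetical, for illustration only.
import torch

ssm_parameters = torch.randn(2, 64, 96)  # hypothetical (batch, seq, proj) shape
time_step, B, C = torch.split(ssm_parameters, [32, 32, 32], dim=-1)

assert not time_step.is_contiguous()           # strided view into ssm_parameters
assert time_step.contiguous().is_contiguous()  # dense copy, safe input for the GEMM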