diff --git a/tests/unittest/_torch/speculative/test_spec_gate.py b/tests/unittest/_torch/speculative/test_spec_gate.py
index ad0d3d3190e..b1720f59233 100644
--- a/tests/unittest/_torch/speculative/test_spec_gate.py
+++ b/tests/unittest/_torch/speculative/test_spec_gate.py
@@ -5,7 +5,7 @@
 import pytest
 import torch
 from utils.llm_data import llm_models_root
-from utils.util import similar
+from utils.util import similar, skip_blackwell
 
 from tensorrt_llm import LLM, SamplingParams
 from tensorrt_llm._torch.speculative.speculation_gate import SpeculationGate
@@ -20,6 +20,7 @@
 # It is set with acceptance window and acceptance threshold in spec_config.
 # This test set the max_concurrency to a large value to prevent spec decode turned off due to number of effective requests > max_concurrency,
 # So that we can only focus on the turning off effect from the SpeculationGate.
+@skip_blackwell # TODO: Remove after fixing TRTLLM-GEN FMHA segfault on Blackwell. NVBugs: https://nvbugspro.nvidia.com/bug/5698292
 @pytest.mark.high_cuda_memory
 def test_spec_gate_e2e():
     total_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
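
For context, a minimal sketch of how a `skip_blackwell`-style marker can be built on top of `pytest.mark.skipif`. The helper name, the compute-capability check, and the skip reason below are assumptions for illustration only; the actual decorator shipped in `utils.util` may detect the architecture differently.

```python
# Sketch only: one possible way to define a skip_blackwell marker.
# The real utils.util implementation may differ (assumption for illustration).
import pytest
import torch


def _is_blackwell() -> bool:
    # Assumption: Blackwell GPUs report compute capability major == 10.
    major, _minor = torch.cuda.get_device_capability(0)
    return major == 10


skip_blackwell = pytest.mark.skipif(
    torch.cuda.is_available() and _is_blackwell(),
    reason="Skipped on Blackwell: TRTLLM-GEN FMHA segfault (NVBugs 5698292).",
)
```

Because the marker is evaluated at collection time, the `torch.cuda.is_available()` guard keeps collection from failing on CPU-only machines.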