diff --git a/test/srt/models/test_nvidia_nemotron_nano_v2.py b/test/srt/models/test_nvidia_nemotron_nano_v2.py index e29be59d3338..ab8c2abdc096 100644 --- a/test/srt/models/test_nvidia_nemotron_nano_v2.py +++ b/test/srt/models/test_nvidia_nemotron_nano_v2.py @@ -24,6 +24,10 @@ class TestNvidiaNemotronNanoV2NVFP4(GSM8KMixin, CustomTestCase): other_args = ["--max-mamba-cache-size", "256"] +@unittest.skip( + "STANDALONE speculative decoding does not yet support target and draft models " + "with different hidden sizes (Nemotron-9B: 4480, Llama-3.2-1B: 2048)" +) class TestNvidiaNemotronNanoV2SpeculativeDecoding(GSM8KMixin, CustomTestCase): accuracy = 0.87 model = "nvidia/NVIDIA-Nemotron-Nano-9B-v2" @@ -49,6 +53,10 @@ class TestNvidiaNemotronNanoV2SpeculativeDecoding(GSM8KMixin, CustomTestCase): ] +@unittest.skip( + "STANDALONE speculative decoding does not yet support target and draft models " + "with different hidden sizes (Nemotron-9B: 4480, Llama-3.2-1B: 2048)" +) class TestNvidiaNemotronNanoV2SpeculativeDecodingBF16Cache(GSM8KMixin, CustomTestCase): accuracy = 0.87 model = "nvidia/NVIDIA-Nemotron-Nano-9B-v2"