Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion tests/unittest/_torch/speculative/test_spec_gate.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import pytest
import torch
from utils.llm_data import llm_models_root
from utils.util import similar
from utils.util import similar, skip_blackwell

from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm._torch.speculative.speculation_gate import SpeculationGate
Expand All @@ -20,6 +20,7 @@
# It is set with acceptance window and acceptance threshold in spec_config.
# This test sets max_concurrency to a large value to prevent spec decode from being turned off due to the number of effective requests exceeding max_concurrency,
# so that we can focus solely on the turn-off effect from the SpeculationGate.
@skip_blackwell # TODO: Remove after fixing TRTLLM-GEN FMHA segfault on Blackwell. NVBugs: https://nvbugspro.nvidia.com/bug/5698292
@pytest.mark.high_cuda_memory
def test_spec_gate_e2e():
total_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
Expand Down