
Commit a911347

pamelap-nvidia authored and dominicshanshan committed
[https://nvbugs/5448442][fix] Skip trtllm moe backend for sm120 (NVIDIA#7010)
Signed-off-by: Pamela <[email protected]>
Signed-off-by: Wangshanshan <[email protected]>
1 parent df1a725 commit a911347
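
For context, the "SM version" in this commit is the CUDA compute capability written as major*10 + minor, so SM 120/121 corresponds to compute capability 12.0/12.1 (e.g. Blackwell-class RTX parts). Below is a minimal sketch of checking whether the current GPU falls under the new guard, using only standard PyTorch; treating major*10 + minor as equivalent to get_sm_version()'s return value is an assumption about that helper, not something shown in this diff.

import torch

if torch.cuda.is_available():
    # torch.cuda.get_device_capability() returns (major, minor), e.g. (12, 0).
    major, minor = torch.cuda.get_device_capability()
    sm_version = major * 10 + minor  # assumed to match get_sm_version()'s convention
    print(f"SM version: {sm_version}")
    if sm_version >= 120:
        print("This GPU would be rejected by the new TRTLLMGenFusedMoE guard.")
else:
    print("No CUDA device visible; the guard is irrelevant on CPU-only hosts.")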

File tree

2 files changed: +31 -2 lines changed


tensorrt_llm/_torch/modules/fused_moe/fused_moe_trtllm_gen.py

Lines changed: 7 additions & 0 deletions
@@ -3,6 +3,8 @@
 import torch
 from torch import nn
 
+from tensorrt_llm._utils import get_sm_version
+
 from ...model_config import ModelConfig
 from ...utils import Fp4QuantizedTensor, next_positive_power_of_2
 from .interface import MoE, MoEWeightLoadingMode
@@ -78,6 +80,11 @@ def __init__(
             swiglu_limit=swiglu_limit,
         )
 
+        sm_version = get_sm_version()
+        if sm_version >= 120:
+            raise NotImplementedError(
+                "TRTLLMGenFusedMoE does not support SM120 and above.")
+
         assert not self.smart_router, "Smart router is not supported in TRTLLMGenFusedMoE."
 
         self.num_slots = self.num_experts
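
A hedged sketch of what this guard means for a caller: constructing the TRTLLM-Gen backend on an SM 120/121 GPU now raises NotImplementedError, so backend-selection code can catch it and fall back. The stub class and pick_backend helper below are illustrative stand-ins, not the repository's actual wiring (the real TRTLLMGenFusedMoE takes many more constructor arguments); CUTLASS is used as the fallback because the tests below only skip the TRTLLM backend on these GPUs.

class FakeTrtllmGenMoE:
    """Stand-in for TRTLLMGenFusedMoE, reduced to the new SM guard."""

    def __init__(self, sm_version: int):
        if sm_version >= 120:
            raise NotImplementedError(
                "TRTLLMGenFusedMoE does not support SM120 and above.")


def pick_backend(sm_version: int) -> str:
    # Try the TRTLLM-Gen backend first and fall back when the guard fires.
    try:
        FakeTrtllmGenMoE(sm_version)
        return "TRTLLM"
    except NotImplementedError:
        return "CUTLASS"  # a MoE backend that still runs on SM 120/121


print(pick_backend(100))  # -> TRTLLM
print(pick_backend(120))  # -> CUTLASS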

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 24 additions & 2 deletions
@@ -1565,6 +1565,11 @@ def test_nvfp4_4gpus_online_eplb(self, fp8kv):
     @parametrize_with_ids("moe_backend", ["CUTLASS", "TRTLLM"])
     def test_nvfp4(self, fp8kv, attention_dp, cuda_graph, overlap_scheduler,
                    torch_compile, mtp_nextn, moe_backend):
+        if moe_backend == "TRTLLM" and (get_sm_version() == 120
+                                        or get_sm_version() == 121):
+            pytest.skip(
+                "MOE TRTLLM backend does not support SM version 120 or 121")
+
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         torch_compile_config = TorchCompileConfig(
             enable_fullgraph=True,
@@ -1613,8 +1618,10 @@ def test_nvfp4_4gpus(self, fp8kv, attention_dp, cuda_graph,
                          torch_compile, mtp_nextn, moe_backend):
         if torch_compile and pp_size > 1:
             pytest.skip("PP with torch.compile is not supported yet.")
-        if moe_backend == "TRTLLM" and get_sm_version() == 120:
-            pytest.skip("MOE TRTLLM backend does not support SM version 120")
+        if moe_backend == "TRTLLM" and (get_sm_version() == 120
+                                        or get_sm_version() == 121):
+            pytest.skip(
+                "MOE TRTLLM backend does not support SM version 120 or 121")
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         # Picewise Cuda Graph cannot be enabled for nvfp4 attention dp.
         torch_compile_config = TorchCompileConfig(
@@ -1885,6 +1892,11 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness):
     def test_nvfp4_multi_gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
                               attention_dp, cuda_graph, overlap_scheduler,
                               max_batch_size, moe_backend):
+        if moe_backend == "TRTLLM" and (get_sm_version() == 120
+                                        or get_sm_version() == 121):
+            pytest.skip(
+                "MOE TRTLLM backend does not support SM version 120 or 121")
+
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.70)
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
@@ -2509,6 +2521,11 @@ def test_nvfp4(
         torch_compile,
     ):
 
+        if moe_backend == "TRTLLM" and (get_sm_version() == 120
+                                        or get_sm_version() == 121):
+            pytest.skip(
+                "MOE TRTLLM backend does not support SM version 120 or 121")
+
         torch_compile_config = TorchCompileConfig(
             enable_fullgraph=True,
             enable_piecewise_cuda_graph=cuda_graph,
@@ -2700,6 +2717,11 @@ def test_fp8(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
     def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
                    overlap_scheduler, moe_backend, eagle3):
 
+        if moe_backend == "TRTLLM" and (get_sm_version() == 120
+                                        or get_sm_version() == 121):
+            pytest.skip(
+                "MOE TRTLLM backend does not support SM version 120 or 121")
+
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
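
The same five-line guard is now repeated in five tests. A hedged sketch of how it could be factored into a shared helper follows; the helper name and placement are hypothetical and not part of this commit, but pytest.skip and the get_sm_version import are exactly what the tests above use.

import pytest

from tensorrt_llm._utils import get_sm_version


def skip_trtllm_moe_on_sm120(moe_backend: str) -> None:
    # Hypothetical helper mirroring the inline guard added in each test above.
    if moe_backend == "TRTLLM" and get_sm_version() in (120, 121):
        pytest.skip(
            "MOE TRTLLM backend does not support SM version 120 or 121")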
