
Commit 1e5a6be

[https://nvbugs/5448442][fix] Skip trtllm moe backend for sm120 (#7010)
Signed-off-by: Pamela <[email protected]>
1 parent 441edf1 commit 1e5a6be
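
The fix is a single runtime guard on the GPU's SM version, applied in the fused MoE backend module and mirrored by skips in the accuracy tests. A minimal sketch of the pattern, assuming get_sm_version() from tensorrt_llm._utils reports the compute capability as major * 10 + minor (so an SM 12.0 device reports 120); the helper name below is illustrative, the commit itself places the check inline in TRTLLMGenFusedMoE.__init__:

# Sketch of the guard this commit adds (assumption: get_sm_version() returns
# major * 10 + minor, e.g. 120 for SM 12.0 and 121 for SM 12.1).
from tensorrt_llm._utils import get_sm_version


def check_trtllm_moe_supported() -> None:
    """Fail fast if the TRTLLM MoE backend cannot run on this GPU."""
    if get_sm_version() >= 120:
        raise NotImplementedError(
            "TRTLLMGenFusedMoE does not support SM120 and above.")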

File tree

3 files changed: +31 / -4 lines changed

tensorrt_llm/_torch/modules/fused_moe/fused_moe_trtllm_gen.py

Lines changed: 7 additions & 0 deletions
@@ -2,6 +2,8 @@
 
 import torch
 
+from tensorrt_llm._utils import get_sm_version
+
 from ...distributed.ops import reducescatter
 from ...model_config import ModelConfig
 from ...utils import Fp4QuantizedTensor
@@ -68,6 +70,11 @@ def __init__(
             weight_loading_mode=weight_loading_mode,
         )
 
+        sm_version = get_sm_version()
+        if sm_version >= 120:
+            raise NotImplementedError(
+                "TRTLLMGenFusedMoE does not support SM120 and above.")
+
         assert not self.smart_router, "Smart router is not supported in TRTLLMGenFusedMoE."
 
         self.num_slots = self.num_experts
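
Because the guard raises at construction time, code that selects a MoE backend can detect SM120+ up front instead of failing later inside kernel selection. A hypothetical caller-side sketch (pick_moe_backend is not part of this commit):

from tensorrt_llm._utils import get_sm_version


def pick_moe_backend(requested: str) -> str:
    # Illustrative fallback: "TRTLLM" maps to TRTLLMGenFusedMoE, which now
    # raises NotImplementedError on SM >= 120, so prefer CUTLASS there.
    if requested == "TRTLLM" and get_sm_version() >= 120:
        return "CUTLASS"
    return requested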

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 24 additions & 2 deletions
@@ -1338,6 +1338,11 @@ def test_nvfp4_4gpus_online_eplb(self, fp8kv):
     @parametrize_with_ids("moe_backend", ["CUTLASS", "TRTLLM"])
     def test_nvfp4(self, fp8kv, attention_dp, cuda_graph, overlap_scheduler,
                    torch_compile, mtp_nextn, moe_backend):
+        if moe_backend == "TRTLLM" and (get_sm_version() == 120
+                                        or get_sm_version() == 121):
+            pytest.skip(
+                "MOE TRTLLM backend does not support SM version 120 or 121")
+
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         torch_compile_config = TorchCompileConfig(
             enable_fullgraph=True,
@@ -1385,8 +1390,10 @@ def test_nvfp4_4gpus(self, fp8kv, attention_dp, cuda_graph,
                          torch_compile, mtp_nextn, moe_backend):
         if torch_compile and pp_size > 1:
             pytest.skip("PP with torch.compile is not supported yet.")
-        if moe_backend == "TRTLLM" and get_sm_version() == 120:
-            pytest.skip("MOE TRTLLM backend does not support SM version 120")
+        if moe_backend == "TRTLLM" and (get_sm_version() == 120
+                                        or get_sm_version() == 121):
+            pytest.skip(
+                "MOE TRTLLM backend does not support SM version 120 or 121")
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         # Picewise Cuda Graph cannot be enabled for nvfp4 attention dp.
         torch_compile_config = TorchCompileConfig(
@@ -1601,6 +1608,11 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness):
     def test_nvfp4_multi_gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
                               attention_dp, cuda_graph, overlap_scheduler,
                               max_batch_size, moe_backend):
+        if moe_backend == "TRTLLM" and (get_sm_version() == 120
+                                        or get_sm_version() == 121):
+            pytest.skip(
+                "MOE TRTLLM backend does not support SM version 120 or 121")
+
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.70)
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
@@ -2157,6 +2169,11 @@ def test_nvfp4(
         torch_compile,
     ):
 
+        if moe_backend == "TRTLLM" and (get_sm_version() == 120
+                                        or get_sm_version() == 121):
+            pytest.skip(
+                "MOE TRTLLM backend does not support SM version 120 or 121")
+
         torch_compile_config = TorchCompileConfig(
             enable_fullgraph=True,
             enable_piecewise_cuda_graph=cuda_graph and not attention_dp,
@@ -2277,6 +2294,11 @@ def test_fp8(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
     def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
                    overlap_scheduler, moe_backend, eagle3):
 
+        if moe_backend == "TRTLLM" and (get_sm_version() == 120
+                                        or get_sm_version() == 121):
+            pytest.skip(
+                "MOE TRTLLM backend does not support SM version 120 or 121")
+
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
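
The same condition is repeated across several tests; a small helper could express the skip once. A sketch only, assuming pytest and get_sm_version are available as in test_llm_api_pytorch.py (skip_trtllm_moe_on_sm120 is illustrative, not part of the commit):

import pytest

from tensorrt_llm._utils import get_sm_version


def skip_trtllm_moe_on_sm120(moe_backend: str) -> None:
    # Mirrors the inline checks above: tests using the TRTLLM MoE backend
    # are skipped on SM 12.0 and SM 12.1 devices.
    if moe_backend == "TRTLLM" and get_sm_version() in (120, 121):
        pytest.skip(
            "MOE TRTLLM backend does not support SM version 120 or 121")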

tests/integration/test_lists/qa/llm_function_rtx6kd.txt

Lines changed: 0 additions & 2 deletions
@@ -24,8 +24,6 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUT
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=False]
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=True]
-accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=False]
-accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=True]
 test_e2e.py::test_ptp_quickstart_advanced_mixed_precision
 test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-BF16-llama-3.1-model/Meta-Llama-3.1-8B]
 test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-FP8-llama-3.1-model/Llama-3.1-8B-Instruct-FP8]
