From 377edb4cd6a182be94347bd20f48f171062b990d Mon Sep 17 00:00:00 2001
From: Yukun He <23156053+hyukn@users.noreply.github.com>
Date: Thu, 17 Apr 2025 06:17:44 +0000
Subject: [PATCH] Fix fused_moe fallback issue.

min_latency_mode is only set to False during the warmup phase. Thus, when
it becomes True during inference, all tactics fall back to the default
one, causing a perf regression.

Signed-off-by: Yukun He <23156053+hyukn@users.noreply.github.com>
---
 tensorrt_llm/_torch/custom_ops/torch_custom_ops.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py b/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py
index 421d32bfbbc..dda4bcc3283 100644
--- a/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py
+++ b/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py
@@ -126,7 +126,9 @@ def fused_moe(
         (2, 0, ((0, ), lambda x: x)),
     ))
 
-    min_latency_tensor = torch.empty(1) if min_latency_mode else torch.empty(0)
+    # TODO: always set min_latency_mode to False for now due to an error in the moe_kernels
+    min_latency_tensor = torch.empty(0)
+
     # allocate workspace for profiling
     moe_runner = MoERunner(
         x_dtype=input.dtype,
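
Note: for context, below is a minimal, self-contained sketch of the failure mode this
patch addresses. The names tactic_cache, profile_tactics, and select_tactic are
hypothetical, not the actual TensorRT-LLM autotuner API; the sketch only assumes,
as the tuning config above suggests via (2, 0, ((0, ), lambda x: x)), that the tuned
tactic is cached under a key that includes min_latency_tensor's dim-0 size.

import torch

# Hypothetical tactic cache keyed on the profiled input signature,
# including min_latency_tensor's dim-0 size.
tactic_cache = {}

def profile_tactics(x, min_latency_tensor):
    # Warmup only tunes the signature it actually sees.
    key = (tuple(x.shape), min_latency_tensor.shape[0])
    tactic_cache[key] = 42  # pretend 42 is the tuned tactic id

def select_tactic(x, min_latency_tensor):
    key = (tuple(x.shape), min_latency_tensor.shape[0])
    return tactic_cache.get(key, -1)  # -1: default (fallback) tactic

x = torch.randn(8, 16)
profile_tactics(x, torch.empty(0))       # warmup: min_latency_mode=False
print(select_tactic(x, torch.empty(1)))  # pre-fix inference: True -> cache miss -> -1
print(select_tactic(x, torch.empty(0)))  # post-fix: signature matches warmup -> 42

Forcing min_latency_tensor to torch.empty(0) at inference makes the lookup key match
the one produced during warmup, so the tuned tactic is found instead of the default.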