From 377edb4cd6a182be94347bd20f48f171062b990d Mon Sep 17 00:00:00 2001
From: Yukun He <23156053+hyukn@users.noreply.github.com>
Date: Thu, 17 Apr 2025 06:17:44 +0000
Subject: [PATCH] Fix fused_moe fallback issue.

min_latency_mode is only set to False during the warmup phase. Thus, when
it becomes True during inference, all tactics fall back to the default
one, causing a perf regression.

Signed-off-by: Yukun He <23156053+hyukn@users.noreply.github.com>
---
 tensorrt_llm/_torch/custom_ops/torch_custom_ops.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py b/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py
index 421d32bfbbc..dda4bcc3283 100644
--- a/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py
+++ b/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py
@@ -126,7 +126,9 @@ def fused_moe(
         (2, 0, ((0, ), lambda x: x)),
     ))
 
-    min_latency_tensor = torch.empty(1) if min_latency_mode else torch.empty(0)
+    # TODO: always set min_latency_mode to False for now due to an error in the moe_kernels
+    min_latency_tensor = torch.empty(0)
+
     # allocate workspace for profiling
     moe_runner = MoERunner(
         x_dtype=input.dtype,
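
Note: for context, below is a minimal, self-contained sketch of the failure mode this
patch addresses. The names tactic_cache, profile_tactics, and select_tactic are
hypothetical, not the actual TensorRT-LLM autotuner API; the sketch only assumes,
as the tuning config above suggests via (2, 0, ((0, ), lambda x: x)), that the tuned
tactic is cached under a key that includes min_latency_tensor's dim-0 size.

import torch

# Hypothetical tactic cache keyed on the profiled input signature,
# including min_latency_tensor's dim-0 size.
tactic_cache = {}

def profile_tactics(x, min_latency_tensor):
    # Warmup only tunes the signature it actually sees.
    key = (tuple(x.shape), min_latency_tensor.shape[0])
    tactic_cache[key] = 42  # pretend 42 is the tuned tactic id

def select_tactic(x, min_latency_tensor):
    key = (tuple(x.shape), min_latency_tensor.shape[0])
    return tactic_cache.get(key, -1)  # -1: default (fallback) tactic

x = torch.randn(8, 16)
profile_tactics(x, torch.empty(0))       # warmup: min_latency_mode=False
print(select_tactic(x, torch.empty(1)))  # pre-fix inference: True -> cache miss -> -1
print(select_tactic(x, torch.empty(0)))  # post-fix: signature matches warmup -> 42

Forcing min_latency_tensor to torch.empty(0) at inference makes the lookup key match
the one produced during warmup, so the tuned tactic is found instead of the default.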