From f9c8223fe7aad2c287dd07067f038abb14a0a0e6 Mon Sep 17 00:00:00 2001 From: Neta Zmora <96238833+nzmora-nvidia@users.noreply.github.com> Date: Wed, 26 Nov 2025 10:40:13 -0800 Subject: [PATCH] AutoDeploy: remove autotuner from nvfp4_gemm forward The autotuner is used when AD records CUDA graphs, and per-shape tactics are cached. The operator should not explicitly use the auto-tuner context. Signed-off-by: Neta Zmora <96238833+nzmora-nvidia@users.noreply.github.com> --- tensorrt_llm/_torch/auto_deploy/custom_ops/quant.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/quant.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/quant.py index d892cf6417b..7c1dbc6f0ae 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/quant.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/quant.py @@ -7,8 +7,6 @@ from flashinfer import bmm_fp8 from torch import nn -from tensorrt_llm._torch.autotuner import autotune - from ..distributed import common as dist from ..distributed import trtllm as trtllm_dist from .torch_libs.float8_python_api import addmm_float8_unwrapped @@ -336,10 +334,9 @@ def nvfp4_linear( x_fp4, x_sf_block = torch.ops.trtllm.fp4_quantize( input, input_scale, TRTLLM_NVFP4_SCALING_VECTOR_SIZE, False ) - with autotune(): - output = torch.ops.trtllm.nvfp4_gemm( - x_fp4, weight_fp4, x_sf_block, weight_scale, alpha, input.dtype - ) + output = torch.ops.trtllm.nvfp4_gemm( + x_fp4, weight_fp4, x_sf_block, weight_scale, alpha, input.dtype + ) if bias is not None: output = output + bias