From f9c8223fe7aad2c287dd07067f038abb14a0a0e6 Mon Sep 17 00:00:00 2001 From: Neta Zmora <96238833+nzmora-nvidia@users.noreply.github.com> Date: Wed, 26 Nov 2025 10:40:13 -0800 Subject: [PATCH] AutoDeploy: remove autotuner from nvfp4_gemm forward The autotuner is used when AD records CUDA graphs, and per-shape tactics are cached. The operator should not explicitly use the auto-tuner context. Signed-off-by: Neta Zmora <96238833+nzmora-nvidia@users.noreply.github.com> --- tensorrt_llm/_torch/auto_deploy/custom_ops/quant.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/quant.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/quant.py index d892cf6417b..7c1dbc6f0ae 100644 --- a/tensorrt_llm/_torch/auto_deploy/custom_ops/quant.py +++ b/tensorrt_llm/_torch/auto_deploy/custom_ops/quant.py @@ -7,8 +7,6 @@ from flashinfer import bmm_fp8 from torch import nn -from tensorrt_llm._torch.autotuner import autotune - from ..distributed import common as dist from ..distributed import trtllm as trtllm_dist from .torch_libs.float8_python_api import addmm_float8_unwrapped @@ -336,10 +334,9 @@ def nvfp4_linear( x_fp4, x_sf_block = torch.ops.trtllm.fp4_quantize( input, input_scale, TRTLLM_NVFP4_SCALING_VECTOR_SIZE, False ) - with autotune(): - output = torch.ops.trtllm.nvfp4_gemm( - x_fp4, weight_fp4, x_sf_block, weight_scale, alpha, input.dtype - ) + output = torch.ops.trtllm.nvfp4_gemm( + x_fp4, weight_fp4, x_sf_block, weight_scale, alpha, input.dtype + ) if bias is not None: output = output + bias