Commit da0fe55

hyukn authored and dominicshanshan committed
[TRTLLM-4501][feat] Add input tensor pre-hook function API for the tuning process. (NVIDIA#6924)
Some tunable ops require a more realistic data distribution during tuning, for instance a tensor whose contents are tied to its shape. A customizable pre-hook function can therefore be declared in the tuning config to modify the input tensors before the tuning process runs. Signed-off-by: Yukun He <[email protected]>
1 parent 105d546 commit da0fe55

File tree

3 files changed: +45 -9 lines changed


tensorrt_llm/_torch/autotuner.py

Lines changed: 6 additions & 0 deletions
@@ -92,10 +92,13 @@ class TuningConfig:
             any value is provided to the choose_one function, the input tensor will be saturated
             with the provided value.
             If not provided, the autotuner will not consider the max num tokens.
+        inputs_pre_hook (Callable): A function that takes a list of input tensors and returns a list of modified input tensors.
+            It is called before the input tensors are profiled during the tuning process, so that they match the real data distribution.
     """
     dynamic_tensor_specs: Tuple[DynamicTensorSpec, ...] = ()
     constraint_specs: Tuple[ConstraintSpec, ...] = ()
     tune_max_num_tokens: int = None
+    inputs_pre_hook: Callable = None


 @dataclass(unsafe_hash=True)
@@ -662,6 +665,9 @@ def _profile_runners(
         min_time = float('inf')
         has_tuning_failure_occured = False
         best_runner_id, best_tactic = None, None
+        # If the inputs_pre_hook is provided, it will be called before profiling.
+        if tuning_config.inputs_pre_hook is not None:
+            input_tensors = tuning_config.inputs_pre_hook(input_tensors)
         for runner_id, runner in enumerate(runners):
             # TODO: use FakeTensor here.
             runner_arg_names = {

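As a usage illustration (not part of this commit): a minimal sketch of declaring a tuning config with the new field. Only TuningConfig and inputs_pre_hook come from this diff; the import path, the hook body, and the assumed two-tensor input layout are illustrative assumptions.

from typing import List

import torch

# Import path assumed from the file shown above.
from tensorrt_llm._torch.autotuner import TuningConfig


def shape_associated_pre_hook(inputs: List[torch.Tensor]) -> List[torch.Tensor]:
    # Hypothetical hook: suppose the first tensor must carry valid row indices,
    # so the autotuner's random or saturated data would not be representative.
    x, w = inputs
    x_hooked = x.clone()
    x_hooked[:, 0] = torch.arange(x.shape[0], device=x.device).to(x.dtype)
    return [x_hooked, w]


# All other TuningConfig fields keep their defaults.
tuning_config = TuningConfig(inputs_pre_hook=shape_associated_pre_hook)
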
tensorrt_llm/_torch/custom_ops/torch_custom_ops.py

Lines changed: 2 additions & 2 deletions
@@ -29,8 +29,8 @@ class MoERunner(TunableRunner):
     runner_dict = dict()
     tuning_config = TuningConfig(
         dynamic_tensor_specs=(DynamicTensorSpec(
-            0, 0, get_last_power_of_2_num_tokens_buckets(8192),
-            lambda x: min(last_positive_power_of_2(x), 8192)), ),
+            0, 0, get_last_power_of_2_num_tokens_buckets,
+            last_positive_power_of_2), ),
         tune_max_num_tokens=8192,
     )

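For readability, the same spec written with the keyword names used in the test file below; a sketch only, assuming the positional and keyword forms line up as in that test and that the helpers live at the import path shown.

# Keyword names taken from the DynamicTensorSpec call in the test below.
from tensorrt_llm._torch.autotuner import DynamicTensorSpec, TuningConfig
# Helper import path assumed; the functions themselves are referenced in this diff.
from tensorrt_llm._torch.utils import (get_last_power_of_2_num_tokens_buckets,
                                       last_positive_power_of_2)

tuning_config = TuningConfig(
    dynamic_tensor_specs=(DynamicTensorSpec(
        input_idx=0,
        dim_idx=0,
        # The bucket generator and bucket-mapping helper are now passed as
        # callables; the 8192 cap comes from tune_max_num_tokens rather than
        # being baked into the arguments.
        gen_tuning_buckets=get_last_power_of_2_num_tokens_buckets,
        map_to_tuning_buckets=last_positive_power_of_2,
    ), ),
    tune_max_num_tokens=8192,
)
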
tests/unittest/_torch/misc/test_autotuner.py

Lines changed: 37 additions & 7 deletions
@@ -322,12 +322,24 @@ def test_multiple_dynamic_shapes_cache():
         f"Expected 12 cache entries for 3x4 shape combinations, got {len(cache_entries)}"


-class GemmRunnerWithTacticConfigs(TunableRunner):
+class GemmRunnerComplexTuningConfigs(TunableRunner):
     valid_tactic_ids = [-1, 0, 1]
+    tune_max_num_tokens = 32
+
+    def get_valid_tactics(
+        self,
+        inputs: List[FakeTensor],
+        profile: OptimizationProfile,
+        **kwargs,
+    ) -> List[Dict[str, int]]:
+        # During the tuning process, verify that the tuning config behaves as expected.
+
+        assert inputs[0].shape[0] <= self.tune_max_num_tokens, \
+            f"Input shape {inputs[0].shape[0]} is larger than the max num tokens {self.tune_max_num_tokens}"
+
+        assert inputs[0][-1, 0] == inputs[0].shape[0], \
+            f"Input shape {inputs[0].shape[0]} is not set through the pre_hook correctly"

-    def get_valid_tactics(self, inputs: List[FakeTensor],
-                          profile: OptimizationProfile,
-                          **kwargs) -> List[Dict[str, int]]:
         # The simulated delay is not deterministic, so we need to return specific tactics here
         return [{
             "block_size": block_size,
@@ -350,12 +362,30 @@ def forward(
         assert tactic_id in self.valid_tactic_ids
         return [gemm_0, gemm_1, gemm_fallback][tactic_id](*inputs)

+    @staticmethod
+    def inputs_pre_hook(inputs: List[torch.Tensor]):
+        # Zero x and store the token count in its last row so get_valid_tactics can check the hook ran.
+        x, w = inputs
+        x_hooked = torch.zeros_like(x)
+        x_hooked[-1, 0] = x.shape[0]
+        return [x_hooked, w]
+

-def test_autotuner_tactic_configs():
-    runner_0 = GemmRunnerWithTacticConfigs()
+def test_autotuner_tuning_configs():
+    runner_0 = GemmRunnerComplexTuningConfigs()
     runners = [runner_0]
     x, w = torch.randn(64, 64), torch.randn(64, 128)
-    tuning_config = TuningConfig()
+    tuning_config = TuningConfig(
+        dynamic_tensor_specs=(DynamicTensorSpec(
+            input_idx=0,
+            dim_idx=0,
+            gen_tuning_buckets=get_power_of_2_num_tokens_buckets,
+            map_to_tuning_buckets=next_positive_power_of_2,
+        ), ),
+        # Test if the number of tuning tokens is clipped to 32
+        tune_max_num_tokens=GemmRunnerComplexTuningConfigs.tune_max_num_tokens,
+        inputs_pre_hook=GemmRunnerComplexTuningConfigs.inputs_pre_hook,
+    )
     with autotune():
         tuner = AutoTuner.get()
         runner, tactic = tuner.choose_one("test_autotuner_tactic_configs",
