
Commit 5205cfe

Merge branch 'release/1.1' into user/yunruis/fix_bug_5606268
2 parents 32ab7a6 + a1d9126


18 files changed: +137 −43 lines


cpp/tensorrt_llm/kernels/moeLoadBalance/moeLoadBalanceKernels.cu

Lines changed: 0 additions & 4 deletions
@@ -131,10 +131,6 @@ void moeSetSignalForCpuStageForTest(MoeLoadBalanceSingleLayerSignal* signal)
 template <typename TYPE>
 __global__ void zeroExpertTokenCountKernel(MoeLoadBalanceMetaInfo metaInfo, int* const enabled, int* expertTokenCount)
 {
-    if (*enabled == 0)
-    {
-        return;
-    }
     TYPE oldExpertTokenCount = {0};
     int* expertTokenCountPtr = expertTokenCount + metaInfo.expertCount * blockIdx.x;
     TYPE* typedExpertTokenCountPtr = reinterpret_cast<TYPE*>(expertTokenCountPtr);

cpp/tensorrt_llm/thop/fp8BlockScalingGemm.cpp

Lines changed: 2 additions & 2 deletions
@@ -343,7 +343,7 @@ torch::Tensor fp8_block_scaling_bmm(torch::Tensor const& mat1, torch::Tensor con

 TORCH_LIBRARY_FRAGMENT(trtllm, m)
 {
-    m.def("fp8_block_scaling_gemm(Tensor mat1, Tensor mat2, Tensor mat1Scale, Tensor mat2Scale) -> Tensor");
+    m.def("fp8_block_scaling_gemm_impl(Tensor mat1, Tensor mat2, Tensor mat1Scale, Tensor mat2Scale) -> Tensor");
     m.def(
         "fp8_block_scaling_bmm(Tensor mat1, Tensor mat2, Tensor mat1Scale, Tensor mat2Scale, ScalarType? "
         "out_dtype=None) -> Tensor");
@@ -357,7 +357,7 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m)

 TORCH_LIBRARY_IMPL(trtllm, CUDA, m)
 {
-    m.impl("fp8_block_scaling_gemm", &torch_ext::fp8_block_scaling_gemm);
+    m.impl("fp8_block_scaling_gemm_impl", &torch_ext::fp8_block_scaling_gemm);
     m.impl("fp8_block_scaling_bmm", &torch_ext::fp8_block_scaling_bmm);
     m.impl("fp8_block_scaling_bmm_out", &torch_ext::fp8_block_scaling_bmm_out);
     m.impl("fp8_block_scaling_moe_gemm", &torch_ext::fp8_block_scaling_moe_gemm);

docker/Dockerfile.multi

Lines changed: 1 addition & 1 deletion
@@ -127,7 +127,7 @@ COPY cpp cpp
 COPY scripts scripts
 COPY tensorrt_llm tensorrt_llm
 COPY 3rdparty 3rdparty
-COPY .gitmodules setup.py requirements.txt requirements-dev.txt constraints.txt ./
+COPY .gitmodules setup.py requirements.txt requirements-dev.txt constraints.txt README.md ./

 # Create cache directories for pip and ccache
 RUN mkdir -p /root/.cache/pip /root/.cache/ccache
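Note: a likely reason README.md now has to be in the copied build context is that setup.py reads it at wheel-build time (for example as the long_description), so the build inside the image would otherwise fail on a missing file. The sketch below only illustrates that common pattern and is an assumption, not this repository's actual setup.py:

from pathlib import Path
from setuptools import setup, find_packages

# Hypothetical sketch: if setup.py opens README.md like this, the file must be
# COPY'd into the Docker build context added in the diff above.
long_description = (Path(__file__).parent / "README.md").read_text(encoding="utf-8")

setup(
    name="tensorrt_llm",
    packages=find_packages(),
    long_description=long_description,
    long_description_content_type="text/markdown",
)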

tensorrt_llm/_torch/attention_backend/trtllm.py

Lines changed: 12 additions & 2 deletions
@@ -536,6 +536,7 @@ def is_nvfp4_output_kernel_available(
 @dataclass(kw_only=True)
 class TrtllmAttentionMetadata(AttentionMetadata):
     workspace: Optional[torch.Tensor] = None
+    cuda_graph_workspace: Optional[torch.Tensor] = None

     # TrtllmAttention needs to know the beam width to access to the cache indirection buffer,
     # when beam search is enabled.
@@ -693,6 +694,14 @@ def get_empty_like(like_tensor: torch.Tensor,
                 device='cuda',
                 dtype=torch.int8,
             )
+
+        if self.cuda_graph_workspace is None:
+            self.cuda_graph_workspace = torch.empty(
+                (0, ),
+                device='cuda',
+                dtype=torch.int8,
+            )
+
         if self.kv_cache_manager is not None:
             self.kv_cache_block_offsets = get_empty(
                 [
@@ -1276,8 +1285,9 @@ def forward(
             host_kv_cache_pool_pointers=metadata.host_kv_cache_pool_pointers,
             host_kv_cache_pool_mapping=metadata.host_kv_cache_pool_mapping,
             block_ids_per_seq=metadata.block_ids_per_seq,
-            workspace=metadata.
-            workspace,  # re-enable it, if pass None to it, fp8 mla will encounter invalid cuda free issue.
+            # re-enable it, if pass None to it, fp8 mla will encounter invalid cuda free issue.
+            workspace=metadata.workspace
+            if not metadata.is_cuda_graph else metadata.cuda_graph_workspace,
             cache_indirection=metadata.cache_indirection,
             kv_scale_orig_quant=self.kv_scale_orig_quant,
             kv_scale_quant_orig=self.kv_scale_quant_orig,
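Note: the new cuda_graph_workspace is a dedicated tensor whose lifetime is tied to the metadata object, so the workspace pointer captured into a CUDA graph stays valid instead of being freed or resized along with the regular workspace. A minimal sketch of just the selection logic; the metadata class is simplified and only workspace, cuda_graph_workspace, and is_cuda_graph come from this diff:

import torch
from dataclasses import dataclass
from typing import Optional

@dataclass
class _Meta:  # illustrative stand-in for TrtllmAttentionMetadata
    workspace: Optional[torch.Tensor] = None
    cuda_graph_workspace: Optional[torch.Tensor] = None
    is_cuda_graph: bool = False

def pick_workspace(metadata: _Meta) -> Optional[torch.Tensor]:
    # During CUDA graph capture/replay the captured pointer must stay stable,
    # so a separate, never-freed tensor is passed instead of the regular workspace.
    return metadata.cuda_graph_workspace if metadata.is_cuda_graph else metadata.workspace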

tensorrt_llm/_torch/autotuner.py

Lines changed: 6 additions & 4 deletions
@@ -410,22 +410,24 @@ def choose_one(
                 f"[Autotuner] Profiling runner={runners[best_runner_id]}, tactic={best_tactic} for cache_key={cache_key}."
             )
         else:
-            logger.warning(
+            logger.warning_once(
                 f"[Autotuner] No valid runner/tactic was found for custom_op={custom_op}, input_shapes={input_shapes}. "
                 f"At least one valid (runner, tactic) pair is required. "
                 f"If get_valid_tactics is intended to return empty list, please ensure that this profile is not valid for the custom_op "
-                f"and should not occurs during the inference stage, or fallback tactic is implemented. Otherwise, the the tuning process will crash."
+                f"and should not occurs during the inference stage, or fallback tactic is implemented. Otherwise, the the tuning process will crash.",
+                key=custom_op,
             )
         new_tuning_failure_occured = new_tuning_failure_occured or has_tuning_failure_occured

         # If failed profiling tactics occurs, log the error.
         if new_tuning_failure_occured:
-            logger.warning(
+            logger.warning_once(
                 f"[Autotuner] New tuning error occurs:"
                 f"Total failed profiling tactics occurs: {len(self.stats.failed_profiling_count[custom_op])} for custom_op={custom_op}. "
                 f"This will not block the tuning process. "
                 f"Please set TLLM_LOG_LEVEL=WARNING to find out when the tactic profiling fails. "
-                f"Set TLLM_LOG_LEVEL=DEBUG to get more details of the failures."
+                f"Set TLLM_LOG_LEVEL=DEBUG to get more details of the failures.",
+                key=custom_op,
            )

         # Get the best runner and tactic from cache
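Note: switching to logger.warning_once(..., key=custom_op) deduplicates these messages per custom op, which matters because choose_one can fire for every shape bucket of every op during tuning. A minimal sketch of keyed once-only logging, as an illustration of the behavior rather than TensorRT-LLM's logger implementation:

import logging

_emitted_keys: set = set()

def warning_once(logger: logging.Logger, msg: str, *, key: str) -> None:
    # Emit the warning only the first time this key is seen, so repeated
    # tuning failures for the same op do not flood the log.
    if key in _emitted_keys:
        return
    _emitted_keys.add(key)
    logger.warning(msg)

log = logging.getLogger("autotuner")
warning_once(log, "no valid tactic", key="trtllm::fp8_block_scaling_gemm")
warning_once(log, "no valid tactic", key="trtllm::fp8_block_scaling_gemm")  # suppressed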

tensorrt_llm/_torch/custom_ops/cpp_custom_ops.py

Lines changed: 1 addition & 1 deletion
@@ -150,7 +150,7 @@ def _(scores, scores_with_bias, n_group, topk_group, topk,
 def _(input, force_applying_finalize):
     return torch.empty_like(input)

-@torch.library.register_fake("trtllm::fp8_block_scaling_gemm")
+@torch.library.register_fake("trtllm::fp8_block_scaling_gemm_impl")
 def _(a, b, a_scale, b_scale):
     m = a.shape[0]
     n = b.shape[0]
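Note: the fake (meta) registration has to follow the op rename so that shape inference during tracing keeps working without launching the CUDA kernel. A quick way to exercise it, assuming that importing tensorrt_llm registers the trtllm ops and their fakes; the tensor shapes and scale layouts below are illustrative only:

import torch
from torch._subclasses.fake_tensor import FakeTensorMode
import tensorrt_llm  # assumption: this import registers trtllm::* ops and their fake impls

with FakeTensorMode():
    a = torch.empty(128, 512, dtype=torch.float8_e4m3fn, device="cuda")
    b = torch.empty(256, 512, dtype=torch.float8_e4m3fn, device="cuda")
    a_scale = torch.empty(4, 128, device="cuda")   # illustrative scale shapes
    b_scale = torch.empty(2, 4, device="cuda")
    out = torch.ops.trtllm.fp8_block_scaling_gemm_impl(a, b, a_scale, b_scale)
    print(out.shape)  # (128, 256), computed by the fake; no real GEMM runs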

tensorrt_llm/_torch/custom_ops/torch_custom_ops.py

Lines changed: 74 additions & 2 deletions
@@ -903,7 +903,7 @@ def _(
     return input.new_empty((M, N), dtype=output_dtype)


-def fp8_swap_ab_gen_tuning_buckets(x: int):
+def deep_gemm_gen_tuning_buckets(x: int):
     buckets = tuple(range(8, 128, 8))
     if x >= 128:
         buckets += tuple(range(128, x, 128))
@@ -913,7 +913,7 @@ def fp8_swap_ab_gen_tuning_buckets(x: int):
 class fp8SwapABGemmRunner(TunableRunner):
     tuning_config = TuningConfig(
         dynamic_tensor_specs=(DynamicTensorSpec(
-            0, 0, fp8_swap_ab_gen_tuning_buckets), ),
+            0, 0, deep_gemm_gen_tuning_buckets), ),
         tune_max_num_tokens=4096,
     )

@@ -992,6 +992,78 @@ def _(
     return input.new_empty((input.size(0), weight.size(0)), dtype=output_dtype)


+# The runner is used to trigger deepgemm jit during autotune.
+class Fp8BlockScalingGemmRunner(TunableRunner):
+    tuning_config = TuningConfig(
+        dynamic_tensor_specs=(DynamicTensorSpec(
+            0, 0, deep_gemm_gen_tuning_buckets), ),
+        tune_max_num_tokens=4096,
+    )
+
+    def get_valid_tactics(
+        self,
+        inputs: List[torch.Tensor],
+        profile: OptimizationProfile,
+    ) -> List[int]:
+        return [0]
+
+    def forward(
+        self,
+        inputs: List[torch.Tensor],
+        tactic: int = -1,
+    ) -> torch.Tensor:
+        a, b, a_scale, b_scale = inputs
+        return torch.ops.trtllm.fp8_block_scaling_gemm_impl(
+            a, b, a_scale, b_scale)
+
+
+def get_fp8_block_scaling_gemm_constraint_spec():
+    # The implementation aligns with the fp8_quantize_1x128 custom op.
+    def fp8_quantize_1x128_sm90_constrant(inputs: List[List[int]]):
+        pad_m = fp4_utils.pad_up(inputs[0][0], 4)
+        blocked_n = (inputs[0][1] + 127) // 128
+        return fp4_utils.pad_up(pad_m * blocked_n * 4, 128) // 4
+
+    if get_sm_version() >= 100:
+        return (ConstraintSpec(2, 1, lambda inputs: inputs[0][0]), )
+    else:
+        return (ConstraintSpec(2, 0, fp8_quantize_1x128_sm90_constrant), )
+
+
+@torch.library.custom_op("trtllm::fp8_block_scaling_gemm", mutates_args=())
+def fp8_block_scaling_gemm(
+    a: torch.Tensor,
+    b: torch.Tensor,
+    a_scale: torch.Tensor,
+    b_scale: torch.Tensor,
+    tune_max_num_tokens: int = 4096,
+) -> torch.Tensor:
+    tuner = AutoTuner.get()
+    fp8_block_scaling_gemm_runner = Fp8BlockScalingGemmRunner()
+    Fp8BlockScalingGemmRunner.tuning_config.tune_max_num_tokens = tune_max_num_tokens
+
+    Fp8BlockScalingGemmRunner.tuning_config.constraint_specs = get_fp8_block_scaling_gemm_constraint_spec(
+    )
+
+    _, best_tactic = tuner.choose_one(
+        "trtllm::fp8_block_scaling_gemm",
+        [fp8_block_scaling_gemm_runner],
+        Fp8BlockScalingGemmRunner.tuning_config,
+        [a, b, a_scale, b_scale],
+    )
+    return fp8_block_scaling_gemm_runner(
+        inputs=[a, b, a_scale, b_scale],
+        tactic=best_tactic,
+    )
+
+
+@fp8_block_scaling_gemm.register_fake
+def _(a, b, a_scale, b_scale, tune_max_num_tokens=4096):
+    m = a.shape[0]
+    n = b.shape[0]
+    return a.new_empty((m, n), dtype=torch.bfloat16)
+
+
 @torch.library.custom_op("trtllm::silu_and_mul", mutates_args=())
 def silu_and_mul(x: torch.Tensor,
                  scale: Optional[torch.Tensor] = None,
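Note: the new Python-level trtllm::fp8_block_scaling_gemm wraps the renamed C++ kernel in a TunableRunner so a tuning pass exercises (and JIT-compiles) DeepGEMM for each shape bucket up to tune_max_num_tokens. A hedged usage sketch; the autotune import path and the scale layouts are assumptions, not shown in this diff:

import torch
import tensorrt_llm  # assumption: registers the trtllm::* custom ops
from tensorrt_llm._torch.autotuner import autotune  # assumed location of the tuning context

M, K, N = 256, 512, 1024
a = torch.randn(M, K, device="cuda").to(torch.float8_e4m3fn)
b = torch.randn(N, K, device="cuda").to(torch.float8_e4m3fn)
a_scale = torch.ones(K // 128, M, device="cuda")         # assumed 1x128 activation scales
b_scale = torch.ones(N // 128, K // 128, device="cuda")  # assumed 128x128 weight scales

with autotune():
    # Tuning pass: Fp8BlockScalingGemmRunner.forward runs once per shape bucket,
    # which also triggers the DeepGEMM JIT off the inference critical path.
    torch.ops.trtllm.fp8_block_scaling_gemm(a, b, a_scale, b_scale)

out = torch.ops.trtllm.fp8_block_scaling_gemm(a, b, a_scale, b_scale)  # replays the cached choice
print(out.shape, out.dtype)  # (M, N), bfloat16 per the registered fake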

tensorrt_llm/_torch/pyexecutor/model_engine.py

Lines changed: 29 additions & 17 deletions
@@ -531,14 +531,17 @@ def get_cuda_graph_warmup_request(batch_size, draft_len):
             return result

         def get_warmup_request(num_tokens: int,
-                               num_gen_tokens: int,
+                               num_gen_requests: int,
                                least_requests: bool = True):
             available_tokens = kv_cache_manager.get_num_available_tokens(
                 self.runtime_draft_len)
             available_blocks = kv_cache_manager.get_num_free_blocks()
             if num_tokens > self.max_num_tokens or num_tokens > available_tokens:
                 return None
-            if num_gen_tokens > self.batch_size:
+            if num_gen_requests > self.batch_size:
+                return None
+            num_gen_tokens = num_gen_requests * (1 + self.runtime_draft_len)
+            if num_gen_tokens > self.max_num_tokens:
                 return None

             num_extra_decoding_steps = get_num_extra_decoding_steps()
@@ -548,7 +551,8 @@ def get_warmup_request(num_tokens: int,
                 # during warmup.
                 return None

-            num_ctx_tokens = num_tokens - num_gen_tokens
+            num_ctx_tokens = num_tokens - num_gen_requests * (
+                1 + self.runtime_draft_len)
             num_ctx_requests = 0
             ctx_requests = []
             gen_requests = []
@@ -557,7 +561,7 @@ def get_warmup_request(num_tokens: int,
             num_full_seqs = 0
             num_left_over_tokens = 0

-            max_context_requests = self.batch_size - num_gen_tokens
+            max_context_requests = self.batch_size - num_gen_requests
             if max_context_requests * max_seq_len < num_ctx_tokens:
                 return None

@@ -572,7 +576,7 @@ def get_warmup_request(num_tokens: int,

             else:
                 max_bs = min(num_ctx_tokens,
-                             self.batch_size - num_gen_tokens)
+                             self.batch_size - num_gen_requests)
                 if num_ctx_tokens % max_bs == 0:
                     num_full_seqs = max_bs
                 else:
@@ -583,13 +587,13 @@ def get_warmup_request(num_tokens: int,
                                   > 0 else 0)

             # We do not have enough batch to fill the request
-            if num_ctx_requests + num_gen_tokens > self.batch_size:
+            if num_ctx_requests + num_gen_requests > self.batch_size:
                 return None

             blocks_to_use = num_full_seqs * math.ceil(
                 max_seq_len / kv_cache_manager.tokens_per_block) + math.ceil(
                     num_left_over_tokens /
-                    kv_cache_manager.tokens_per_block) + num_gen_tokens
+                    kv_cache_manager.tokens_per_block) + num_gen_requests

             if blocks_to_use > available_blocks:
                 return None
@@ -604,25 +608,29 @@ def get_warmup_request(num_tokens: int,
                     token_nums=ctx_token_nums,
                     is_gen=False,
                     max_num_draft_tokens=self.runtime_draft_len,
-                    use_mrope=self.use_mrope)
+                    use_mrope=self.use_mrope,
+                    max_beam_width=self.max_beam_width,
+                    num_extra_decoding_steps=num_extra_decoding_steps)

                 if spec_resource_manager is not None:
                     spec_resource_manager.add_dummy_requests(
                         request_ids=list(range(num_ctx_requests)))

-            if num_gen_tokens > 0:
+            if num_gen_requests > 0:
                 gen_requests = kv_cache_manager.add_dummy_requests(
                     list(
                         range(num_ctx_requests,
-                              num_ctx_requests + num_gen_tokens)),
-                    token_nums=[1] * num_gen_tokens,
+                              num_ctx_requests + num_gen_requests)),
+                    token_nums=[1] * num_gen_requests,
                     is_gen=True,
                     max_num_draft_tokens=self.max_draft_len,
-                    use_mrope=self.use_mrope)
+                    use_mrope=self.use_mrope,
+                    max_beam_width=self.max_beam_width,
+                    num_extra_decoding_steps=num_extra_decoding_steps)
                 if spec_resource_manager is not None:
                     spec_resource_manager.add_dummy_requests(request_ids=list(
                         range(num_ctx_requests, num_ctx_requests +
-                              num_gen_tokens)))
+                              num_gen_requests)))

             result = ScheduledRequests()
             result.context_requests = ctx_requests
@@ -655,15 +663,18 @@ def release_batch(result: ScheduledRequests | None):
                 return

         def general_warmup(reverse: bool = False):
+            max_batch_size = min(
+                self.batch_size,
+                curr_max_num_tokens // (1 + self.runtime_draft_len))
             warmup_requests = set([
                 (1, 1),  # Specialize for 1 token.
-                (self.batch_size,
-                 self.batch_size),  # max_batch_size, pure generation
+                (max_batch_size,
+                 max_batch_size),  # max_batch_size, pure generation
                 (2, 0),  # Non-one, pure context
                 (curr_max_num_tokens, 0),  # max_num_tokens, pure context
             ])
-            if reverse:
-                warmup_requests = sorted(list(warmup_requests), reverse=reverse)
+
+            warmup_requests = sorted(list(warmup_requests), reverse=reverse)

             for warmup_num_tokens, warmup_num_gen_tokens in warmup_requests:
                 with release_batch(
@@ -817,6 +828,7 @@ def _update_draft_inference_state(is_first_draft: bool,
         # Also, we run a general warmup from large to small to make sure that blocks are allocated well.
         # The cudagraph and piecewise cuda graph capture calls torch.cuda.empty_cache() and block may already
        # be freed even we calls general_warmup for torch compile.
+        # Also the additional warmup helps trigger the runtime jit to avoid runtime jit overhead.
         general_warmup(reverse=True)

         # Set the value back to the original value
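Note: the parameter is now a request count, and with speculative drafting each generation request contributes 1 + runtime_draft_len tokens, which is why the token and KV-block budgets above are recomputed from num_gen_requests. A standalone restatement of that arithmetic with illustrative numbers, not the engine code itself:

import math

# Illustrative configuration
batch_size, max_num_tokens = 64, 8192
runtime_draft_len = 3            # draft tokens appended per generation step
tokens_per_block, max_seq_len = 32, 4096

num_tokens, num_gen_requests = 4096, 16
num_gen_tokens = num_gen_requests * (1 + runtime_draft_len)   # 64 tokens, not 16
num_ctx_tokens = num_tokens - num_gen_tokens                  # 4032 tokens left for context

# Full-length context sequences plus the leftover chunk, plus one block per generation request.
num_full_seqs, num_left_over_tokens = divmod(num_ctx_tokens, max_seq_len)
blocks_to_use = (num_full_seqs * math.ceil(max_seq_len / tokens_per_block)
                 + math.ceil(num_left_over_tokens / tokens_per_block)
                 + num_gen_requests)
print(num_gen_tokens, num_ctx_tokens, blocks_to_use)          # 64 4032 142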

tensorrt_llm/_torch/speculative/model_drafter.py

Lines changed: 2 additions & 2 deletions
@@ -470,9 +470,9 @@ def _update_target_inputs_with_draft_tokens(
                 continue

             # Get the index of the draft/target tokens in the device tensor
-            draft_idx = req_idx if self.use_static_draft_loop else request.py_batch_idx
+            draft_idx = req_idx if self.use_static_draft_loop else request.py_seq_slot
             target_idx = req_id_to_old_request[
-                request.py_request_id].py_batch_idx
+                request.py_request_id].py_seq_slot
             target_inputs.new_tokens[draft_position + 1:draft_position +
                                      draft_length + 1, target_idx,
                                      0] = draft_tensors[0:draft_length,

tests/integration/defs/accuracy/test_disaggregated_serving.py

Lines changed: 0 additions & 1 deletion
@@ -404,7 +404,6 @@ def test_auto_dtype(self, disable_overlap_scheduler):
         task.evaluate(llm)

     @pytest.mark.skip_less_device(2)
-    @skip_pre_hopper
     def test_ngram(self):
         speculative_decoding_config = {
             "decoding_type": "NGram",
