From 0267ce0d8cef2d9525d6f7777717d98e01be1358 Mon Sep 17 00:00:00 2001 From: Enwei Zhu <21126786+syuoni@users.noreply.github.com> Date: Mon, 24 Nov 2025 08:33:43 +0000 Subject: [PATCH 1/3] Upgrade nvidia-cutlass-dsl from 4.3.0.dev0 to 4.3.0 Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com> --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index e0319767553..db8469dd8f7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -69,7 +69,7 @@ triton==3.5.0; platform_machine == "x86_64" tiktoken blobfile openai-harmony==0.0.4 -nvidia-cutlass-dsl==4.3.0.dev0; python_version >= "3.10" +nvidia-cutlass-dsl==4.3.0; python_version >= "3.10" plotly numexpr<2.14.0 # WAR for attempted use of nonexistent numpy.typing partial_json_parser From c3d4bfafc74829703e1556c66b8a67b7d6cf8d56 Mon Sep 17 00:00:00 2001 From: Enwei Zhu <21126786+syuoni@users.noreply.github.com> Date: Tue, 25 Nov 2025 10:55:47 +0000 Subject: [PATCH 2/3] Reuse tTR_rC register tensor in grouped GEMM finalize epilogue; drop build_config for PyTorch eval Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com> --- .../blockscaled_contiguous_grouped_gemm_finalize_fusion.py | 7 ++++--- tensorrt_llm/commands/eval.py | 1 + 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_grouped_gemm_finalize_fusion.py b/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_grouped_gemm_finalize_fusion.py index c1c75e54413..576c683b874 100644 --- a/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_grouped_gemm_finalize_fusion.py +++ b/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_grouped_gemm_finalize_fusion.py @@ -1552,6 +1552,8 @@ def kernel( epi_tidx, tCtAcc_base, tCgC, epi_tile, use_2cta_instrs ) + tTR_rC = cute.make_rmem_tensor(tTR_rAcc.shape, self.out_dtype) + copy_atom_r2s = sm100_utils.get_smem_store_op( self.gemm_output_layout, self.out_dtype, self.acc_dtype, tiled_copy_t2r ) @@ -1641,8 +1643,6 @@ def kernel( layout = 
cute.make_layout(shape=(cute.size(tTR_rAcc),), stride=(1,)) loop_size = cute.size(tTR_rAcc) - rOut_epi = cute.make_rmem_tensor(layout, self.out_dtype) - for subtile_idx in cutlass.range(subtile_cnt): # # Load accumulator from tensor memory buffer to register @@ -1657,7 +1657,8 @@ def kernel( # Apply router scale to the entire row (broadcast scalar to vector) acc_vec_finalized = token_scale * acc_vec_scaled - rOut_epi.store(acc_vec_finalized.to(self.out_dtype)) + tTR_rC.store(acc_vec_finalized.to(self.out_dtype)) + rOut_epi = cute.make_tensor(tTR_rC.iterator, layout) if permuted_row < tile_mn_limit: coord_n = mma_tile_coord_mnl[1] * self.cta_tile_shape_mnk[ diff --git a/tensorrt_llm/commands/eval.py b/tensorrt_llm/commands/eval.py index 8a0b4f58261..3ce78420193 100644 --- a/tensorrt_llm/commands/eval.py +++ b/tensorrt_llm/commands/eval.py @@ -135,6 +135,7 @@ def main(ctx, model: str, tokenizer: Optional[str], log_level: str, profiler.start("trtllm init") if backend == 'pytorch': + llm_args.pop("build_config", None) llm = PyTorchLLM(**llm_args) elif backend == 'tensorrt': llm = LLM(**llm_args) From f68dff34f853231b857f20ed6a7833a82f807b3a Mon Sep 17 00:00:00 2001 From: Enwei Zhu <21126786+syuoni@users.noreply.github.com> Date: Tue, 25 Nov 2025 11:00:33 +0000 Subject: [PATCH 3/3] Update nvidia-cutlass-dsl attribution from 4.2.1 to 4.3.0 Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com> --- ATTRIBUTIONS-Python.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ATTRIBUTIONS-Python.md b/ATTRIBUTIONS-Python.md index 53a77c6c9e2..dab27a1e238 100644 --- a/ATTRIBUTIONS-Python.md +++ b/ATTRIBUTIONS-Python.md @@ -25250,7 +25250,7 @@ License: `NVIDIA Proprietary Software` - `Homepage`: https://developer.nvidia.com/cusparselt -## nvidia-cutlass-dsl (4.2.1) +## nvidia-cutlass-dsl (4.3.0) ### Licenses License: `None`