From 0267ce0d8cef2d9525d6f7777717d98e01be1358 Mon Sep 17 00:00:00 2001
From: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
Date: Mon, 24 Nov 2025 08:33:43 +0000
Subject: [PATCH 1/3] Upgrade nvidia-cutlass-dsl from 4.3.0.dev0 to 4.3.0

Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index e0319767553..db8469dd8f7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -69,7 +69,7 @@ triton==3.5.0; platform_machine == "x86_64"
 tiktoken
 blobfile
 openai-harmony==0.0.4
-nvidia-cutlass-dsl==4.3.0.dev0; python_version >= "3.10"
+nvidia-cutlass-dsl==4.3.0; python_version >= "3.10"
 plotly
 numexpr<2.14.0 # WAR for attempted use of nonexistent numpy.typing
 partial_json_parser

From c3d4bfafc74829703e1556c66b8a67b7d6cf8d56 Mon Sep 17 00:00:00 2001
From: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
Date: Tue, 25 Nov 2025 10:55:47 +0000
Subject: [PATCH 2/3] Fix finalize-fusion epilogue output tensor; drop build_config for PyTorch eval

Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
---
 .../blockscaled_contiguous_grouped_gemm_finalize_fusion.py | 7 ++++---
 tensorrt_llm/commands/eval.py                              | 1 +
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_grouped_gemm_finalize_fusion.py b/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_grouped_gemm_finalize_fusion.py
index c1c75e54413..576c683b874 100644
--- a/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_grouped_gemm_finalize_fusion.py
+++ b/tensorrt_llm/_torch/cute_dsl_kernels/blackwell/blockscaled_contiguous_grouped_gemm_finalize_fusion.py
@@ -1552,6 +1552,8 @@ def kernel(
                 epi_tidx, tCtAcc_base, tCgC, epi_tile, use_2cta_instrs
             )
 
+            tTR_rC = cute.make_rmem_tensor(tTR_rAcc.shape, self.out_dtype)
+
             copy_atom_r2s = sm100_utils.get_smem_store_op(
                 self.gemm_output_layout, self.out_dtype, self.acc_dtype, tiled_copy_t2r
             )
@@ -1641,8 +1643,6 @@ def kernel(
                     layout = cute.make_layout(shape=(cute.size(tTR_rAcc),), stride=(1,))
                     loop_size = cute.size(tTR_rAcc)
 
-                rOut_epi = cute.make_rmem_tensor(layout, self.out_dtype)
-
                 for subtile_idx in cutlass.range(subtile_cnt):
                     #
                     # Load accumulator from tensor memory buffer to register
@@ -1657,7 +1657,8 @@ def kernel(
                     # Apply router scale to the entire row (broadcast scalar to vector)
                     acc_vec_finalized = token_scale * acc_vec_scaled
 
-                    rOut_epi.store(acc_vec_finalized.to(self.out_dtype))
+                    tTR_rC.store(acc_vec_finalized.to(self.out_dtype))
+                    rOut_epi = cute.make_tensor(tTR_rC.iterator, layout)
 
                     if permuted_row < tile_mn_limit:
                         coord_n = mma_tile_coord_mnl[1] * self.cta_tile_shape_mnk[
diff --git a/tensorrt_llm/commands/eval.py b/tensorrt_llm/commands/eval.py
index 8a0b4f58261..3ce78420193 100644
--- a/tensorrt_llm/commands/eval.py
+++ b/tensorrt_llm/commands/eval.py
@@ -135,6 +135,7 @@ def main(ctx, model: str, tokenizer: Optional[str], log_level: str,
 
     profiler.start("trtllm init")
     if backend == 'pytorch':
+        llm_args.pop("build_config", None)
         llm = PyTorchLLM(**llm_args)
     elif backend == 'tensorrt':
         llm = LLM(**llm_args)

From f68dff34f853231b857f20ed6a7833a82f807b3a Mon Sep 17 00:00:00 2001
From: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
Date: Tue, 25 Nov 2025 11:00:33 +0000
Subject: [PATCH 3/3] Update nvidia-cutlass-dsl version in ATTRIBUTIONS-Python.md to 4.3.0

Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
---
 ATTRIBUTIONS-Python.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ATTRIBUTIONS-Python.md b/ATTRIBUTIONS-Python.md
index 53a77c6c9e2..dab27a1e238 100644
--- a/ATTRIBUTIONS-Python.md
+++ b/ATTRIBUTIONS-Python.md
@@ -25250,7 +25250,7 @@ License: `NVIDIA Proprietary Software`
   - `Homepage`: https://developer.nvidia.com/cusparselt
 
 
-## nvidia-cutlass-dsl (4.2.1)
+## nvidia-cutlass-dsl (4.3.0)
 
 ### Licenses
 License: `None`