NVIDIA · kaiyux · Dec 1, 2025 · Nov 28, 2025
diff --git a/cpp/tensorrt_llm/kernels/customMoeRoutingKernels.cu b/cpp/tensorrt_llm/kernels/customMoeRoutingKernels.cu
@@ -120,6 +120,11 @@ __global__ void customMoeRoutingKernel(InputT* routerLogits, OutputT* topkValues
     auto warp = cg::tiled_partition<WARP_SIZE>(block);
 
     BaseType minScore = BaseType{-INFINITY};
+
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) // device-side pass for SM90+ only; the call is compiled out elsewhere
+    cudaGridDependencySynchronize(); // programmatic dependent launch: wait on the prerequisite grid before the token loop below
+#endif
+
     for (uint32_t tokenId = warpIdx; tokenId < numTokens; tokenId += warpNum)
     {
         auto scoreOffset = tokenId * numExperts;
@@ -168,6 +173,10 @@ __global__ void customMoeRoutingKernel(InputT* routerLogits, OutputT* topkValues
             }
         }
     } // end for tokenId
+
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) // device-side pass for SM90+ only; the call is compiled out elsewhere
+    cudaTriggerProgrammaticLaunchCompletion(); // issued after the token loop: signals that a dependent grid may be launched
+#endif
 }
 
 int nextPowerOfTwo(int num)

diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/CMakeLists.txt b/cpp/tensorrt_llm/kernels/cutlass_kernels/CMakeLists.txt
@@ -66,6 +66,8 @@ function(process_target target_name enable_hopper enable_blackwell)
   if(${enable_hopper} AND "90" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG)
     # No kernels should be parsed, unless hopper is specified. This is a build
     # time improvement
+    target_compile_definitions(${target_name}
+                               PRIVATE CUTLASS_ENABLE_GDC_FOR_SM90=1)
     target_compile_definitions(${target_name} PUBLIC COMPILE_HOPPER_TMA_GEMMS)
     target_compile_definitions(${target_name}
                                PUBLIC COMPILE_HOPPER_TMA_GROUPED_GEMMS)
@@ -78,6 +80,8 @@ function(process_target target_name enable_hopper enable_blackwell)
           OR "121" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG
          ))
 
+    target_compile_definitions(${target_name}
+                               PRIVATE CUTLASS_ENABLE_GDC_FOR_SM100=1)
     # Both 100 and 103 support these kernels
     if("100" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG
        OR "103" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG)

@@ -7,6 +7,7 @@ accuracy_model=${2}
 accuracy_tasks=${3}
 model_path=${4}
 model_args_extra=${5}
+output_dir=${6:-${full_logdir}/accuracy_eval}  # optional 6th arg; default keeps 5-argument callers working
 
 echo "Starting accuracy evaluation..."
 echo "Log directory: ${full_logdir}"
@@ -31,9 +32,12 @@ echo "Installing lm_eval[api] and running evaluation..."
 pip install lm_eval[api]==0.4.8
 
 echo "Running lm_eval with tasks: ${accuracy_tasks}..."
+
+mkdir -p "${output_dir}"  # quoted so a path with spaces or glob characters stays one argument
 lm_eval --model ${accuracy_model} \
     --tasks ${accuracy_tasks} \
     --model_args model=${model_path},base_url=${base_url},${model_args_extra} \
+    --output_path "${output_dir}" --log_samples \
     --trust_remote_code
 
 echo "Accuracy evaluation completed successfully"
@@ -276,7 +276,7 @@ if [ "${enable_accuracy_test}" = "true" ]; then
         --mpi=pmix --overlap -N 1 -n 1 \
         bash ${work_dir}/accuracy_eval.sh \
         "${full_logdir}" "${accuracy_model}" "${accuracy_tasks}" "${model_path}" \
-        "${model_args_extra}" \
+        "${model_args_extra}" "${full_logdir}/accuracy_eval" \
         &> ${full_logdir}/accuracy_eval.log; then
         cleanup_on_failure "Accuracy evaluation failed. Check ${full_logdir}/accuracy_eval.log for details"
     fi

@@ -83,7 +83,7 @@ BUILD_CORES_REQUEST = "8"
 BUILD_CORES_LIMIT = "8"
 BUILD_MEMORY_REQUEST = "48Gi"
 BUILD_MEMORY_LIMIT = "96Gi"
-BUILD_JOBS = "8"
+BUILD_JOBS = "4"
 
 SLURM_CORES_REQUEST = "1"
 SLURM_CORES_LIMIT = "1"

@@ -1707,7 +1707,8 @@ def previous_seq_slots_device():
         num_draft_tokens = len(draft_tokens)
         total_num_tokens = len(position_ids)
         assert total_num_tokens <= self.max_num_tokens, (
-            "total_num_tokens should be less than or equal to max_num_tokens")
+            f"total_num_tokens ({total_num_tokens}) should be less than or equal to max_num_tokens ({self.max_num_tokens})"
+        )
         # if exist requests that do not have previous batch, copy input_ids and draft_tokens
         if num_tokens > 0:
             input_ids = torch.tensor(input_ids,