diff --git a/csrc/fused_moe/cutlass_backend/cutlass_fused_moe_kernels.cuh b/csrc/fused_moe/cutlass_backend/cutlass_fused_moe_kernels.cuh
index a7f03548aa..f93d5469bd 100644
--- a/csrc/fused_moe/cutlass_backend/cutlass_fused_moe_kernels.cuh
+++ b/csrc/fused_moe/cutlass_backend/cutlass_fused_moe_kernels.cuh
@@ -154,7 +154,7 @@ __global__ void buildMinLatencyActiveExpertMapsKernel(
     bool const smart_routing, int const cluster_rank, int const cluster_size,
     int const num_experts_smem) {
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.wait;");
+  cudaGridDependencySynchronize();
 #endif
   // Use one block to process the min latency case
   int tid = threadIdx.x;
@@ -247,7 +247,7 @@ __global__ void buildMinLatencyActiveExpertMapsKernel(
     }
   }
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.launch_dependents;");
+  cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
 
@@ -309,7 +309,7 @@ __global__ void fusedBuildExpertMapsSortFirstTokenKernel(
 
   // Wait PDL before reading token_selected_experts
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.wait;");
+  cudaGridDependencySynchronize();
 #endif
 
 // build expert map
@@ -350,7 +350,7 @@ __global__ void fusedBuildExpertMapsSortFirstTokenKernel(
 
 // We are done with compute, launch the dependent kernels while the stores are in flight
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.launch_dependents;");
+  cudaTriggerProgrammaticLaunchCompletion();
 #endif
 
   // write to shared memory and global memory
@@ -550,7 +550,7 @@ __global__ void blockExpertPrefixSumKernel(int const* token_selected_experts,
   int const token_id = block_id * kNumTokensPerBlock + threadIdx.x;
 
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.wait;");
+  cudaGridDependencySynchronize();
 #endif
 
   int expanded_token_id = -1;
@@ -579,7 +579,7 @@ __global__ void blockExpertPrefixSumKernel(int const* token_selected_experts,
   }
 
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.launch_dependents;");
+  cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
 
@@ -633,7 +633,7 @@ __global__ void globalExpertPrefixSumLargeKernel(int const* blocked_expert_count
   int cnt = 0;
 
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.wait;");
+  cudaGridDependencySynchronize();
 #endif
 
   // Note: Because of limited registers, cannot store thread-level prefix sum or enable #pragma
@@ -662,7 +662,7 @@ __global__ void globalExpertPrefixSumLargeKernel(int const* blocked_expert_count
   }
 
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.launch_dependents;");
+  cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
 
@@ -676,7 +676,7 @@ __global__ void globalExpertPrefixSumKernel(int const* blocked_expert_counts,
   __shared__ typename BlockScan::TempStorage temp_storage;
 
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.wait;");
+  cudaGridDependencySynchronize();
 #endif
 
   int const cnt = threadIdx.x < num_experts_per_node * num_blocks_per_seq
@@ -696,7 +696,7 @@ __global__ void globalExpertPrefixSumKernel(int const* blocked_expert_counts,
   }
 
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.launch_dependents;");
+  cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
 
@@ -759,7 +759,7 @@ __global__ void mergeExpertPrefixSumKernel(int const* blocked_expert_counts,
   int const token_id = block_id * blockDim.x + threadIdx.x;
 
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.wait;");
+  cudaGridDependencySynchronize();
 #endif
 
   int const cnt = blocked_expert_counts[target_expert_id * num_blocks_per_seq + block_id];
@@ -774,7 +774,7 @@ __global__ void mergeExpertPrefixSumKernel(int const* blocked_expert_counts,
   }
 
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.launch_dependents;");
+  cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
 
@@ -1241,7 +1241,7 @@ __global__ void computeStridesTmaWarpSpecializedKernel(
   }
 
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.wait;");
+  cudaGridDependencySynchronize();
 #endif
 
   // Both gemms use the same token offset
@@ -1319,7 +1319,7 @@ __global__ void computeStridesTmaWarpSpecializedKernel(
           quant_params.groupwise.fc2.weight_scales),
       bias2, gemm2_output, router_scales, permuted_row_to_unpermuted_row, expert);
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.launch_dependents;");
+  cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
 
@@ -1386,7 +1386,7 @@ __global__ void expandInputRowsKernel(
                 "of the expansion");
 
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.wait;");
+  cudaGridDependencySynchronize();
 #endif
 
   constexpr int VecSize = is_nvfp4 ? TmaWarpSpecializedGroupedGemmInput::NVFP4BlockScaleVectorSize
@@ -1508,7 +1508,7 @@ __global__ void expandInputRowsKernel(
   }
 
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.launch_dependents;");
+  cudaTriggerProgrammaticLaunchCompletion();
 #endif
 
   // Pad zeros in the extra SFs along the N dimension, we do this to ensure there are no nan values
@@ -1710,7 +1710,7 @@ __global__ void finalizeMoeRoutingKernel(
   auto* reduced_row_ptr_v = reinterpret_cast<OutputElem*>(reduced_row_ptr);
 
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.wait;");
+  cudaGridDependencySynchronize();
 #endif
 
 #pragma unroll
@@ -1746,7 +1746,7 @@ __global__ void finalizeMoeRoutingKernel(
     reduced_row_ptr_v[elem_index] = output_elem;
   }
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.launch_dependents;");
+  cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
 
@@ -1766,7 +1766,7 @@ __global__ void finalizeMoeRoutingNoFillingKernel(
   assert(unpadded_cols <= padded_cols);
 
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.wait;");
+  cudaGridDependencySynchronize();
 #endif
 
   int64_t const num_valid_tokens = expert_first_token_offset[num_experts_per_node];
@@ -1849,7 +1849,7 @@ __global__ void finalizeMoeRoutingNoFillingKernel(
     }
   }
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.launch_dependents;");
+  cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
 
@@ -2101,7 +2101,7 @@ __global__ void doActivationKernel(T* output, GemmOutputType const* gemm_result,
   int64_t const num_valid_tokens = expert_first_token_offset[num_experts_per_node];
 
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.wait;");
+  cudaGridDependencySynchronize();
 #endif
   for (int64_t token = blockIdx.x; token < num_valid_tokens; token += gridDim.x) {
     size_t gemm_result_offset = token * inter_size * gated_size_mul;
@@ -2216,7 +2216,7 @@ __global__ void doActivationKernel(T* output, GemmOutputType const* gemm_result,
   }
 
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.launch_dependents;");
+  cudaTriggerProgrammaticLaunchCompletion();
 #endif
 
   // Pad zeros in the extra SFs along the N dimension, we do this to ensure there are no nan values
diff --git a/csrc/fused_moe/noAuxTcKernels.cu b/csrc/fused_moe/noAuxTcKernels.cu
index 1f57d9b57b..bcf8f849ae 100644
--- a/csrc/fused_moe/noAuxTcKernels.cu
+++ b/csrc/fused_moe/noAuxTcKernels.cu
@@ -32,7 +32,7 @@ __global__ void deepseek_v3_topk_kernel(InputT* scores, OutputT* topkValues, Idx
                                         int64_t const numExpertsPerGroup,
                                         double const routedScalingFactor) {
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.wait;");
+  cudaGridDependencySynchronize();
 #endif
 
   // declare shared memory structure
@@ -216,7 +216,7 @@ __global__ void deepseek_v3_topk_kernel(InputT* scores, OutputT* topkValues, Idx
   }
 
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.launch_dependents;");
+  cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
 
diff --git a/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/arch/grid_dependency_control.h b/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/arch/grid_dependency_control.h
index 5f71a94886..ab018fc885 100644
--- a/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/arch/grid_dependency_control.h
+++ b/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/arch/grid_dependency_control.h
@@ -43,7 +43,7 @@ namespace arch {
 CUTLASS_DEVICE
 void launch_dependent_grids() {
 #if (defined(CUTLASS_GDC_ENABLED))
-  asm volatile("griddepcontrol.launch_dependents;");
+  cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
 
@@ -53,7 +53,7 @@ void launch_dependent_grids() {
 CUTLASS_DEVICE
 void wait_on_dependent_grids() {
 #if (defined(CUTLASS_GDC_ENABLED))
-  asm volatile("griddepcontrol.wait;");
+  cudaGridDependencySynchronize();
 #endif
 }
 
diff --git a/csrc/nv_internal/tensorrt_llm/kernels/quantization.cuh b/csrc/nv_internal/tensorrt_llm/kernels/quantization.cuh
index 72ba3a812d..32965c3d70 100644
--- a/csrc/nv_internal/tensorrt_llm/kernels/quantization.cuh
+++ b/csrc/nv_internal/tensorrt_llm/kernels/quantization.cuh
@@ -226,7 +226,7 @@ quantize_with_block_size(
   int numPaddedColThreads = numPaddedCols / ELTS_PER_THREAD;
   int numColThreadsForSf = numColsForSf / ELTS_PER_THREAD;
 
-  asm volatile("griddepcontrol.wait;");
+  cudaGridDependencySynchronize();
 
   // Input tensor batch/row/col loops.
   // Optimization: Iterate over actual rows first (hot path), then padding rows (cold path)
@@ -313,7 +313,7 @@ quantize_with_block_size(
       }
     }
   }
-  asm volatile("griddepcontrol.launch_dependents;");
+  cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
 
@@ -376,7 +376,7 @@ quantize_with_block_size_tma(
   int numPaddedRowsForSf = isSfSwizzledLayout ? PadUpFn(numRows, rowTile) : numRows;
   int numColsForSf = isSfSwizzledLayout ? PadUpFn(numPaddedCols, 4 * SF_VEC_SIZE) : numPaddedCols;
 
-  asm volatile("griddepcontrol.wait;");
+  cudaGridDependencySynchronize();
 
   // TMA barrier initialization.
   if (warpIdx == 0 and laneIdx == 0) {
@@ -501,7 +501,7 @@ quantize_with_block_size_tma(
       }
     }
   }
-  asm volatile("griddepcontrol.launch_dependents;");
+  cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
 
diff --git a/csrc/xqa/utils.cuh b/csrc/xqa/utils.cuh
index a9ac1805b9..485ec1ac93 100644
--- a/csrc/xqa/utils.cuh
+++ b/csrc/xqa/utils.cuh
@@ -768,13 +768,13 @@ __device__ inline bool warpElectSync() {
 
 __device__ inline void preExit() {
 #if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
-  asm volatile("griddepcontrol.launch_dependents;\n");
+  cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
 
 __device__ inline void acqBulk() {
 #if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
-  asm volatile("griddepcontrol.wait;\n");
+  cudaGridDependencySynchronize();
 #endif
 }
 
diff --git a/include/flashinfer/activation.cuh b/include/flashinfer/activation.cuh
index 6e9f029923..b57b5c62ed 100644
--- a/include/flashinfer/activation.cuh
+++ b/include/flashinfer/activation.cuh
@@ -34,7 +34,7 @@ __global__ void act_and_mul_kernel(T* __restrict__ out, const T* __restrict__ in
   const int64_t offset = token_idx * 2 * d;
 
 #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.wait;");
+  cudaGridDependencySynchronize();
 #endif
 
 #pragma unroll 1
@@ -59,7 +59,7 @@ __global__ void act_and_mul_kernel(T* __restrict__ out, const T* __restrict__ in
   }
 
 #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.launch_dependents;");
+  cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
 
diff --git a/include/flashinfer/attention/blackwell/kernel/sm100_fmha_fwd_kernel_tma_warpspecialized.hpp b/include/flashinfer/attention/blackwell/kernel/sm100_fmha_fwd_kernel_tma_warpspecialized.hpp
index ebf38d7347..a3707e327c 100644
--- a/include/flashinfer/attention/blackwell/kernel/sm100_fmha_fwd_kernel_tma_warpspecialized.hpp
+++ b/include/flashinfer/attention/blackwell/kernel/sm100_fmha_fwd_kernel_tma_warpspecialized.hpp
@@ -178,7 +178,7 @@ struct Sm100FmhaFwdKernelTmaWarpspecialized {
 
   CUTLASS_DEVICE void operator()(const Params& params, char* smem) {
 #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.wait;");
+    cudaGridDependencySynchronize();
 #endif
 
     TileScheduler tile_scheduler{params.tile_scheduler};
diff --git a/include/flashinfer/attention/blackwell/plan.cuh b/include/flashinfer/attention/blackwell/plan.cuh
index 445e9a4954..ead3f4e6ea 100644
--- a/include/flashinfer/attention/blackwell/plan.cuh
+++ b/include/flashinfer/attention/blackwell/plan.cuh
@@ -140,7 +140,7 @@ __global__ void plan_kernel(int* qo_segment_offsets, int* kv_segment_offsets, in
     }
   }
 #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.launch_dependents;");
+  cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
 
diff --git a/include/flashinfer/attention/cascade.cuh b/include/flashinfer/attention/cascade.cuh
index 09f00f6852..9bcbbf1824 100644
--- a/include/flashinfer/attention/cascade.cuh
+++ b/include/flashinfer/attention/cascade.cuh
@@ -381,7 +381,7 @@ __global__ void PersistentVariableLengthMergeStatesKernel(
   float* s_smem = (float*)(smem + num_smem_stages * bdy * head_dim * sizeof(DTypeIn));
 
 #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.wait;");
+  cudaGridDependencySynchronize();
 #endif
 
 #pragma unroll 1
@@ -462,7 +462,7 @@ __global__ void PersistentVariableLengthMergeStatesKernel(
     }
   }
 #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.launch_dependents;");
+  cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
 
@@ -485,7 +485,7 @@ __global__ void PersistentVariableLengthAttentionSumKernel(DTypeIn* __restrict__
 
   vec_t<float, vec_size> v_sum_vec;
 #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.wait;");
+  cudaGridDependencySynchronize();
 #endif
 
 #pragma unroll 1
@@ -548,7 +548,7 @@ __global__ void PersistentVariableLengthAttentionSumKernel(DTypeIn* __restrict__
     v_sum_vec.cast_store(v_sum + (pos * num_heads + head_idx) * head_dim + tx * vec_size);
   }
 #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.launch_dependents;");
+  cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
 
diff --git a/include/flashinfer/attention/decode.cuh b/include/flashinfer/attention/decode.cuh
index cb0ad3be7c..eb6d1ef18d 100644
--- a/include/flashinfer/attention/decode.cuh
+++ b/include/flashinfer/attention/decode.cuh
@@ -458,7 +458,7 @@ __device__ __inline__ void BatchDecodeWithPagedKVCacheDevice(const Params& param
                        float(2 * ((tx * vec_size + i) % (head_dim / 2))) / float(head_dim));
     }
 #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.wait;");
+    cudaGridDependencySynchronize();
 #endif
     // apply rotary embedding to q matrix
     q_vec = vec_apply_llama_rope<vec_size, bdx>(
@@ -466,7 +466,7 @@ __device__ __inline__ void BatchDecodeWithPagedKVCacheDevice(const Params& param
   } else {
 // do not apply rotary embedding to q matrix
 #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.wait;");
+    cudaGridDependencySynchronize();
 #endif
     q_vec.cast_load(q + batch_idx * q_stride_n + qo_head_idx * q_stride_h + tx * vec_size);
   }
@@ -603,7 +603,7 @@ __device__ __inline__ void BatchDecodeWithPagedKVCacheDevice(const Params& param
     }
   }
 #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.launch_dependents;");
+  cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
 
@@ -962,7 +962,7 @@ __global__ void BatchDecodeWithPagedKVCacheKernelMLA(Params params) {
                                                           float(head_dim_kpe));
   }
 #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.wait;");
+  cudaGridDependencySynchronize();
 #endif
   // load q_nope and q_pe tile
 #pragma unroll
@@ -1089,7 +1089,7 @@ __global__ void BatchDecodeWithPagedKVCacheKernelMLA(Params params) {
     }
   }
 #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.launch_dependents;");
+  cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
 
diff --git a/include/flashinfer/attention/prefill.cuh b/include/flashinfer/attention/prefill.cuh
index 5db013bb03..bb406a54e0 100644
--- a/include/flashinfer/attention/prefill.cuh
+++ b/include/flashinfer/attention/prefill.cuh
@@ -1831,7 +1831,7 @@ __global__ __launch_bounds__(KTraits::NUM_THREADS) void BatchPrefillWithRaggedKV
         get_warp_idx_q<KTraits>(tid.y) * NUM_MMA_Q * 16 + lane_idx % 16, lane_idx / 16);
 
 #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.wait;");
+    cudaGridDependencySynchronize();
 #endif
 
     load_q_global_smem<KTraits>(qo_packed_idx_base, qo_upper_bound, q_ptr_base, q_stride_n,
@@ -2014,7 +2014,7 @@ __global__ __launch_bounds__(KTraits::NUM_THREADS) void BatchPrefillWithRaggedKV
       }
     }
 #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.launch_dependents;");
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif
 #if (__CUDA_ARCH__ < 800)
   }
@@ -2144,7 +2144,7 @@ __device__ __forceinline__ void BatchPrefillWithPagedKVCacheDevice(
         get_warp_idx_q<KTraits>(tid.y) * NUM_MMA_Q * 16 + lane_idx % 16, lane_idx / 16);
 
 #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.wait;");
+    cudaGridDependencySynchronize();
 #endif
 
     load_q_global_smem<KTraits>(qo_packed_idx_base, qo_upper_bound, q_ptr_base, q_stride_n,
@@ -2404,7 +2404,7 @@ __device__ __forceinline__ void BatchPrefillWithPagedKVCacheDevice(
     }
 
 #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-    asm volatile("griddepcontrol.launch_dependents;");
+    cudaTriggerProgrammaticLaunchCompletion();
 #endif
 
 #if (__CUDA_ARCH__ < 800)
diff --git a/include/flashinfer/comm/trtllm_moe_allreduce_fusion.cuh b/include/flashinfer/comm/trtllm_moe_allreduce_fusion.cuh
index 143e25de9c..a8e3eb78fa 100644
--- a/include/flashinfer/comm/trtllm_moe_allreduce_fusion.cuh
+++ b/include/flashinfer/comm/trtllm_moe_allreduce_fusion.cuh
@@ -934,7 +934,7 @@ template <typename T, int NRanks, bool AllReduceOut, bool ResidualOut, bool Norm
 __global__ void moereduce_allreduce_fusion_kernel_oneshot_lamport(
     MoeReductionAllReduceFusionParams<T> params) {
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.wait;");
+  cudaGridDependencySynchronize();
 #endif
 
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
@@ -1075,7 +1075,7 @@ __global__ void moereduce_allreduce_fusion_kernel_oneshot_lamport(
 #endif
 
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.launch_dependents;");
+  cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
 
diff --git a/include/flashinfer/gemm/dsv3_router_gemm.cuh b/include/flashinfer/gemm/dsv3_router_gemm.cuh
index aa9811c985..6d188b0c38 100644
--- a/include/flashinfer/gemm/dsv3_router_gemm.cuh
+++ b/include/flashinfer/gemm/dsv3_router_gemm.cuh
@@ -70,7 +70,7 @@ __global__ __launch_bounds__(128, 1) void router_gemm_kernel(Tout* out, Tin cons
   }
 
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.wait;");
+  cudaGridDependencySynchronize();
 #endif
 
   // Process the GEMM in chunks
@@ -154,7 +154,7 @@ __global__ __launch_bounds__(128, 1) void router_gemm_kernel(Tout* out, Tin cons
     }
   }
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.launch_dependents;");
+  cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
 }  // namespace flashinfer::trtllm_dsv3_router_gemm
diff --git a/include/flashinfer/gemm/group_gemm_fp8_groupwise_sm100.cuh b/include/flashinfer/gemm/group_gemm_fp8_groupwise_sm100.cuh
index 03ece19e5f..3773b26b66 100644
--- a/include/flashinfer/gemm/group_gemm_fp8_groupwise_sm100.cuh
+++ b/include/flashinfer/gemm/group_gemm_fp8_groupwise_sm100.cuh
@@ -46,8 +46,8 @@ __global__ void compute_sm100_cutlass_group_gemm_args(
   int sf_n = n / scale_granularity_n;
   int sf_k = k / scale_granularity_k;
 #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.wait;");
-  asm volatile("griddepcontrol.launch_dependents;");
+  cudaGridDependencySynchronize();
+  cudaTriggerProgrammaticLaunchCompletion();
 #endif
   int m_offset = m_indptr[i];
   int m_offset_next = m_indptr[i + 1];
diff --git a/include/flashinfer/gemm/group_gemm_mxfp4_groupwise_sm100.cuh b/include/flashinfer/gemm/group_gemm_mxfp4_groupwise_sm100.cuh
index 8856a218a1..9dedb9b1a1 100644
--- a/include/flashinfer/gemm/group_gemm_mxfp4_groupwise_sm100.cuh
+++ b/include/flashinfer/gemm/group_gemm_mxfp4_groupwise_sm100.cuh
@@ -57,8 +57,8 @@ __global__ void compute_sm100_cutlass_group_gemm_args(
   int swizzled_k = (k + alignment_swizzled_k - 1) / alignment_swizzled_k * alignment_swizzled_k;
   int sf_k = swizzled_k / ScaleGranularity;
 #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.wait;");
-  asm volatile("griddepcontrol.launch_dependents;");
+  cudaGridDependencySynchronize();
+  cudaTriggerProgrammaticLaunchCompletion();
 #endif
   int m_offset = m_indptr[i];
   int m_offset_next = m_indptr[i + 1];
diff --git a/include/flashinfer/norm.cuh b/include/flashinfer/norm.cuh
index 6814e892d1..8686bbebfa 100644
--- a/include/flashinfer/norm.cuh
+++ b/include/flashinfer/norm.cuh
@@ -50,7 +50,7 @@ __global__ void RMSNormKernel(T* __restrict__ input, T* __restrict__ weight, T*
   float sum_sq = 0.f;
 
 #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.wait;");
+  cudaGridDependencySynchronize();
 #endif
 
   for (uint32_t i = 0; i < rounds; i++) {
@@ -106,7 +106,7 @@ __global__ void RMSNormKernel(T* __restrict__ input, T* __restrict__ weight, T*
     }
   }
 #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.launch_dependents;");
+  cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
 
@@ -164,7 +164,7 @@ __global__ void RMSNormQuantKernel(T* __restrict__ input, T* __restrict__ weight
   float sum_sq = 0.f;
 
 #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.wait;");
+  cudaGridDependencySynchronize();
 #endif
 
   for (uint32_t i = 0; i < rounds; i++) {
@@ -222,7 +222,7 @@ __global__ void RMSNormQuantKernel(T* __restrict__ input, T* __restrict__ weight
     }
   }
 #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.launch_dependents;");
+  cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
 
@@ -280,7 +280,7 @@ __global__ void QKRMSNormKernel(T* __restrict__ input, T* __restrict__ weight,
   const uint32_t rounds = ceil_div(d, VEC_SIZE * num_threads);
 
 #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.wait;");
+  cudaGridDependencySynchronize();
 #endif
 
   for (uint32_t job_idx = worker_idx; job_idx < num_jobs; job_idx += num_workers) {
@@ -335,7 +335,7 @@ __global__ void QKRMSNormKernel(T* __restrict__ input, T* __restrict__ weight,
     }
   }
 #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.launch_dependents;");
+  cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
 
@@ -400,7 +400,7 @@ __global__ void FusedAddRMSNormKernel(T* __restrict__ input, T* __restrict__ res
 
   float sum_sq = 0.f;
 #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.wait;");
+  cudaGridDependencySynchronize();
 #endif
 
   for (uint32_t i = 0; i < rounds; i++) {
@@ -472,7 +472,7 @@ __global__ void FusedAddRMSNormKernel(T* __restrict__ input, T* __restrict__ res
     }
   }
 #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.launch_dependents;");
+  cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
 
@@ -533,7 +533,7 @@ __global__ void FusedAddRMSNormQuantKernel(T* __restrict__ input, T* __restrict_
 
   float sum_sq = 0.f;
 #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.wait;");
+  cudaGridDependencySynchronize();
 #endif
 
   for (uint32_t i = 0; i < rounds; i++) {
@@ -606,7 +606,7 @@ __global__ void FusedAddRMSNormQuantKernel(T* __restrict__ input, T* __restrict_
     }
   }
 #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.launch_dependents;");
+  cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
 
diff --git a/include/flashinfer/pos_enc.cuh b/include/flashinfer/pos_enc.cuh
index 4fdd75e0a3..c03b188642 100644
--- a/include/flashinfer/pos_enc.cuh
+++ b/include/flashinfer/pos_enc.cuh
@@ -438,7 +438,7 @@ __global__ void RopeQuantizeKernel(
     size_t k_rope_out_stride, size_t k_rope_out_stride_h, size_t k_nope_out_stride,
     size_t k_nope_out_stride_h, float quant_scale_q, float quant_scale_kv) {  // generalized kernel
 #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.wait;");
+  cudaGridDependencySynchronize();
 #endif
   uint32_t bx = blockIdx.x, tx = threadIdx.x, ty = threadIdx.y;
   uint32_t by = blockIdx.y;
@@ -570,7 +570,7 @@ __global__ void RopeQuantizeKernel(
     }
   }
 #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.launch_dependents;");
+  cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
 
@@ -812,7 +812,7 @@ __global__ void RopeQuantizeAppendPagedKVCacheKernel(
     float* __restrict__ cos_sin_cache, RoPEIdType* __restrict__ pos_ids,
     const RopeQuantizeAppendPagedKVCacheParams params) {
 #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.wait;");
+  cudaGridDependencySynchronize();
 #endif
   uint32_t bx = blockIdx.x, tx = threadIdx.x, ty = threadIdx.y;
   uint32_t by = blockIdx.y;
@@ -1025,7 +1025,7 @@ __global__ void RopeQuantizeAppendPagedKVCacheKernel(
     }
   }
 #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.launch_dependents;");
+  cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
 
diff --git a/include/flashinfer/sampling.cuh b/include/flashinfer/sampling.cuh
index ee9a4924db..8873b663eb 100644
--- a/include/flashinfer/sampling.cuh
+++ b/include/flashinfer/sampling.cuh
@@ -301,7 +301,7 @@ __global__ void OnlineSoftmaxFusedKernel(DType* logits, DType* output, DType* te
   float threadlocal_running_denominator = 0.0f;
 
 #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.wait;");
+  cudaGridDependencySynchronize();
 #endif
 
   // Pass 1: Compute running max and denominator
@@ -389,7 +389,7 @@ __global__ void OnlineSoftmaxFusedKernel(DType* logits, DType* output, DType* te
     }
   }
 #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.launch_dependents;");
+  cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
 
@@ -420,7 +420,7 @@ __global__ void OnlineSoftmaxMapKernel(DType* logits, PartialSoftmaxResult* part
   float threadlocal_running_denominator = 0.0f;
 
 #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.wait;");
+  cudaGridDependencySynchronize();
 #endif
 
 #pragma unroll 2
@@ -474,7 +474,7 @@ __global__ void OnlineSoftmaxMapKernel(DType* logits, PartialSoftmaxResult* part
     partial_results[bx * num_slices + by] = {running_max, running_denominator};
   }
 #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.launch_dependents;");
+  cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
 
@@ -498,7 +498,7 @@ __global__ void OnlineSoftmaxReduceKernel(DType* logits, DType* output,
   float2 thread_aggregate = make_float2(-cuda::std::numeric_limits<float>::infinity(), 0.0f);
 
 #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.wait;");
+  cudaGridDependencySynchronize();
 #endif
 
   for (uint32_t i = tx; i < num_slices; i += BLOCK_THREADS) {
@@ -545,7 +545,7 @@ __global__ void OnlineSoftmaxReduceKernel(DType* logits, DType* output,
     }
   }
 #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.launch_dependents;");
+  cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }
 
diff --git a/include/flashinfer/trtllm/fmha/lse.cuh b/include/flashinfer/trtllm/fmha/lse.cuh
index b41d084ace..fb93a75083 100644
--- a/include/flashinfer/trtllm/fmha/lse.cuh
+++ b/include/flashinfer/trtllm/fmha/lse.cuh
@@ -27,14 +27,14 @@ __global__ void ComputeLSEFromMDKernel(float2* __restrict__ md, float* __restric
   int elem_idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (elem_idx >= n) return;
 #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.wait;");
+  cudaGridDependencySynchronize();
 #endif
   float2 md_elem = md[elem_idx];
   float m = md_elem.x;
   float d = md_elem.y;
   lse[elem_idx] = math::log2e * m + math::ptx_log2(d);
 #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.launch_dependents;");
+  cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }