diff --git a/csrc/fused_moe/cutlass_backend/cutlass_fused_moe_kernels.cuh b/csrc/fused_moe/cutlass_backend/cutlass_fused_moe_kernels.cuh index a7f03548aa..f93d5469bd 100644 --- a/csrc/fused_moe/cutlass_backend/cutlass_fused_moe_kernels.cuh +++ b/csrc/fused_moe/cutlass_backend/cutlass_fused_moe_kernels.cuh @@ -154,7 +154,7 @@ __global__ void buildMinLatencyActiveExpertMapsKernel( bool const smart_routing, int const cluster_rank, int const cluster_size, int const num_experts_smem) { #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif // Use one block to process the min latency case int tid = threadIdx.x; @@ -247,7 +247,7 @@ __global__ void buildMinLatencyActiveExpertMapsKernel( } } #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif } @@ -309,7 +309,7 @@ __global__ void fusedBuildExpertMapsSortFirstTokenKernel( // Wait PDL before reading token_selected_experts #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif // build expert map @@ -350,7 +350,7 @@ __global__ void fusedBuildExpertMapsSortFirstTokenKernel( // We are done with compute, launch the dependent kernels while the stores are in flight #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif // write to shared memory and global memory @@ -550,7 +550,7 @@ __global__ void blockExpertPrefixSumKernel(int const* token_selected_experts, int const token_id = block_id * kNumTokensPerBlock + threadIdx.x; #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif int expanded_token_id = -1; @@ -579,7 +579,7 @@ __global__ void blockExpertPrefixSumKernel(int const* token_selected_experts, } #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif } @@ -633,7 +633,7 @@ __global__ void globalExpertPrefixSumLargeKernel(int const* blocked_expert_count int cnt = 0; #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif // Note: Because of limited registers, cannot store thread-level prefix sum or enable #pragma @@ -662,7 +662,7 @@ __global__ void globalExpertPrefixSumLargeKernel(int const* blocked_expert_count } #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif } @@ -676,7 +676,7 @@ __global__ void globalExpertPrefixSumKernel(int const* blocked_expert_counts, __shared__ typename BlockScan::TempStorage temp_storage; #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif int const cnt = threadIdx.x < num_experts_per_node * num_blocks_per_seq @@ -696,7 +696,7 @@ __global__ void globalExpertPrefixSumKernel(int const* blocked_expert_counts, } #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif } @@ -759,7 +759,7 @@ __global__ void mergeExpertPrefixSumKernel(int const* blocked_expert_counts, int const token_id = block_id * blockDim.x + threadIdx.x; #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif int const cnt = blocked_expert_counts[target_expert_id * num_blocks_per_seq + block_id]; @@ -774,7 +774,7 @@ __global__ void mergeExpertPrefixSumKernel(int const* blocked_expert_counts, } #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif } @@ -1241,7 +1241,7 @@ __global__ void computeStridesTmaWarpSpecializedKernel( } #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif // Both gemms use the same token offset @@ -1319,7 +1319,7 @@ __global__ void computeStridesTmaWarpSpecializedKernel( quant_params.groupwise.fc2.weight_scales), bias2, gemm2_output, router_scales, permuted_row_to_unpermuted_row, expert); #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif } @@ -1386,7 +1386,7 @@ __global__ void expandInputRowsKernel( "of the expansion"); #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif constexpr int VecSize = is_nvfp4 ? TmaWarpSpecializedGroupedGemmInput::NVFP4BlockScaleVectorSize @@ -1508,7 +1508,7 @@ __global__ void expandInputRowsKernel( } #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif // Pad zeros in the extra SFs along the N dimension, we do this to ensure there are no nan values @@ -1710,7 +1710,7 @@ __global__ void finalizeMoeRoutingKernel( auto* reduced_row_ptr_v = reinterpret_cast(reduced_row_ptr); #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif #pragma unroll @@ -1746,7 +1746,7 @@ __global__ void finalizeMoeRoutingKernel( reduced_row_ptr_v[elem_index] = output_elem; } #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif } @@ -1766,7 +1766,7 @@ __global__ void finalizeMoeRoutingNoFillingKernel( assert(unpadded_cols <= padded_cols); #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif int64_t const num_valid_tokens = expert_first_token_offset[num_experts_per_node]; @@ -1849,7 +1849,7 @@ __global__ void finalizeMoeRoutingNoFillingKernel( } } #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif } @@ -2101,7 +2101,7 @@ __global__ void doActivationKernel(T* output, GemmOutputType const* gemm_result, int64_t const num_valid_tokens = expert_first_token_offset[num_experts_per_node]; #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif for (int64_t token = blockIdx.x; token < num_valid_tokens; token += gridDim.x) { size_t gemm_result_offset = token * inter_size * gated_size_mul; @@ -2216,7 +2216,7 @@ __global__ void doActivationKernel(T* output, GemmOutputType const* gemm_result, } #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif // Pad zeros in the extra SFs along the N dimension, we do this to ensure there are no nan values diff --git a/csrc/fused_moe/noAuxTcKernels.cu b/csrc/fused_moe/noAuxTcKernels.cu index 1f57d9b57b..bcf8f849ae 100644 --- a/csrc/fused_moe/noAuxTcKernels.cu +++ b/csrc/fused_moe/noAuxTcKernels.cu @@ -32,7 +32,7 @@ __global__ void deepseek_v3_topk_kernel(InputT* scores, OutputT* topkValues, Idx int64_t const numExpertsPerGroup, double const routedScalingFactor) { #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif // declare shared memory structure @@ -216,7 +216,7 @@ __global__ void deepseek_v3_topk_kernel(InputT* scores, OutputT* topkValues, Idx } #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif } diff --git a/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/arch/grid_dependency_control.h b/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/arch/grid_dependency_control.h index 5f71a94886..ab018fc885 100644 --- a/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/arch/grid_dependency_control.h +++ b/csrc/nv_internal/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/arch/grid_dependency_control.h @@ -43,7 +43,7 @@ namespace arch { CUTLASS_DEVICE void launch_dependent_grids() { #if (defined(CUTLASS_GDC_ENABLED)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif } @@ -53,7 +53,7 @@ void launch_dependent_grids() { CUTLASS_DEVICE void wait_on_dependent_grids() { #if (defined(CUTLASS_GDC_ENABLED)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif } diff --git a/csrc/nv_internal/tensorrt_llm/kernels/quantization.cuh b/csrc/nv_internal/tensorrt_llm/kernels/quantization.cuh index 72ba3a812d..32965c3d70 100644 --- a/csrc/nv_internal/tensorrt_llm/kernels/quantization.cuh +++ b/csrc/nv_internal/tensorrt_llm/kernels/quantization.cuh @@ -226,7 +226,7 @@ quantize_with_block_size( int numPaddedColThreads = numPaddedCols / ELTS_PER_THREAD; int numColThreadsForSf = numColsForSf / ELTS_PER_THREAD; - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); // Input tensor batch/row/col loops. // Optimization: Iterate over actual rows first (hot path), then padding rows (cold path) @@ -313,7 +313,7 @@ quantize_with_block_size( } } } - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif } @@ -376,7 +376,7 @@ quantize_with_block_size_tma( int numPaddedRowsForSf = isSfSwizzledLayout ? PadUpFn(numRows, rowTile) : numRows; int numColsForSf = isSfSwizzledLayout ? PadUpFn(numPaddedCols, 4 * SF_VEC_SIZE) : numPaddedCols; - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); // TMA barrier initialization. if (warpIdx == 0 and laneIdx == 0) { @@ -501,7 +501,7 @@ quantize_with_block_size_tma( } } } - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif } diff --git a/csrc/xqa/utils.cuh b/csrc/xqa/utils.cuh index a9ac1805b9..485ec1ac93 100644 --- a/csrc/xqa/utils.cuh +++ b/csrc/xqa/utils.cuh @@ -768,13 +768,13 @@ __device__ inline bool warpElectSync() { __device__ inline void preExit() { #if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - asm volatile("griddepcontrol.launch_dependents;\n"); + cudaTriggerProgrammaticLaunchCompletion(); #endif } __device__ inline void acqBulk() { #if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) - asm volatile("griddepcontrol.wait;\n"); + cudaGridDependencySynchronize(); #endif } diff --git a/include/flashinfer/activation.cuh b/include/flashinfer/activation.cuh index 6e9f029923..b57b5c62ed 100644 --- a/include/flashinfer/activation.cuh +++ b/include/flashinfer/activation.cuh @@ -34,7 +34,7 @@ __global__ void act_and_mul_kernel(T* __restrict__ out, const T* __restrict__ in const int64_t offset = token_idx * 2 * d; #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif #pragma unroll 1 @@ -59,7 +59,7 @@ __global__ void act_and_mul_kernel(T* __restrict__ out, const T* __restrict__ in } #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif } diff --git a/include/flashinfer/attention/blackwell/kernel/sm100_fmha_fwd_kernel_tma_warpspecialized.hpp b/include/flashinfer/attention/blackwell/kernel/sm100_fmha_fwd_kernel_tma_warpspecialized.hpp index ebf38d7347..a3707e327c 100644 --- a/include/flashinfer/attention/blackwell/kernel/sm100_fmha_fwd_kernel_tma_warpspecialized.hpp +++ b/include/flashinfer/attention/blackwell/kernel/sm100_fmha_fwd_kernel_tma_warpspecialized.hpp @@ -178,7 +178,7 @@ struct Sm100FmhaFwdKernelTmaWarpspecialized { CUTLASS_DEVICE void operator()(const Params& params, char* smem) { #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif TileScheduler tile_scheduler{params.tile_scheduler}; diff --git a/include/flashinfer/attention/blackwell/plan.cuh b/include/flashinfer/attention/blackwell/plan.cuh index 445e9a4954..ead3f4e6ea 100644 --- a/include/flashinfer/attention/blackwell/plan.cuh +++ b/include/flashinfer/attention/blackwell/plan.cuh @@ -140,7 +140,7 @@ __global__ void plan_kernel(int* qo_segment_offsets, int* kv_segment_offsets, in } } #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif } diff --git a/include/flashinfer/attention/cascade.cuh b/include/flashinfer/attention/cascade.cuh index 09f00f6852..9bcbbf1824 100644 --- a/include/flashinfer/attention/cascade.cuh +++ b/include/flashinfer/attention/cascade.cuh @@ -381,7 +381,7 @@ __global__ void PersistentVariableLengthMergeStatesKernel( float* s_smem = (float*)(smem + num_smem_stages * bdy * head_dim * sizeof(DTypeIn)); #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif #pragma unroll 1 @@ -462,7 +462,7 @@ __global__ void PersistentVariableLengthMergeStatesKernel( } } #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif } @@ -485,7 +485,7 @@ __global__ void PersistentVariableLengthAttentionSumKernel(DTypeIn* __restrict__ vec_t v_sum_vec; #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif #pragma unroll 1 @@ -548,7 +548,7 @@ __global__ void PersistentVariableLengthAttentionSumKernel(DTypeIn* __restrict__ v_sum_vec.cast_store(v_sum + (pos * num_heads + head_idx) * head_dim + tx * vec_size); } #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif } diff --git a/include/flashinfer/attention/decode.cuh b/include/flashinfer/attention/decode.cuh index cb0ad3be7c..eb6d1ef18d 100644 --- a/include/flashinfer/attention/decode.cuh +++ b/include/flashinfer/attention/decode.cuh @@ -458,7 +458,7 @@ __device__ __inline__ void BatchDecodeWithPagedKVCacheDevice(const Params& param float(2 * ((tx * vec_size + i) % (head_dim / 2))) / float(head_dim)); } #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif // apply rotary embedding to q matrix q_vec = vec_apply_llama_rope( @@ -466,7 +466,7 @@ __device__ __inline__ void BatchDecodeWithPagedKVCacheDevice(const Params& param } else { // do not apply rotary embedding to q matrix #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif q_vec.cast_load(q + batch_idx * q_stride_n + qo_head_idx * q_stride_h + tx * vec_size); } @@ -603,7 +603,7 @@ __device__ __inline__ void BatchDecodeWithPagedKVCacheDevice(const Params& param } } #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif } @@ -962,7 +962,7 @@ __global__ void BatchDecodeWithPagedKVCacheKernelMLA(Params params) { float(head_dim_kpe)); } #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif // load q_nope and q_pe tile #pragma unroll @@ -1089,7 +1089,7 @@ __global__ void BatchDecodeWithPagedKVCacheKernelMLA(Params params) { } } #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif } diff --git a/include/flashinfer/attention/prefill.cuh b/include/flashinfer/attention/prefill.cuh index 5db013bb03..bb406a54e0 100644 --- a/include/flashinfer/attention/prefill.cuh +++ b/include/flashinfer/attention/prefill.cuh @@ -1831,7 +1831,7 @@ __global__ __launch_bounds__(KTraits::NUM_THREADS) void BatchPrefillWithRaggedKV get_warp_idx_q(tid.y) * NUM_MMA_Q * 16 + lane_idx % 16, lane_idx / 16); #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif load_q_global_smem(qo_packed_idx_base, qo_upper_bound, q_ptr_base, q_stride_n, @@ -2014,7 +2014,7 @@ __global__ __launch_bounds__(KTraits::NUM_THREADS) void BatchPrefillWithRaggedKV } } #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif #if (__CUDA_ARCH__ < 800) } @@ -2144,7 +2144,7 @@ __device__ __forceinline__ void BatchPrefillWithPagedKVCacheDevice( get_warp_idx_q(tid.y) * NUM_MMA_Q * 16 + lane_idx % 16, lane_idx / 16); #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif load_q_global_smem(qo_packed_idx_base, qo_upper_bound, q_ptr_base, q_stride_n, @@ -2404,7 +2404,7 @@ __device__ __forceinline__ void BatchPrefillWithPagedKVCacheDevice( } #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif #if (__CUDA_ARCH__ < 800) diff --git a/include/flashinfer/comm/trtllm_moe_allreduce_fusion.cuh b/include/flashinfer/comm/trtllm_moe_allreduce_fusion.cuh index 143e25de9c..a8e3eb78fa 100644 --- a/include/flashinfer/comm/trtllm_moe_allreduce_fusion.cuh +++ b/include/flashinfer/comm/trtllm_moe_allreduce_fusion.cuh @@ -934,7 +934,7 @@ template params) { #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) @@ -1075,7 +1075,7 @@ __global__ void moereduce_allreduce_fusion_kernel_oneshot_lamport( #endif #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif } diff --git a/include/flashinfer/gemm/dsv3_router_gemm.cuh b/include/flashinfer/gemm/dsv3_router_gemm.cuh index aa9811c985..6d188b0c38 100644 --- a/include/flashinfer/gemm/dsv3_router_gemm.cuh +++ b/include/flashinfer/gemm/dsv3_router_gemm.cuh @@ -70,7 +70,7 @@ __global__ __launch_bounds__(128, 1) void router_gemm_kernel(Tout* out, Tin cons } #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif // Process the GEMM in chunks @@ -154,7 +154,7 @@ __global__ __launch_bounds__(128, 1) void router_gemm_kernel(Tout* out, Tin cons } } #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif } } // namespace flashinfer::trtllm_dsv3_router_gemm diff --git a/include/flashinfer/gemm/group_gemm_fp8_groupwise_sm100.cuh b/include/flashinfer/gemm/group_gemm_fp8_groupwise_sm100.cuh index 03ece19e5f..3773b26b66 100644 --- a/include/flashinfer/gemm/group_gemm_fp8_groupwise_sm100.cuh +++ b/include/flashinfer/gemm/group_gemm_fp8_groupwise_sm100.cuh @@ -46,8 +46,8 @@ __global__ void compute_sm100_cutlass_group_gemm_args( int sf_n = n / scale_granularity_n; int sf_k = k / scale_granularity_k; #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.wait;"); - asm volatile("griddepcontrol.launch_dependents;"); + cudaGridDependencySynchronize(); + cudaTriggerProgrammaticLaunchCompletion(); #endif int m_offset = m_indptr[i]; int m_offset_next = m_indptr[i + 1]; diff --git a/include/flashinfer/gemm/group_gemm_mxfp4_groupwise_sm100.cuh b/include/flashinfer/gemm/group_gemm_mxfp4_groupwise_sm100.cuh index 8856a218a1..9dedb9b1a1 100644 --- a/include/flashinfer/gemm/group_gemm_mxfp4_groupwise_sm100.cuh +++ b/include/flashinfer/gemm/group_gemm_mxfp4_groupwise_sm100.cuh @@ -57,8 +57,8 @@ __global__ void compute_sm100_cutlass_group_gemm_args( int swizzled_k = (k + alignment_swizzled_k - 1) / alignment_swizzled_k * alignment_swizzled_k; int sf_k = swizzled_k / ScaleGranularity; #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.wait;"); - asm volatile("griddepcontrol.launch_dependents;"); + cudaGridDependencySynchronize(); + cudaTriggerProgrammaticLaunchCompletion(); #endif int m_offset = m_indptr[i]; int m_offset_next = m_indptr[i + 1]; diff --git a/include/flashinfer/norm.cuh b/include/flashinfer/norm.cuh index 6814e892d1..8686bbebfa 100644 --- a/include/flashinfer/norm.cuh +++ b/include/flashinfer/norm.cuh @@ -50,7 +50,7 @@ __global__ void RMSNormKernel(T* __restrict__ input, T* __restrict__ weight, T* float sum_sq = 0.f; #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif for (uint32_t i = 0; i < rounds; i++) { @@ -106,7 +106,7 @@ __global__ void RMSNormKernel(T* __restrict__ input, T* __restrict__ weight, T* } } #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif } @@ -164,7 +164,7 @@ __global__ void RMSNormQuantKernel(T* __restrict__ input, T* __restrict__ weight float sum_sq = 0.f; #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif for (uint32_t i = 0; i < rounds; i++) { @@ -222,7 +222,7 @@ __global__ void RMSNormQuantKernel(T* __restrict__ input, T* __restrict__ weight } } #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif } @@ -280,7 +280,7 @@ __global__ void QKRMSNormKernel(T* __restrict__ input, T* __restrict__ weight, const uint32_t rounds = ceil_div(d, VEC_SIZE * num_threads); #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif for (uint32_t job_idx = worker_idx; job_idx < num_jobs; job_idx += num_workers) { @@ -335,7 +335,7 @@ __global__ void QKRMSNormKernel(T* __restrict__ input, T* __restrict__ weight, } } #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif } @@ -400,7 +400,7 @@ __global__ void FusedAddRMSNormKernel(T* __restrict__ input, T* __restrict__ res float sum_sq = 0.f; #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif for (uint32_t i = 0; i < rounds; i++) { @@ -472,7 +472,7 @@ __global__ void FusedAddRMSNormKernel(T* __restrict__ input, T* __restrict__ res } } #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif } @@ -533,7 +533,7 @@ __global__ void FusedAddRMSNormQuantKernel(T* __restrict__ input, T* __restrict_ float sum_sq = 0.f; #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif for (uint32_t i = 0; i < rounds; i++) { @@ -606,7 +606,7 @@ __global__ void FusedAddRMSNormQuantKernel(T* __restrict__ input, T* __restrict_ } } #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif } diff --git a/include/flashinfer/pos_enc.cuh b/include/flashinfer/pos_enc.cuh index 4fdd75e0a3..c03b188642 100644 --- a/include/flashinfer/pos_enc.cuh +++ b/include/flashinfer/pos_enc.cuh @@ -438,7 +438,7 @@ __global__ void RopeQuantizeKernel( size_t k_rope_out_stride, size_t k_rope_out_stride_h, size_t k_nope_out_stride, size_t k_nope_out_stride_h, float quant_scale_q, float quant_scale_kv) { // generalized kernel #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif uint32_t bx = blockIdx.x, tx = threadIdx.x, ty = threadIdx.y; uint32_t by = blockIdx.y; @@ -570,7 +570,7 @@ __global__ void RopeQuantizeKernel( } } #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif } @@ -812,7 +812,7 @@ __global__ void RopeQuantizeAppendPagedKVCacheKernel( float* __restrict__ cos_sin_cache, RoPEIdType* __restrict__ pos_ids, const RopeQuantizeAppendPagedKVCacheParams params) { #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif uint32_t bx = blockIdx.x, tx = threadIdx.x, ty = threadIdx.y; uint32_t by = blockIdx.y; @@ -1025,7 +1025,7 @@ __global__ void RopeQuantizeAppendPagedKVCacheKernel( } } #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif } diff --git a/include/flashinfer/sampling.cuh b/include/flashinfer/sampling.cuh index ee9a4924db..8873b663eb 100644 --- a/include/flashinfer/sampling.cuh +++ b/include/flashinfer/sampling.cuh @@ -301,7 +301,7 @@ __global__ void OnlineSoftmaxFusedKernel(DType* logits, DType* output, DType* te float threadlocal_running_denominator = 0.0f; #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif // Pass 1: Compute running max and denominator @@ -389,7 +389,7 @@ __global__ void OnlineSoftmaxFusedKernel(DType* logits, DType* output, DType* te } } #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif } @@ -420,7 +420,7 @@ __global__ void OnlineSoftmaxMapKernel(DType* logits, PartialSoftmaxResult* part float threadlocal_running_denominator = 0.0f; #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif #pragma unroll 2 @@ -474,7 +474,7 @@ __global__ void OnlineSoftmaxMapKernel(DType* logits, PartialSoftmaxResult* part partial_results[bx * num_slices + by] = {running_max, running_denominator}; } #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif } @@ -498,7 +498,7 @@ __global__ void OnlineSoftmaxReduceKernel(DType* logits, DType* output, float2 thread_aggregate = make_float2(-cuda::std::numeric_limits::infinity(), 0.0f); #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif for (uint32_t i = tx; i < num_slices; i += BLOCK_THREADS) { @@ -545,7 +545,7 @@ __global__ void OnlineSoftmaxReduceKernel(DType* logits, DType* output, } } #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif } diff --git a/include/flashinfer/trtllm/fmha/lse.cuh b/include/flashinfer/trtllm/fmha/lse.cuh index b41d084ace..fb93a75083 100644 --- a/include/flashinfer/trtllm/fmha/lse.cuh +++ b/include/flashinfer/trtllm/fmha/lse.cuh @@ -27,14 +27,14 @@ __global__ void ComputeLSEFromMDKernel(float2* __restrict__ md, float* __restric int elem_idx = blockIdx.x * blockDim.x + threadIdx.x; if (elem_idx >= n) return; #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif float2 md_elem = md[elem_idx]; float m = md_elem.x; float d = md_elem.y; lse[elem_idx] = math::log2e * m + math::ptx_log2(d); #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif }