ROCm · AviralGoelAMD · Apr 10, 2026 · Apr 9, 2026 · Apr 9, 2026 · Apr 9, 2026
@@ -198,10 +198,6 @@ struct Epilogue
                                                               input_left_pads,
                                                               input_right_pads);
 
-        // auto res = rtc::from_gpu(out_dev);
-        // pass &= ck::utils::check_err(res, out_host, "Error: incorrect results!", 1e-5f, 1e-4f);
-        // assert(pass);
-
         // Simple check: this checks that the output from each instance matches the output from the
         // first instance
         CHECK(report(solution, check(rtc::from_gpu(out_dev))));

@@ -198,10 +198,6 @@ struct Epilogue
                                                               input_left_pads,
                                                               input_right_pads);
 
-        // auto res = rtc::from_gpu(out_dev);
-        // pass &= ck::utils::check_err(res, out_host, "Error: incorrect results!", 1e-5f, 1e-4f);
-        // assert(pass);
-
         // Simple check: this checks that the output from each instance matches the output from the
         // first instance
         CHECK(report(solution, check(rtc::from_gpu(out_dev))));

@@ -198,10 +198,6 @@ struct Epilogue
                                                               input_left_pads,
                                                               input_right_pads);
 
-        // auto res = rtc::from_gpu(out_dev);
-        // pass &= ck::utils::check_err(res, out_host, "Error: incorrect results!", 1e-5f, 1e-4f);
-        // assert(pass);
-
         // Simple check: this checks that the output from each instance matches the output from the
         // first instance
         CHECK(report(solution, check(rtc::from_gpu(out_dev))));

@@ -198,10 +198,6 @@ struct Epilogue
                                                               input_left_pads,
                                                               input_right_pads);
 
-        // auto res = rtc::from_gpu(out_dev);
-        // pass &= ck::utils::check_err(res, out_host, "Error: incorrect results!", 1e-5f, 1e-4f);
-        // assert(pass);
-
         // Simple check: this checks that the output from each instance matches the output from the
         // first instance
         CHECK(report(solution, check(rtc::from_gpu(out_dev))));

@@ -238,16 +238,6 @@ int main(int argc, char* argv[])
 
         Tensor<B1DataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, B0Layout{}));
 
-#if 0
-        for(int n = 0; n < N; ++n)
-        {
-            for(int k = 0; k < K; ++k)
-            {
-                b_element_op(b_k_n(k, n), b0_k_n(k, n), b1_k_n(k, n));
-            }
-        }
-#endif
-
         using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<A0DataType,
                                                                                 B0DataType,
                                                                                 CShuffleDataType,

@@ -252,9 +252,6 @@ int main(int argc, char* argv[])
     Tensor<ck::index_t> expert_ids(HostTensorDescriptor({sorted_tile_num}, {1}));
     Tensor<ck::index_t> sorted_token_ids(HostTensorDescriptor({sorted_size}, {1}));
     Tensor<ck::index_t> max_token_id(HostTensorDescriptor({1}));
-    // max_token_id.mData[0] = valid_size;
-    // max_token_id.mData = {valid_size, 0, 2, 3, 4, 6, 8, 10, 12, 13};
-    // int eids[]         = {0, 0, 1, 2, 3, 3, 4, 4, 5, 5, 6, 7, 7, 3, 3, 3};
     max_token_id.mData = {valid_size, 0, 1, 2, 3, 4, 5, 6, 7, 8};
     // int eids[]         = {0, 1, 2, 3, 4, 5, 6, 7, 3, 3, 3}; // {2, 1, 1, 2, 2, 2, 1, 2}
     for(int i = 0; i < sorted_tile_num; i++)

@@ -261,16 +261,6 @@ int main(int argc, char* argv[])
     Tensor<ck::index_t> max_token_id(HostTensorDescriptor({1}));
 
     max_token_id.mData = {valid_size, 0, 1, 2, 3, 4, 5, 6, 7, 8};
-    // int eids[]         = {0, 1, 3, 3, 3};
-    //  int eids[]         = {0, 1, 2, 3, 4, 5, 6, 7}; //, 3, 3, 3}; // {2, 1, 1, 2, 2, 2, 1, 2}
-    // int eids[] = {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 3, 3, 3};
-    // int eids[]         = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    //                     1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    //                     2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    //                     3, 3, 3, 3, 3, 3, 3, 3, 4, 4,
-    //                     5, 5, 5, 5, 6, 6, 6, 6, 7, 7,
-    //                     7, 7,
-    //                     3, 3, 3};
     for(int i = 0; i < sorted_tile_num; i++)
     {
         expert_ids.mData[i] = i / ck::math::integer_divide_ceil(valid_tile_num, experts);

@@ -5,14 +5,6 @@
 
 // clang-format off
 //                                                                rm  rn  tm  tn  vn  pd      x     3p
-#if 0
-template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, 1,  2,  4,  64, 8,  true , true, false>>(const S&, A);
-template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, 1,  4,  4,  64, 4,  true , true, false>>(const S&, A);
-template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, 1,  8,  4,  64, 2,  true , true, false>>(const S&, A);
-template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, 1, 16,  4,  64, 1,  true , true, false>>(const S&, A);
-
-template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, 1,  1,  1, 256, 4,  true , true, false>>(const S&, A);
-#endif
 
 template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t,  ck_tile::int8_t, 1, 1, 2,  128, 8,  true,  true, false>>(const S&, A);
 template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t,  ck_tile::int8_t, 1, 2, 2,  128, 4,  true,  true, false>>(const S&, A);

@@ -5,14 +5,6 @@
 
 // clang-format off
 //                                                                rm  rn  tm  tn  vn  pd      x     3p
-#if 0
-template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, 1,  2,  4,  64, 8,  true , true, false>>(const S&, A);
-template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, 1,  4,  4,  64, 4,  true , true, false>>(const S&, A);
-template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, 1,  8,  4,  64, 2,  true , true, false>>(const S&, A);
-template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, 1, 16,  4,  64, 1,  true , true, false>>(const S&, A);
-
-template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, 1,  1,  1, 256, 4,  true , true, false>>(const S&, A);
-#endif
 
 template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t,  ck_tile::int8_t, 1, 1, 2,  128, 8,  true,  true, false>>(const S&, A);
 template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t,  ck_tile::int8_t, 1, 2, 2,  128, 4,  true,  true, false>>(const S&, A);

@@ -5,14 +5,6 @@
 
 // clang-format off
 //                                                  rm rn  tm   tn  vn   pd   2p
-#if 0
-template float smoothquant_<trait_<ck_tile::bf16_t, 1,  2,  4,  64, 8,  true, false>>(const S&, A);
-template float smoothquant_<trait_<ck_tile::bf16_t, 1,  4,  4,  64, 4,  true, false>>(const S&, A);
-template float smoothquant_<trait_<ck_tile::bf16_t, 1,  8,  4,  64, 2,  true, false>>(const S&, A);
-template float smoothquant_<trait_<ck_tile::bf16_t, 1, 16,  4,  64, 1,  true, false>>(const S&, A);
-
-template float smoothquant_<trait_<ck_tile::bf16_t, 1,  1,  1, 256, 4,  true, false>>(const S&, A);
-#endif
 
 template float smoothquant_<trait_<ck_tile::bf16_t,  1, 1, 2,  128, 8,  true, false>>(const S&, A);
 template float smoothquant_<trait_<ck_tile::bf16_t,  1, 2, 2,  128, 4,  true, false>>(const S&, A);

@@ -5,14 +5,6 @@
 
 // clang-format off
 //                                                  rm rn tm    tn  vn   pd    2p
-#if 0
-template float smoothquant_<trait_<ck_tile::fp16_t, 1,  2,  4,  64, 8,  true ,false>>(const S&, A);
-template float smoothquant_<trait_<ck_tile::fp16_t, 1,  4,  4,  64, 4,  true ,false>>(const S&, A);
-template float smoothquant_<trait_<ck_tile::fp16_t, 1,  8,  4,  64, 2,  true ,false>>(const S&, A);
-template float smoothquant_<trait_<ck_tile::fp16_t, 1, 16,  4,  64, 1,  true ,false>>(const S&, A);
-
-template float smoothquant_<trait_<ck_tile::fp16_t, 1,  1,  1, 256, 4,  true ,false>>(const S&, A);
-#endif
 
 template float smoothquant_<trait_<ck_tile::fp16_t,  1, 1, 2,  128, 8,  true, false>>(const S&, A);
 template float smoothquant_<trait_<ck_tile::fp16_t,  1, 2, 2,  128, 4,  true, false>>(const S&, A);

@@ -5,14 +5,6 @@
 
 // clang-format off
 //                                                  rm rn  tm   tn  vn   pd   2p
-#if 0
-template float moe_smoothquant_<trait_<ck_tile::bf16_t, 1,  2,  4,  64, 8,  true, false>>(const S&, A);
-template float moe_smoothquant_<trait_<ck_tile::bf16_t, 1,  4,  4,  64, 4,  true, false>>(const S&, A);
-template float moe_smoothquant_<trait_<ck_tile::bf16_t, 1,  8,  4,  64, 2,  true, false>>(const S&, A);
-template float moe_smoothquant_<trait_<ck_tile::bf16_t, 1, 16,  4,  64, 1,  true, false>>(const S&, A);
-
-template float moe_smoothquant_<trait_<ck_tile::bf16_t, 1,  1,  1, 256, 4,  true, false>>(const S&, A);
-#endif
 
 template float moe_smoothquant_<trait_<ck_tile::bf16_t, ck_tile::int8_t, 1, 1, 2,  128, 8,  true, false>>(const S&, A);
 template float moe_smoothquant_<trait_<ck_tile::bf16_t, ck_tile::int8_t, 1, 2, 2,  128, 4,  true, false>>(const S&, A);

@@ -5,14 +5,6 @@
 
 // clang-format off
 //                                                  rm rn tm    tn  vn   pd    2p
-#if 0
-template float moe_smoothquant_<trait_<ck_tile::fp16_t, 1,  2,  4,  64, 8,  true ,false>>(const S&, A);
-template float moe_smoothquant_<trait_<ck_tile::fp16_t, 1,  4,  4,  64, 4,  true ,false>>(const S&, A);
-template float moe_smoothquant_<trait_<ck_tile::fp16_t, 1,  8,  4,  64, 2,  true ,false>>(const S&, A);
-template float moe_smoothquant_<trait_<ck_tile::fp16_t, 1, 16,  4,  64, 1,  true ,false>>(const S&, A);
-
-template float moe_smoothquant_<trait_<ck_tile::fp16_t, 1,  1,  1, 256, 4,  true ,false>>(const S&, A);
-#endif
 
 template float moe_smoothquant_<trait_<ck_tile::fp16_t, ck_tile::int8_t,  1, 1, 2,  128, 8,  true, false>>(const S&, A);
 template float moe_smoothquant_<trait_<ck_tile::fp16_t, ck_tile::int8_t,  1, 2, 2,  128, 4,  true, false>>(const S&, A);

@@ -476,16 +476,6 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
             hip_check_error(hipGetLastError());
             // end real kernel
 
-            //             hip_check_error(hipEventRecord(stop, stream_config.stream_id_));
-            //             hip_check_error(hipEventSynchronize(stop));
-            //             float cur_time = 0;
-            //             hip_check_error(hipEventElapsedTime(&cur_time, start, stop));
-            // #if MEDIAN
-            //             times.insert(cur_time);
-            // #else
-            //             total_time += cur_time;
-            // #endif
-
 #if !defined(CK_USE_WMMA)
             if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
             {

@@ -137,13 +137,6 @@ transform_forward_convolution3d_into_gemm_v4r4r4_ndhwc_kzyxc_ndhwk_pad(
         make_tuple(Sequence<0>{}, Sequence<1>{}),
         make_tuple(Sequence<0>{}, Sequence<1>{}));
 
-    // const auto out_grid_desc_gemmm_gemmn = transform_tensor_descriptor(
-    //     out_n_do_ho_wo_k_grid_desc,
-    //     make_tuple(make_merge_transform(make_tuple(N, Do, Ho, Wo)),
-    //                make_pass_through_transform(K)),
-    //     make_tuple(Sequence<0, 1, 2, 3>{}, Sequence<3>{}),
-    //     make_tuple(Sequence<0>{}, Sequence<1>{}));
-
     return make_tuple(in_grid_desc_gemmk0_gemmm_gemmk1,
                       wei_grid_desc_gemmk0_gemmn_gemmk1,
                       out_grid_desc_gemmm_gemmn);

@@ -60,32 +60,6 @@ constexpr auto BlockGemmBlockScaleBPreshufflePipeline_Selector()
             NRepeat,
             KPack>{};
     }
-#if 0
-    else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v2)
-    {
-        return BlockwiseGemmXdlops_pipeline_blockscale_bpreshuffle_v2<
-            BlkGemmPipeSche,
-            BlockSize,
-            ADataType,
-            BDataType,
-            ComputeDataType,
-            AccDataType,
-            ATileDesc,
-            BTileDesc,
-            AMmaTileDesc,
-            BMmaTileDesc,
-            ABlockTransferSrcScalarPerVector,
-            BBlockTransferSrcScalarPerVector,
-            MPerBlock,
-            NPerBlock,
-            KPerBlock,
-            MPerXDL,
-            NPerXDL,
-            MRepeat,
-            NRepeat,
-            KPack>{};
-    }
-#endif
     else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v3)
     {
         static_assert(MRepeat >= 4, "MRepeat should at least be 4 in BlockGemmPipelineVersion::v3");

@@ -93,32 +93,6 @@ constexpr auto BlockGemmBlockMoeScaleBPreshufflePipeline_Selector()
                 KPack>{};
         }
     }
-#if 0
-    else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v2)
-    {
-        return BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_v2<
-            BlkGemmPipeSche,
-            BlockSize,
-            ADataType,
-            BDataType,
-            ComputeDataType,
-            AccDataType,
-            ATileDesc,
-            BTileDesc,
-            AMmaTileDesc,
-            BMmaTileDesc,
-            ABlockTransferSrcScalarPerVector,
-            BBlockTransferSrcScalarPerVector,
-            MPerBlock,
-            NPerBlock,
-            KPerBlock,
-            MPerXDL,
-            NPerXDL,
-            MRepeat,
-            NRepeat,
-            KPack>{};
-    }
-#endif
     else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v3)
     {
         static_assert(MRepeat >= 4, "MRepeat should at least be 4 in BlockGemmPipelineVersion::v3");

@@ -144,12 +144,6 @@ struct ThreadGroupTensorSliceTransfer_DirectLoad
                       "When loading more than one element per thread at once, the contiguous "
                       "dimension must be the same between source and destination.");
 
-        // constexpr auto dword_bytes           = 4;
-        // constexpr auto bytes_per_thread_load = ScalarPerVector * sizeof(SrcData);
-        // static_assert(bytes_per_thread_load == dword_bytes,
-        //               "Direct load transfer requires each thread to load exactly a single "
-        //               "DWORD of data.");
-
         static_assert(nDim == remove_cvref_t<SrcDesc>::GetNumOfDimension() &&
                           nDim == remove_cvref_t<DstDesc>::GetNumOfDimension() &&
                           nDim == ThreadClusterLengths::Size(),

@@ -152,12 +152,6 @@ struct ThreadGroupTensorSliceTransfer_Gather_DirectLoad
                       "When loading more than one element per thread at once, the contiguous "
                       "dimension must be the same between source and destination.");
 
-        // constexpr auto dword_bytes           = 4;
-        // constexpr auto bytes_per_thread_load = ScalarPerVector * sizeof(SrcData);
-        // static_assert(bytes_per_thread_load == dword_bytes,
-        //               "Direct load transfer requires each thread to load exactly a single "
-        //               "DWORD of data.");
-
         static_assert(nDim == remove_cvref_t<SrcDesc>::GetNumOfDimension() &&
                           nDim == remove_cvref_t<DstDesc>::GetNumOfDimension() &&
                           nDim == ThreadClusterLengths::Size(),

@@ -737,11 +737,6 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle
 
         // Batch Offset
         ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch_;
-
-        // for checking vector load/store
-        // index_t MRaw_;
-        // index_t NRaw_;
-        // index_t KRaw_;
     };
 
     // Invoker