Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
47 commits
Select commit Hold shift + click to select a range
7f22458
Remove dead #if 0 b_element_op loop in gemm_multi_ABD example
AviralGoelAMD Apr 9, 2026
d40f89c
Remove dead #if 0 blocks in example/ck_tile/11_add_rmsnorm2d_rdquant/…
AviralGoelAMD Apr 9, 2026
6571489
Remove dead #if 0 blocks in example/ck_tile/12_smoothquant/instances
AviralGoelAMD Apr 9, 2026
4686201
Remove dead #if 0 blocks in example/ck_tile/14_moe_smoothquant/instances
AviralGoelAMD Apr 9, 2026
0504842
Remove dead #if 0 blocks in ck/tensor_operation/gpu/block
AviralGoelAMD Apr 9, 2026
9aaf85d
Remove dead #if 0 blocks in ck/tensor_operation/gpu/device/impl
AviralGoelAMD Apr 9, 2026
d7cf680
Remove dead #if 0 blocks in ck/tensor_operation/gpu/grid
AviralGoelAMD Apr 9, 2026
24fa6e7
Remove dead #if 0 blocks in ck/utility
AviralGoelAMD Apr 9, 2026
9ab192f
Remove dead #if 0 blocks in ck_tile/core/arch
AviralGoelAMD Apr 9, 2026
8dc6844
Remove dead #if 0 blocks in ck_tile/core/container
AviralGoelAMD Apr 9, 2026
7e6d7a2
Remove dead #if 0 blocks in ck_tile/core/numeric
AviralGoelAMD Apr 9, 2026
72170f7
Remove dead #if 0 blocks in ck_tile/core/tensor
AviralGoelAMD Apr 9, 2026
b723106
Remove dead #if 0 blocks in ck_tile/ops/elementwise
AviralGoelAMD Apr 9, 2026
ec903a1
Remove dead #if 0 blocks in ck_tile/ops/fused_moe/kernel
AviralGoelAMD Apr 9, 2026
a782510
Remove dead #if 0 blocks in ck_tile/ops/gemm/block
AviralGoelAMD Apr 9, 2026
91ae169
Remove dead #if 0 blocks in ck_tile/ops/norm_reduce/block
AviralGoelAMD Apr 9, 2026
03ac154
Remove dead #if 0 blocks in ck_tile/ops/reduce/block
AviralGoelAMD Apr 9, 2026
854d33b
Remove dead #if 0 blocks in lib/ck/lib/tensor_operation_instance/gpu
AviralGoelAMD Apr 9, 2026
090a9ad
Remove dead #if 0 blocks in lib/src/tensor_operation_instance/gpu/gem…
AviralGoelAMD Apr 9, 2026
1985d50
Remove dead #if 0 blocks in lib/src/tensor_operation_instance/gpu/gem…
AviralGoelAMD Apr 9, 2026
7a405ac
Remove dead #if 0 blocks in lib/src/tensor_operation_instance/gpu/gem…
AviralGoelAMD Apr 9, 2026
760fa0e
Remove dead #if 0 blocks in test/ck_tile/add_rmsnorm2d_rdquant/instances
AviralGoelAMD Apr 9, 2026
9a425fe
Remove dead #if 0 blocks in test/ck_tile/moe_smoothquant/instances
AviralGoelAMD Apr 9, 2026
8e65a19
Remove dead #if 0 blocks in test/ck_tile/smoothquant/instances
AviralGoelAMD Apr 9, 2026
26040a8
Remove commented-out dead code in codegen/test
AviralGoelAMD Apr 9, 2026
1f5cd77
Remove commented-out dead code in example/65_gemm_multiply_multiply
AviralGoelAMD Apr 9, 2026
b171695
Remove commented-out dead code in ck/host_utility
AviralGoelAMD Apr 9, 2026
82e8956
Remove commented-out dead code in ck/problem_transform
AviralGoelAMD Apr 9, 2026
130d3a0
Remove commented-out dead code in ck/tensor_operation/gpu/block
AviralGoelAMD Apr 9, 2026
51eac1e
Remove commented-out dead code in ck/tensor_operation/gpu/device/impl
AviralGoelAMD Apr 9, 2026
c9bc0ba
Remove commented-out dead code in ck/tensor_operation/gpu/grid
AviralGoelAMD Apr 9, 2026
e732fdb
Remove commented-out dead code in ck/tensor_operation/gpu/warp
AviralGoelAMD Apr 9, 2026
db20c9c
Remove commented-out dead code in ck/tensor_operation/operator_transform
AviralGoelAMD Apr 9, 2026
7e784c5
Remove commented-out dead code in ck_tile/core/container
AviralGoelAMD Apr 9, 2026
27c2d3f
Remove commented-out dead code in ck_tile/core/tensor
AviralGoelAMD Apr 9, 2026
96049e5
Remove commented-out dead code in ck_tile/ops/flatmm/kernel
AviralGoelAMD Apr 9, 2026
de9e634
Remove commented-out dead code in ck_tile/ops/flatmm/pipeline
AviralGoelAMD Apr 9, 2026
86b70f3
Remove commented-out dead code in ck_tile/ops/fmha/pipeline
AviralGoelAMD Apr 9, 2026
9f89989
Remove commented-out dead code in ck_tile/ops/fused_moe/kernel
AviralGoelAMD Apr 9, 2026
b47421c
Remove commented-out dead code in ck_tile/ops/fused_moe/pipeline
AviralGoelAMD Apr 9, 2026
6eda2b7
Remove commented-out dead code in ck_tile/ops/gemm/block
AviralGoelAMD Apr 9, 2026
95c29f4
Remove commented-out dead code in ck_tile/ops/norm_reduce/block
AviralGoelAMD Apr 9, 2026
0a963d8
Remove commented-out dead code in ck_tile/ops/reduce/block
AviralGoelAMD Apr 9, 2026
6d1dc42
Remove commented-out dead code in lib/src/tensor_operation_instance/g…
AviralGoelAMD Apr 9, 2026
385da91
Remove commented-out dead code in profiler/src
AviralGoelAMD Apr 9, 2026
95a5e57
Remove commented-out dead code in test/block_swizzle_test
AviralGoelAMD Apr 9, 2026
604081c
Remove 4 orphaned files with verified replacements
AviralGoelAMD Apr 9, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -198,10 +198,6 @@ struct Epilogue
input_left_pads,
input_right_pads);

// auto res = rtc::from_gpu(out_dev);
// pass &= ck::utils::check_err(res, out_host, "Error: incorrect results!", 1e-5f, 1e-4f);
// assert(pass);

// Simple check: this checks that the output from each instance matches the output from the
// first instance
CHECK(report(solution, check(rtc::from_gpu(out_dev))));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -198,10 +198,6 @@ struct Epilogue
input_left_pads,
input_right_pads);

// auto res = rtc::from_gpu(out_dev);
// pass &= ck::utils::check_err(res, out_host, "Error: incorrect results!", 1e-5f, 1e-4f);
// assert(pass);

// Simple check: this checks that the output from each instance matches the output from the
// first instance
CHECK(report(solution, check(rtc::from_gpu(out_dev))));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -198,10 +198,6 @@ struct Epilogue
input_left_pads,
input_right_pads);

// auto res = rtc::from_gpu(out_dev);
// pass &= ck::utils::check_err(res, out_host, "Error: incorrect results!", 1e-5f, 1e-4f);
// assert(pass);

// Simple check: this checks that the output from each instance matches the output from the
// first instance
CHECK(report(solution, check(rtc::from_gpu(out_dev))));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -198,10 +198,6 @@ struct Epilogue
input_left_pads,
input_right_pads);

// auto res = rtc::from_gpu(out_dev);
// pass &= ck::utils::check_err(res, out_host, "Error: incorrect results!", 1e-5f, 1e-4f);
// assert(pass);

// Simple check: this checks that the output from each instance matches the output from the
// first instance
CHECK(report(solution, check(rtc::from_gpu(out_dev))));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -238,16 +238,6 @@ int main(int argc, char* argv[])

Tensor<B1DataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, B0Layout{}));

#if 0
for(int n = 0; n < N; ++n)
{
for(int k = 0; k < K; ++k)
{
b_element_op(b_k_n(k, n), b0_k_n(k, n), b1_k_n(k, n));
}
}
#endif

using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<A0DataType,
B0DataType,
CShuffleDataType,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -252,9 +252,6 @@ int main(int argc, char* argv[])
Tensor<ck::index_t> expert_ids(HostTensorDescriptor({sorted_tile_num}, {1}));
Tensor<ck::index_t> sorted_token_ids(HostTensorDescriptor({sorted_size}, {1}));
Tensor<ck::index_t> max_token_id(HostTensorDescriptor({1}));
// max_token_id.mData[0] = valid_size;
// max_token_id.mData = {valid_size, 0, 2, 3, 4, 6, 8, 10, 12, 13};
// int eids[] = {0, 0, 1, 2, 3, 3, 4, 4, 5, 5, 6, 7, 7, 3, 3, 3};
max_token_id.mData = {valid_size, 0, 1, 2, 3, 4, 5, 6, 7, 8};
// int eids[] = {0, 1, 2, 3, 4, 5, 6, 7, 3, 3, 3}; // {2, 1, 1, 2, 2, 2, 1, 2}
for(int i = 0; i < sorted_tile_num; i++)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -261,16 +261,6 @@ int main(int argc, char* argv[])
Tensor<ck::index_t> max_token_id(HostTensorDescriptor({1}));

max_token_id.mData = {valid_size, 0, 1, 2, 3, 4, 5, 6, 7, 8};
// int eids[] = {0, 1, 3, 3, 3};
// int eids[] = {0, 1, 2, 3, 4, 5, 6, 7}; //, 3, 3, 3}; // {2, 1, 1, 2, 2, 2, 1, 2}
// int eids[] = {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 3, 3, 3};
// int eids[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
// 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
// 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
// 3, 3, 3, 3, 3, 3, 3, 3, 4, 4,
// 5, 5, 5, 5, 6, 6, 6, 6, 7, 7,
// 7, 7,
// 3, 3, 3};
for(int i = 0; i < sorted_tile_num; i++)
{
expert_ids.mData[i] = i / ck::math::integer_divide_ceil(valid_tile_num, experts);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,6 @@

// clang-format off
// rm rn tm tn vn pd x 3p
#if 0
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, 1, 2, 4, 64, 8, true , true, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, 1, 4, 4, 64, 4, true , true, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, 1, 8, 4, 64, 2, true , true, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, 1, 16, 4, 64, 1, true , true, false>>(const S&, A);

template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, 1, 1, 1, 256, 4, true , true, false>>(const S&, A);
#endif

template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, ck_tile::int8_t, 1, 1, 2, 128, 8, true, true, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, ck_tile::int8_t, 1, 2, 2, 128, 4, true, true, false>>(const S&, A);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,6 @@

// clang-format off
// rm rn tm tn vn pd x 3p
#if 0
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, 1, 2, 4, 64, 8, true , true, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, 1, 4, 4, 64, 4, true , true, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, 1, 8, 4, 64, 2, true , true, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, 1, 16, 4, 64, 1, true , true, false>>(const S&, A);

template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, 1, 1, 1, 256, 4, true , true, false>>(const S&, A);
#endif

template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, ck_tile::int8_t, 1, 1, 2, 128, 8, true, true, false>>(const S&, A);
template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, ck_tile::int8_t, 1, 2, 2, 128, 4, true, true, false>>(const S&, A);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,6 @@

// clang-format off
// rm rn tm tn vn pd 2p
#if 0
template float smoothquant_<trait_<ck_tile::bf16_t, 1, 2, 4, 64, 8, true, false>>(const S&, A);
template float smoothquant_<trait_<ck_tile::bf16_t, 1, 4, 4, 64, 4, true, false>>(const S&, A);
template float smoothquant_<trait_<ck_tile::bf16_t, 1, 8, 4, 64, 2, true, false>>(const S&, A);
template float smoothquant_<trait_<ck_tile::bf16_t, 1, 16, 4, 64, 1, true, false>>(const S&, A);

template float smoothquant_<trait_<ck_tile::bf16_t, 1, 1, 1, 256, 4, true, false>>(const S&, A);
#endif

template float smoothquant_<trait_<ck_tile::bf16_t, 1, 1, 2, 128, 8, true, false>>(const S&, A);
template float smoothquant_<trait_<ck_tile::bf16_t, 1, 2, 2, 128, 4, true, false>>(const S&, A);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,6 @@

// clang-format off
// rm rn tm tn vn pd 2p
#if 0
template float smoothquant_<trait_<ck_tile::fp16_t, 1, 2, 4, 64, 8, true ,false>>(const S&, A);
template float smoothquant_<trait_<ck_tile::fp16_t, 1, 4, 4, 64, 4, true ,false>>(const S&, A);
template float smoothquant_<trait_<ck_tile::fp16_t, 1, 8, 4, 64, 2, true ,false>>(const S&, A);
template float smoothquant_<trait_<ck_tile::fp16_t, 1, 16, 4, 64, 1, true ,false>>(const S&, A);

template float smoothquant_<trait_<ck_tile::fp16_t, 1, 1, 1, 256, 4, true ,false>>(const S&, A);
#endif

template float smoothquant_<trait_<ck_tile::fp16_t, 1, 1, 2, 128, 8, true, false>>(const S&, A);
template float smoothquant_<trait_<ck_tile::fp16_t, 1, 2, 2, 128, 4, true, false>>(const S&, A);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,6 @@

// clang-format off
// rm rn tm tn vn pd 2p
#if 0
template float moe_smoothquant_<trait_<ck_tile::bf16_t, 1, 2, 4, 64, 8, true, false>>(const S&, A);
template float moe_smoothquant_<trait_<ck_tile::bf16_t, 1, 4, 4, 64, 4, true, false>>(const S&, A);
template float moe_smoothquant_<trait_<ck_tile::bf16_t, 1, 8, 4, 64, 2, true, false>>(const S&, A);
template float moe_smoothquant_<trait_<ck_tile::bf16_t, 1, 16, 4, 64, 1, true, false>>(const S&, A);

template float moe_smoothquant_<trait_<ck_tile::bf16_t, 1, 1, 1, 256, 4, true, false>>(const S&, A);
#endif

template float moe_smoothquant_<trait_<ck_tile::bf16_t, ck_tile::int8_t, 1, 1, 2, 128, 8, true, false>>(const S&, A);
template float moe_smoothquant_<trait_<ck_tile::bf16_t, ck_tile::int8_t, 1, 2, 2, 128, 4, true, false>>(const S&, A);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,6 @@

// clang-format off
// rm rn tm tn vn pd 2p
#if 0
template float moe_smoothquant_<trait_<ck_tile::fp16_t, 1, 2, 4, 64, 8, true ,false>>(const S&, A);
template float moe_smoothquant_<trait_<ck_tile::fp16_t, 1, 4, 4, 64, 4, true ,false>>(const S&, A);
template float moe_smoothquant_<trait_<ck_tile::fp16_t, 1, 8, 4, 64, 2, true ,false>>(const S&, A);
template float moe_smoothquant_<trait_<ck_tile::fp16_t, 1, 16, 4, 64, 1, true ,false>>(const S&, A);

template float moe_smoothquant_<trait_<ck_tile::fp16_t, 1, 1, 1, 256, 4, true ,false>>(const S&, A);
#endif

template float moe_smoothquant_<trait_<ck_tile::fp16_t, ck_tile::int8_t, 1, 1, 2, 128, 8, true, false>>(const S&, A);
template float moe_smoothquant_<trait_<ck_tile::fp16_t, ck_tile::int8_t, 1, 2, 2, 128, 4, true, false>>(const S&, A);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -476,16 +476,6 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
hip_check_error(hipGetLastError());
// end real kernel

// hip_check_error(hipEventRecord(stop, stream_config.stream_id_));
// hip_check_error(hipEventSynchronize(stop));
// float cur_time = 0;
// hip_check_error(hipEventElapsedTime(&cur_time, start, stop));
// #if MEDIAN
// times.insert(cur_time);
// #else
// total_time += cur_time;
// #endif

#if !defined(CK_USE_WMMA)
if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -137,13 +137,6 @@ transform_forward_convolution3d_into_gemm_v4r4r4_ndhwc_kzyxc_ndhwk_pad(
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));

// const auto out_grid_desc_gemmm_gemmn = transform_tensor_descriptor(
// out_n_do_ho_wo_k_grid_desc,
// make_tuple(make_merge_transform(make_tuple(N, Do, Ho, Wo)),
// make_pass_through_transform(K)),
// make_tuple(Sequence<0, 1, 2, 3>{}, Sequence<3>{}),
// make_tuple(Sequence<0>{}, Sequence<1>{}));

return make_tuple(in_grid_desc_gemmk0_gemmm_gemmk1,
wei_grid_desc_gemmk0_gemmn_gemmk1,
out_grid_desc_gemmm_gemmn);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,32 +60,6 @@ constexpr auto BlockGemmBlockScaleBPreshufflePipeline_Selector()
NRepeat,
KPack>{};
}
#if 0
else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v2)
{
return BlockwiseGemmXdlops_pipeline_blockscale_bpreshuffle_v2<
BlkGemmPipeSche,
BlockSize,
ADataType,
BDataType,
ComputeDataType,
AccDataType,
ATileDesc,
BTileDesc,
AMmaTileDesc,
BMmaTileDesc,
ABlockTransferSrcScalarPerVector,
BBlockTransferSrcScalarPerVector,
MPerBlock,
NPerBlock,
KPerBlock,
MPerXDL,
NPerXDL,
MRepeat,
NRepeat,
KPack>{};
}
#endif
else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v3)
{
static_assert(MRepeat >= 4, "MRepeat should at least be 4 in BlockGemmPipelineVersion::v3");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,32 +93,6 @@ constexpr auto BlockGemmBlockMoeScaleBPreshufflePipeline_Selector()
KPack>{};
}
}
#if 0
else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v2)
{
return BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_v2<
BlkGemmPipeSche,
BlockSize,
ADataType,
BDataType,
ComputeDataType,
AccDataType,
ATileDesc,
BTileDesc,
AMmaTileDesc,
BMmaTileDesc,
ABlockTransferSrcScalarPerVector,
BBlockTransferSrcScalarPerVector,
MPerBlock,
NPerBlock,
KPerBlock,
MPerXDL,
NPerXDL,
MRepeat,
NRepeat,
KPack>{};
}
#endif
else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v3)
{
static_assert(MRepeat >= 4, "MRepeat should at least be 4 in BlockGemmPipelineVersion::v3");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -144,12 +144,6 @@ struct ThreadGroupTensorSliceTransfer_DirectLoad
"When loading more than one element per thread at once, the contiguous "
"dimension must be the same between source and destination.");

// constexpr auto dword_bytes = 4;
// constexpr auto bytes_per_thread_load = ScalarPerVector * sizeof(SrcData);
// static_assert(bytes_per_thread_load == dword_bytes,
// "Direct load transfer requires each thread to load exactly a single "
// "DWORD of data.");

static_assert(nDim == remove_cvref_t<SrcDesc>::GetNumOfDimension() &&
nDim == remove_cvref_t<DstDesc>::GetNumOfDimension() &&
nDim == ThreadClusterLengths::Size(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -152,12 +152,6 @@ struct ThreadGroupTensorSliceTransfer_Gather_DirectLoad
"When loading more than one element per thread at once, the contiguous "
"dimension must be the same between source and destination.");

// constexpr auto dword_bytes = 4;
// constexpr auto bytes_per_thread_load = ScalarPerVector * sizeof(SrcData);
// static_assert(bytes_per_thread_load == dword_bytes,
// "Direct load transfer requires each thread to load exactly a single "
// "DWORD of data.");

static_assert(nDim == remove_cvref_t<SrcDesc>::GetNumOfDimension() &&
nDim == remove_cvref_t<DstDesc>::GetNumOfDimension() &&
nDim == ThreadClusterLengths::Size(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -737,11 +737,6 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle

// Batch Offset
ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch_;

// for checking vector load/store
// index_t MRaw_;
// index_t NRaw_;
// index_t KRaw_;
};

// Invoker
Expand Down
Loading
Loading