Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
f294808
Added two kernel for M=32 problem
Dec 20, 2024
e5bc56a
Comment the first one
Dec 20, 2024
1fcd332
Enable multiply_multiply for Scale_Block_M = 1 for deepseek
Dec 23, 2024
f728087
Modify the a_thread offset since the A data load is different from B.
Dec 25, 2024
988478d
edit fp8 ab scale for Scale_Block_M=1
junhaha666 Dec 26, 2024
d58d55e
edit GemmSpec to MNKPadding
junhaha666 Dec 26, 2024
9dac971
enable blockwise pipelie v1 and v2. v1 is work for small K.
Jan 10, 2025
363b674
add instance for gemm_ab_scale
Jan 14, 2025
7ae141f
fix cmakelist of ckProfiler
Jan 14, 2025
3df24f0
Merge branch 'develop' of https://github.com/ROCm/composable_kernel i…
aska-0096 Feb 18, 2025
3d4ad53
optimize blockscale gemm. todo: reduce vgpr usage
aska-0096 Feb 19, 2025
b9a97f4
fix a correctness bug
aska-0096 Feb 21, 2025
dd6d879
sanity checked
aska-0096 Feb 25, 2025
00c5f0f
revert ckprofiler cmake changes
aska-0096 Feb 25, 2025
da2f9e0
Merge branch 'develop' of https://github.com/ROCm/composable_kernel i…
aska-0096 Feb 25, 2025
2367a4f
clang format
aska-0096 Feb 25, 2025
4d56921
revert unnecessary changes.
aska-0096 Feb 25, 2025
41fab2d
remove commented codes.
aska-0096 Feb 25, 2025
f9c8b0d
Merge branch 'develop' of https://github.com/ROCm/composable_kernel i…
aska-0096 Mar 5, 2025
facfab9
Merge branch 'develop' of https://github.com/ROCm/composable_kernel i…
aska-0096 Mar 6, 2025
2a99ba2
split weight preshuffle library targets
aska-0096 Mar 6, 2025
d24e6d5
Merge branch 'develop' of https://github.com/ROCm/composable_kernel i…
aska-0096 Mar 7, 2025
2bacba7
bring back enable-post-misched=0
aska-0096 Mar 7, 2025
d58656f
fix build issues for gemm_multiply_multiply_fp8 instances
illsilin Mar 7, 2025
7e6f3c8
fix clang format
illsilin Mar 7, 2025
5829310
add verbose build flag when building for all targets
illsilin Mar 8, 2025
d11ad3e
reduce path names for new instances
illsilin Mar 10, 2025
fc7c80c
fix paths in cmake
illsilin Mar 10, 2025
2ab4835
refactor gemm_multiply_multiply library target
aska-0096 Mar 10, 2025
18133fd
fix a bug in example
aska-0096 Mar 10, 2025
8b88d83
fix example 65 cmake
illsilin Mar 10, 2025
6e35396
reduce the number of threads when building libs for all targets to 50
illsilin Mar 10, 2025
7197432
use ninja to build for all targets
illsilin Mar 11, 2025
c4dd991
reduce teh number of threads when building for all targets
illsilin Mar 11, 2025
4cce6db
reduce the number of threads to 32 when building libs for all targets…
illsilin Mar 11, 2025
f9fded6
Merge branch 'develop' into f8blockscale_opt
illsilin Mar 11, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -1145,11 +1145,11 @@ pipeline {
}
agent{ label rocmnode("gfx90a") }
environment{
execute_args = """ cmake -D CMAKE_PREFIX_PATH=/opt/rocm \
execute_args = """ cmake -G Ninja -D CMAKE_PREFIX_PATH=/opt/rocm \
-D CMAKE_CXX_COMPILER="${build_compiler()}" \
-D CMAKE_BUILD_TYPE=Release \
-D GPU_ARCHS="gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102" \
-D CMAKE_CXX_FLAGS=" -O3 " .. && make -j64 """
-D CMAKE_CXX_FLAGS=" -O3 " .. && ninja -j32 """
}
steps{
buildHipClangJobAndReboot(setup_cmd: "", build_cmd: "", no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ using CDEElementOp = PassThrough;

static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::Default;

static constexpr ck::index_t Scale_Block_M = 128;
static constexpr ck::index_t Scale_Block_M = 1;
static constexpr ck::index_t Scale_Block_N = 128;
static constexpr ck::index_t Scale_Block_K = 128;

Expand All @@ -65,26 +65,27 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultiD_ABScale_
A0DataType, A1DataType, B0DataType, B1DataType, DsDataType, EDataType, AccDataType, CShuffleDataType,
AElementOp, BElementOp, CDEElementOp, GemmSpec,
256, Scale_Block_M, Scale_Block_N, Scale_Block_K,
128, 128,
128, 16, 16,
16, 128,
256, 16, 16,
16, 16,
4, 4,
S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0,
S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0,
1, 2, S<1, 32, 1, 8>, S<8, 8, 1>,
ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3, FP8>;
1, 2,
S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0,
S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0,
1, 2, S<1, 16, 1, 16>, S<8>,
ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, FP8>;
// clang-format on

int main(int argc, char* argv[])
{
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
bool flush_cache = true;

// GEMM shape
ck::index_t M = 3840;
ck::index_t N = 4096;
ck::index_t K = 4096;
ck::index_t M = 128;
ck::index_t N = 1024;
ck::index_t K = 1024;

ck::index_t StrideA = K;
ck::index_t StrideB = K;
Expand All @@ -100,7 +101,7 @@ int main(int argc, char* argv[])
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
}
else if(argc == 10)
else if(argc == 8)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
Expand All @@ -110,16 +111,19 @@ int main(int argc, char* argv[])
N = std::stoi(argv[5]);
K = std::stoi(argv[6]);

StrideA = std::stoi(argv[7]);
StrideB = std::stoi(argv[8]);
StrideE = std::stoi(argv[9]);
flush_cache = std::stoi(argv[7]);

StrideA = K;
StrideB = K;
StrideE = N;
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: time kernel (0=no, 1=yes)\n");
printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideE\n");
printf("arg4 to 6: M, N, K\n");
printf("arg7: flush both I$ and L2$ (0=no, 1=yes)\n");
exit(0);
}

Expand Down Expand Up @@ -182,9 +186,15 @@ int main(int argc, char* argv[])
b1_k_n.GenerateTensorValue(GeneratorTensor_1<B1DataType>{});
break;
case 4:
a0_m_k.GenerateTensorValue(GeneratorTensor_1<A0DataType>{});
b0_k_n.GenerateTensorValue(GeneratorTensor_1<B0DataType>{});
a0_m_k.GenerateTensorValue(GeneratorTensor_2<A0DataType>{-2, 2});
b0_k_n.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-2, 2});
a1_m_k.GenerateTensorValue(GeneratorTensor_3<A1DataType>{0, 1.0});
b1_k_n.GenerateTensorValue(GeneratorTensor_1<B1DataType>{});
break;
case 5:
a0_m_k.GenerateTensorValue(GeneratorTensor_2<A0DataType>{-2, 2});
b0_k_n.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-2, 2});
a1_m_k.GenerateTensorValue(GeneratorTensor_1<A1DataType>{});
b1_k_n.GenerateTensorValue(GeneratorTensor_3<B1DataType>{0, 1.0});
break;
default:
Expand All @@ -194,6 +204,16 @@ int main(int argc, char* argv[])
b1_k_n.GenerateTensorValue(GeneratorTensor_3<B1DataType>{0, 1.0});
}
#endif
#if 0
for(int im =0; im< (M + Scale_Block_M - 1) / Scale_Block_M; im++){
float row_sum = .0;
for(int ik =0; ik< (K + Scale_Block_K - 1) / Scale_Block_K; ik++){
printf("%lf ",a1_m_k(im, ik));
row_sum += a1_m_k(im, ik);
}
printf("sum: %lf\n", row_sum * 128);
}
#endif

DeviceMem a0_device_buf(sizeof(A0DataType) * a0_m_k.mDesc.GetElementSpaceSize());
DeviceMem a1_device_buf(sizeof(A1DataType) * a1_m_k.mDesc.GetElementSpaceSize());
Expand Down Expand Up @@ -239,12 +259,24 @@ int main(int argc, char* argv[])
"not support this GEMM problem");
}

float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel, 20, 50});

std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
sizeof(A0DataType) * M * K + sizeof(B0DataType) * K * N + sizeof(EDataType) * M * N;

float ave_time = .0;

if(flush_cache)
{
int rotating_buf = (512 * 1024 * 1024 + num_btype - 1) / num_btype;

ave_time = invoker.Run(argument,
StreamConfig{nullptr, time_kernel, 0, 50, 100, true, rotating_buf});
}
else
{
ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel, 0, 50, 100});
}

float tflops = static_cast<float>(flop) / 1.E9 / ave_time;

float gb_per_sec = num_btype / 1.E6 / ave_time;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -140,14 +140,14 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultiD_Xdl_CShu
// clang-format off
< Row, Col, DsLayout, ELayout, A0DataType, B0DataType, DsDataType, EDataType, AccDataType, CShuffleDataType,
AElementOp, BElementOp, CDEElementOp, GemmSpec, 256,
256, 256, 128,
128, 128, 128,
16, 16,
16, 16,
8, 8,
32, 32,
2, 2,
S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0,
S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0,
1, 2, S<1, 32, 1, 8>, S<8, 8, 1>,
ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3, FP8>;
1, 1, S<1, 32, 1, 8>, S<8, 8, 1>,
ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, FP8>;
// clang-format on

int main(int argc, char* argv[])
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -453,7 +453,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1<BlockGemmPipelineScheduler::I
// latency
// __builtin_amdgcn_sched_barrier(0);
}
else
else if constexpr(TailNum == TailNumber::Odd)
{
static_for<0, MRepeat, 1>{}([&](auto m0) {
static_for<0, NRepeat, 1>{}([&](auto n0) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -784,7 +784,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::I
// latency
// __builtin_amdgcn_sched_barrier(0);
}
else
else if constexpr(TailNum == TailNumber::Odd)
{
static_for<0, MRepeat, 1>{}([&](auto m0) {
static_for<0, KRepeat, 1>{}([&](auto k0) {
Expand Down
Loading