From 568fa25adaa3f2af50fb3d745269b7911ace6587 Mon Sep 17 00:00:00 2001 From: Aviral Goel Date: Thu, 9 Apr 2026 04:27:51 -0400 Subject: [PATCH 1/3] Add CK_CONTRACTION_INSTANCE macro and refactor bilinear-2D instances Introduce contraction_instance_common.hpp with a shared macro that generates both the using-alias and add_device_* registration function for contraction instances from 12 parameters. Refactor all 36 bilinear-2D instance files to use the macro, reducing each from 58 to 12 lines. Net savings: 1,656 lines. --- .../contraction_instance_common.hpp | 77 +++++++++++++++++++ ...16_bf16_bf16_compute_f32_kknn_instance.cpp | 64 +++------------ ...16_bf16_bf16_compute_f32_knnn_instance.cpp | 64 +++------------ ...16_bf16_bf16_compute_f32_mknn_instance.cpp | 64 +++------------ ...16_bf16_bf16_compute_f32_mnnn_instance.cpp | 64 +++------------ ...ffle_bf16_bf16_bf16_bf16_kknn_instance.cpp | 64 +++------------ ...ffle_bf16_bf16_bf16_bf16_knnn_instance.cpp | 64 +++------------ ...ffle_bf16_bf16_bf16_bf16_mknn_instance.cpp | 64 +++------------ ...ffle_bf16_bf16_bf16_bf16_mnnn_instance.cpp | 64 +++------------ ..._f16_f16_f16_compute_f32_kknn_instance.cpp | 64 +++------------ ..._f16_f16_f16_compute_f32_knnn_instance.cpp | 64 +++------------ ..._f16_f16_f16_compute_f32_mknn_instance.cpp | 64 +++------------ ..._f16_f16_f16_compute_f32_mnnn_instance.cpp | 64 +++------------ ..._shuffle_f16_f16_f16_f16_kknn_instance.cpp | 64 +++------------ ..._shuffle_f16_f16_f16_f16_knnn_instance.cpp | 64 +++------------ ..._shuffle_f16_f16_f16_f16_mknn_instance.cpp | 64 +++------------ ..._shuffle_f16_f16_f16_f16_mnnn_instance.cpp | 64 +++------------ ...f32_f32_f32_compute_bf16_kknn_instance.cpp | 64 +++------------ ...f32_f32_f32_compute_bf16_knnn_instance.cpp | 64 +++------------ ...f32_f32_f32_compute_bf16_mknn_instance.cpp | 64 +++------------ ...f32_f32_f32_compute_bf16_mnnn_instance.cpp | 64 +++------------ ..._f32_f32_f32_compute_f16_kknn_instance.cpp | 64 +++------------ 
..._f32_f32_f32_compute_f16_knnn_instance.cpp | 64 +++------------ ..._f32_f32_f32_compute_f16_mknn_instance.cpp | 64 +++------------ ..._f32_f32_f32_compute_f16_mnnn_instance.cpp | 64 +++------------ ..._shuffle_f32_f32_f32_f32_kknn_instance.cpp | 64 +++------------ ..._shuffle_f32_f32_f32_f32_knnn_instance.cpp | 64 +++------------ ..._shuffle_f32_f32_f32_f32_mknn_instance.cpp | 64 +++------------ ..._shuffle_f32_f32_f32_f32_mnnn_instance.cpp | 64 +++------------ ..._f64_f64_f64_compute_f32_kknn_instance.cpp | 64 +++------------ ..._f64_f64_f64_compute_f32_knnn_instance.cpp | 64 +++------------ ..._f64_f64_f64_compute_f32_mknn_instance.cpp | 64 +++------------ ..._f64_f64_f64_compute_f32_mnnn_instance.cpp | 64 +++------------ ..._shuffle_f64_f64_f64_f64_kknn_instance.cpp | 64 +++------------ ..._shuffle_f64_f64_f64_f64_knnn_instance.cpp | 64 +++------------ ..._shuffle_f64_f64_f64_f64_mknn_instance.cpp | 64 +++------------ ..._shuffle_f64_f64_f64_f64_mnnn_instance.cpp | 64 +++------------ 37 files changed, 401 insertions(+), 1980 deletions(-) create mode 100644 projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction/contraction_instance_common.hpp diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction/contraction_instance_common.hpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction/contraction_instance_common.hpp new file mode 100644 index 00000000000..e9f838107e0 --- /dev/null +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction/contraction_instance_common.hpp @@ -0,0 +1,77 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#pragma once + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting. Don't use this hack unless absolutely necessary!
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +// Macro to generate a contraction device operation instance definition and its +// registration function. Each invocation produces one using-alias and one +// add_device_* function inside ck::tensor_operation::device::instance. +// +// Parameters: +// INST_TPL — instance template (e.g. device_contraction_kk_instance, +// device_contraction_f64_kk_instance) +// OP_NAME — lowercase operation name for identifier construction +// (bilinear or scale) +// CDE_OP — C++ element-wise operation type for template argument +// (Bilinear or Scale) +// NDIM_VAL — number of dimensions (2 or 6) +// NAME_SUFFIX — data-type and layout suffix for the generated names +// (e.g. f32_f32_f32_f32_kknn, bf16_bf16_bf16_bf16_compute_f32_knnn) +// ADATA — ADataType +// BDATA — BDataType +// ACC — AccDataType +// CSHUFFLE — CShuffleDataType +// DS_TUPLE — DsDataType (e.g. F32_Tuple, Empty_Tuple) +// EDATA — EDataType +// COMPUTE — ComputeDataType +// +// Example — bilinear, F32, kk layout, 2D: +// +// CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, +// bilinear, Bilinear, 2, f32_f32_f32_f32_kknn, +// F32, F32, F32, F32, F32_Tuple, F32, F32) +// +// Expands to: +// using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance = ...; +// void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance(...) +// { ... 
} +// +// clang-format off +#define CK_CONTRACTION_INSTANCE(INST_TPL, OP_NAME, CDE_OP, NDIM_VAL, \ + NAME_SUFFIX, ADATA, BDATA, ACC, CSHUFFLE, DS_TUPLE, EDATA, COMPUTE) \ + \ +namespace ck { \ +namespace tensor_operation { \ +namespace device { \ +namespace instance { \ + \ +using device_contraction_##OP_NAME##_m##NDIM_VAL##_n##NDIM_VAL##_k##NDIM_VAL##_xdl_c_shuffle_##NAME_SUFFIX##_instance = \ + INST_TPL; \ + \ +void add_device_contraction_##OP_NAME##_m##NDIM_VAL##_n##NDIM_VAL##_k##NDIM_VAL##_xdl_c_shuffle_##NAME_SUFFIX##_instance( \ + std::vector>>& instances) \ +{ \ + add_device_operation_instances(instances, \ + device_contraction_##OP_NAME##_m##NDIM_VAL##_n##NDIM_VAL##_k##NDIM_VAL##_xdl_c_shuffle_##NAME_SUFFIX##_instance{}); \ +} \ + \ +} /* namespace instance */ \ +} /* namespace device */ \ +} /* namespace tensor_operation */ \ +} /* namespace ck */ +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp index c8f6053c44e..1a4ce88a392 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance = - device_contraction_kk_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + bilinear, Bilinear, 2, bf16_bf16_bf16_bf16_compute_f32_kknn, + BF16, BF16, F32, BF16, BF16_Tuple, BF16, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp index fb1002f1aaf..cdfcab69afd 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance = - device_contraction_kn_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + bilinear, Bilinear, 2, bf16_bf16_bf16_bf16_compute_f32_knnn, + BF16, BF16, F32, BF16, BF16_Tuple, BF16, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp index 5918beb9ad4..b1ca1603b4f 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance = - device_contraction_mk_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + bilinear, Bilinear, 2, bf16_bf16_bf16_bf16_compute_f32_mknn, + BF16, BF16, F32, BF16, BF16_Tuple, BF16, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp index fccd91e5be9..bd7f73d2ed8 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + bilinear, Bilinear, 2, bf16_bf16_bf16_bf16_compute_f32_mnnn, + BF16, BF16, F32, BF16, BF16_Tuple, BF16, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_kknn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_kknn_instance.cpp index ce57ee2d079..964d2a06903 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_kknn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_kknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_kknn_instance = - device_contraction_kk_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_kknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_kknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + bilinear, Bilinear, 2, bf16_bf16_bf16_bf16_kknn, + BF16, BF16, F32, BF16, BF16_Tuple, BF16, BF16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_knnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_knnn_instance.cpp index e1e5dbb434d..ac8ac661e39 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_knnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_knnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_knnn_instance = - device_contraction_kn_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_knnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_knnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + bilinear, Bilinear, 2, bf16_bf16_bf16_bf16_knnn, + BF16, BF16, F32, BF16, BF16_Tuple, BF16, BF16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_mknn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_mknn_instance.cpp index db984063901..281673f6a8c 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_mknn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_mknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_mknn_instance = - device_contraction_mk_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_mknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_mknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + bilinear, Bilinear, 2, bf16_bf16_bf16_bf16_mknn, + BF16, BF16, F32, BF16, BF16_Tuple, BF16, BF16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_mnnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_mnnn_instance.cpp index 5c7032e8547..3ac1cef7bee 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_mnnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_mnnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_mnnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_mnnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_bf16_mnnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + bilinear, Bilinear, 2, bf16_bf16_bf16_bf16_mnnn, + BF16, BF16, F32, BF16, BF16_Tuple, BF16, BF16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp index 89cb35495b9..5b410c24a07 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance = - device_contraction_kk_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + bilinear, Bilinear, 2, f16_f16_f16_f16_compute_f32_kknn, + F16, F16, F32, F16, F16_Tuple, F16, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp index c25ebfb5987..9982149b2e0 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance = - device_contraction_kn_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + bilinear, Bilinear, 2, f16_f16_f16_f16_compute_f32_knnn, + F16, F16, F32, F16, F16_Tuple, F16, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp index 9815d2f4e38..0b6f0a85899 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance = - device_contraction_mk_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + bilinear, Bilinear, 2, f16_f16_f16_f16_compute_f32_mknn, + F16, F16, F32, F16, F16_Tuple, F16, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp index c1735b1fe1a..a2092c8c5c3 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + bilinear, Bilinear, 2, f16_f16_f16_f16_compute_f32_mnnn, + F16, F16, F32, F16, F16_Tuple, F16, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_kknn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_kknn_instance.cpp index a0c8376980b..188a674c3f2 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_kknn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_kknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_kknn_instance = - device_contraction_kk_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_kknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_kknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + bilinear, Bilinear, 2, f16_f16_f16_f16_kknn, + F16, F16, F32, F16, F16_Tuple, F16, F16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_knnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_knnn_instance.cpp index 0798f7a9b60..e083e27460f 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_knnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_knnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_knnn_instance = - device_contraction_kn_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_knnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_knnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + bilinear, Bilinear, 2, f16_f16_f16_f16_knnn, + F16, F16, F32, F16, F16_Tuple, F16, F16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_mknn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_mknn_instance.cpp index 7da83714824..8986de8f825 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_mknn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_mknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_mknn_instance = - device_contraction_mk_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_mknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_mknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + bilinear, Bilinear, 2, f16_f16_f16_f16_mknn, + F16, F16, F32, F16, F16_Tuple, F16, F16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_mnnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_mnnn_instance.cpp index 49267e08670..7a80a9e6f05 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_mnnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_mnnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_mnnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_mnnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_f16_mnnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + bilinear, Bilinear, 2, f16_f16_f16_f16_mnnn, + F16, F16, F32, F16, F16_Tuple, F16, F16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp index 008d5720af5..ddb619c3f8f 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance = - device_contraction_kk_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + bilinear, Bilinear, 2, f32_f32_f32_f32_compute_bf16_kknn, + F32, F32, F32, F32, F32_Tuple, F32, BF16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp index 9b927385ef8..e2abf1c0571 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance = - device_contraction_kn_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + bilinear, Bilinear, 2, f32_f32_f32_f32_compute_bf16_knnn, + F32, F32, F32, F32, F32_Tuple, F32, BF16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance.cpp index a398194f64b..bc1965c9004 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance = - device_contraction_mk_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + bilinear, Bilinear, 2, f32_f32_f32_f32_compute_bf16_mknn, + F32, F32, F32, F32, F32_Tuple, F32, BF16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance.cpp index 3726f97709f..4390179324d 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + bilinear, Bilinear, 2, f32_f32_f32_f32_compute_bf16_mnnn, + F32, F32, F32, F32, F32_Tuple, F32, BF16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance.cpp index 41fa523b5f7..eae059b621b 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance = - device_contraction_kk_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + bilinear, Bilinear, 2, f32_f32_f32_f32_compute_f16_kknn, + F32, F32, F32, F32, F32_Tuple, F32, F16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance.cpp index 898c5a79cc1..b3a72e5f997 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance = - device_contraction_kn_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + bilinear, Bilinear, 2, f32_f32_f32_f32_compute_f16_knnn, + F32, F32, F32, F32, F32_Tuple, F32, F16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance.cpp index 64db3364a3b..627489886dd 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance = - device_contraction_mk_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + bilinear, Bilinear, 2, f32_f32_f32_f32_compute_f16_mknn, + F32, F32, F32, F32, F32_Tuple, F32, F16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance.cpp index ad548f38e73..8442ea8faee 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + bilinear, Bilinear, 2, f32_f32_f32_f32_compute_f16_mnnn, + F32, F32, F32, F32, F32_Tuple, F32, F16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp index 3e36bfd30b4..9344bb06dec 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance = - device_contraction_kk_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + bilinear, Bilinear, 2, f32_f32_f32_f32_kknn, + F32, F32, F32, F32, F32_Tuple, F32, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp index b67121316b1..72bec728d91 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance = - device_contraction_kn_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + bilinear, Bilinear, 2, f32_f32_f32_f32_knnn, + F32, F32, F32, F32, F32_Tuple, F32, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp index 94228aa307a..7e4a69f634e 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance = - device_contraction_mk_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + bilinear, Bilinear, 2, f32_f32_f32_f32_mknn, + F32, F32, F32, F32, F32_Tuple, F32, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp index 28184344c3e..9516290b234 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + bilinear, Bilinear, 2, f32_f32_f32_f32_mnnn, + F32, F32, F32, F32, F32_Tuple, F32, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp index f2d107c37d1..2f7ddf0a38c 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance = - device_contraction_f64_kk_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_kk_instance, + bilinear, Bilinear, 2, f64_f64_f64_f64_compute_f32_kknn, + F64, F64, F32, F64, F64_Tuple, F64, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp index dcf8c05eda1..074035870f7 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance = - device_contraction_f64_kn_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_kn_instance, + bilinear, Bilinear, 2, f64_f64_f64_f64_compute_f32_knnn, + F64, F64, F32, F64, F64_Tuple, F64, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp index fe2e1108e93..70e4a0ca807 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance = - device_contraction_f64_mk_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_mk_instance, + bilinear, Bilinear, 2, f64_f64_f64_f64_compute_f32_mknn, + F64, F64, F32, F64, F64_Tuple, F64, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance.cpp index 420a1f07ebc..03d36ce10c7 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance = - device_contraction_f64_mn_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_mn_instance, + bilinear, Bilinear, 2, f64_f64_f64_f64_compute_f32_mnnn, + F64, F64, F32, F64, F64_Tuple, F64, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp index 1c5917cbc6f..a3e48e8fe0e 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance = - device_contraction_f64_kk_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_kk_instance, + bilinear, Bilinear, 2, f64_f64_f64_f64_kknn, + F64, F64, F64, F64, F64_Tuple, F64, F64) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp index 6b87fcf1d8f..b6391d36ed2 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance = - device_contraction_f64_kn_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_kn_instance, + bilinear, Bilinear, 2, f64_f64_f64_f64_knnn, + F64, F64, F64, F64, F64_Tuple, F64, F64) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp index 03469cd96be..3a96d9c8a4b 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance = - device_contraction_f64_mk_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_mk_instance, + bilinear, Bilinear, 2, f64_f64_f64_f64_mknn, + F64, F64, F64, F64, F64_Tuple, F64, F64) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp index 5171a38dece..fc4f651f753 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/2D/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance = - device_contraction_f64_mn_instance; - -void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_mn_instance, + bilinear, Bilinear, 2, f64_f64_f64_f64_mnnn, + F64, F64, F64, F64, F64_Tuple, F64, F64) +// clang-format on From 1a7c57c008c88f77deb8bac1a6abbc1ab061f56c Mon Sep 17 00:00:00 2001 From: Aviral Goel Date: Thu, 9 Apr 2026 04:30:42 -0400 Subject: [PATCH 2/3] Refactor remaining contraction instances to use shared macro Apply CK_CONTRACTION_INSTANCE macro to the remaining 108 instance files: bilinear-6D (36), scale-2D (36), scale-6D (36). Combined with the previous commit, all 144 contraction instance files now use the shared macro. Total savings: ~6,600 lines. --- ...16_bf16_bf16_compute_f32_kknn_instance.cpp | 64 +++--------------- ...16_bf16_bf16_compute_f32_knnn_instance.cpp | 64 +++--------------- ...16_bf16_bf16_compute_f32_mknn_instance.cpp | 64 +++--------------- ...16_bf16_bf16_compute_f32_mnnn_instance.cpp | 64 +++--------------- ...ffle_bf16_bf16_bf16_bf16_kknn_instance.cpp | 64 +++--------------- ...ffle_bf16_bf16_bf16_bf16_knnn_instance.cpp | 64 +++--------------- ...ffle_bf16_bf16_bf16_bf16_mknn_instance.cpp | 64 +++--------------- ...ffle_bf16_bf16_bf16_bf16_mnnn_instance.cpp | 64 +++--------------- ..._f16_f16_f16_compute_f32_kknn_instance.cpp | 64 +++--------------- ..._f16_f16_f16_compute_f32_knnn_instance.cpp | 64 +++--------------- ..._f16_f16_f16_compute_f32_mknn_instance.cpp | 64 +++--------------- ..._f16_f16_f16_compute_f32_mnnn_instance.cpp | 64 +++--------------- ..._shuffle_f16_f16_f16_f16_kknn_instance.cpp | 66 +++---------------- ..._shuffle_f16_f16_f16_f16_knnn_instance.cpp | 64 +++--------------- ..._shuffle_f16_f16_f16_f16_mknn_instance.cpp | 64 +++--------------- ..._shuffle_f16_f16_f16_f16_mnnn_instance.cpp | 64 +++--------------- ...f32_f32_f32_compute_bf16_kknn_instance.cpp | 64 +++--------------- ...f32_f32_f32_compute_bf16_knnn_instance.cpp | 64 +++--------------- ...f32_f32_f32_compute_bf16_mknn_instance.cpp | 64 +++--------------- 
...f32_f32_f32_compute_bf16_mnnn_instance.cpp | 64 +++--------------- ..._f32_f32_f32_compute_f16_kknn_instance.cpp | 64 +++--------------- ..._f32_f32_f32_compute_f16_knnn_instance.cpp | 64 +++--------------- ..._f32_f32_f32_compute_f16_mknn_instance.cpp | 64 +++--------------- ..._f32_f32_f32_compute_f16_mnnn_instance.cpp | 64 +++--------------- ..._shuffle_f32_f32_f32_f32_kknn_instance.cpp | 64 +++--------------- ..._shuffle_f32_f32_f32_f32_knnn_instance.cpp | 64 +++--------------- ..._shuffle_f32_f32_f32_f32_mknn_instance.cpp | 64 +++--------------- ..._shuffle_f32_f32_f32_f32_mnnn_instance.cpp | 64 +++--------------- ..._f64_f64_f64_compute_f32_kknn_instance.cpp | 64 +++--------------- ..._f64_f64_f64_compute_f32_knnn_instance.cpp | 64 +++--------------- ..._f64_f64_f64_compute_f32_mknn_instance.cpp | 64 +++--------------- ..._f64_f64_f64_compute_f32_mnnn_instance.cpp | 64 +++--------------- ..._shuffle_f64_f64_f64_f64_kknn_instance.cpp | 64 +++--------------- ..._shuffle_f64_f64_f64_f64_knnn_instance.cpp | 64 +++--------------- ..._shuffle_f64_f64_f64_f64_mknn_instance.cpp | 64 +++--------------- ..._shuffle_f64_f64_f64_f64_mnnn_instance.cpp | 64 +++--------------- ...f16_bf16_bf16_compute_f32_kkn_instance.cpp | 64 +++--------------- ...f16_bf16_bf16_compute_f32_knn_instance.cpp | 64 +++--------------- ...f16_bf16_bf16_compute_f32_mkn_instance.cpp | 64 +++--------------- ...f16_bf16_bf16_compute_f32_mnn_instance.cpp | 64 +++--------------- ..._c_shuffle_bf16_bf16_bf16_kkn_instance.cpp | 63 +++--------------- ..._c_shuffle_bf16_bf16_bf16_knn_instance.cpp | 63 +++--------------- ..._c_shuffle_bf16_bf16_bf16_mkn_instance.cpp | 63 +++--------------- ..._c_shuffle_bf16_bf16_bf16_mnn_instance.cpp | 63 +++--------------- ...e_f16_f16_f16_compute_f32_kkn_instance.cpp | 64 +++--------------- ...e_f16_f16_f16_compute_f32_knn_instance.cpp | 64 +++--------------- ...e_f16_f16_f16_compute_f32_mkn_instance.cpp | 64 +++--------------- 
...e_f16_f16_f16_compute_f32_mnn_instance.cpp | 64 +++--------------- ...xdl_c_shuffle_f16_f16_f16_kkn_instance.cpp | 63 +++--------------- ...xdl_c_shuffle_f16_f16_f16_knn_instance.cpp | 63 +++--------------- ...xdl_c_shuffle_f16_f16_f16_mkn_instance.cpp | 63 +++--------------- ...xdl_c_shuffle_f16_f16_f16_mnn_instance.cpp | 63 +++--------------- ..._f32_f32_f32_compute_bf16_kkn_instance.cpp | 64 +++--------------- ..._f32_f32_f32_compute_bf16_knn_instance.cpp | 64 +++--------------- ..._f32_f32_f32_compute_bf16_mkn_instance.cpp | 64 +++--------------- ..._f32_f32_f32_compute_bf16_mnn_instance.cpp | 64 +++--------------- ...e_f32_f32_f32_compute_f16_kkn_instance.cpp | 64 +++--------------- ...e_f32_f32_f32_compute_f16_knn_instance.cpp | 64 +++--------------- ...e_f32_f32_f32_compute_f16_mkn_instance.cpp | 64 +++--------------- ...e_f32_f32_f32_compute_f16_mnn_instance.cpp | 64 +++--------------- ...xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp | 63 +++--------------- ...xdl_c_shuffle_f32_f32_f32_knn_instance.cpp | 63 +++--------------- ...xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp | 63 +++--------------- ...xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp | 63 +++--------------- ...e_f64_f64_f64_compute_f32_kkn_instance.cpp | 64 +++--------------- ...e_f64_f64_f64_compute_f32_knn_instance.cpp | 64 +++--------------- ...e_f64_f64_f64_compute_f32_mkn_instance.cpp | 64 +++--------------- ...e_f64_f64_f64_compute_f32_mnn_instance.cpp | 64 +++--------------- ...xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp | 63 +++--------------- ...xdl_c_shuffle_f64_f64_f64_knn_instance.cpp | 63 +++--------------- ...xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp | 63 +++--------------- ...xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp | 63 +++--------------- ...f16_bf16_bf16_compute_f32_kkn_instance.cpp | 64 +++--------------- ...f16_bf16_bf16_compute_f32_knn_instance.cpp | 64 +++--------------- ...f16_bf16_bf16_compute_f32_mkn_instance.cpp | 64 +++--------------- 
...f16_bf16_bf16_compute_f32_mnn_instance.cpp | 64 +++--------------- ..._c_shuffle_bf16_bf16_bf16_kkn_instance.cpp | 63 +++--------------- ..._c_shuffle_bf16_bf16_bf16_knn_instance.cpp | 63 +++--------------- ..._c_shuffle_bf16_bf16_bf16_mkn_instance.cpp | 63 +++--------------- ..._c_shuffle_bf16_bf16_bf16_mnn_instance.cpp | 63 +++--------------- ...e_f16_f16_f16_compute_f32_kkn_instance.cpp | 64 +++--------------- ...e_f16_f16_f16_compute_f32_knn_instance.cpp | 64 +++--------------- ...e_f16_f16_f16_compute_f32_mkn_instance.cpp | 64 +++--------------- ...e_f16_f16_f16_compute_f32_mnn_instance.cpp | 64 +++--------------- ...xdl_c_shuffle_f16_f16_f16_kkn_instance.cpp | 63 +++--------------- ...xdl_c_shuffle_f16_f16_f16_knn_instance.cpp | 63 +++--------------- ...xdl_c_shuffle_f16_f16_f16_mkn_instance.cpp | 63 +++--------------- ...xdl_c_shuffle_f16_f16_f16_mnn_instance.cpp | 63 +++--------------- ..._f32_f32_f32_compute_bf16_kkn_instance.cpp | 64 +++--------------- ..._f32_f32_f32_compute_bf16_knn_instance.cpp | 64 +++--------------- ..._f32_f32_f32_compute_bf16_mkn_instance.cpp | 64 +++--------------- ..._f32_f32_f32_compute_bf16_mnn_instance.cpp | 64 +++--------------- ...e_f32_f32_f32_compute_f16_kkn_instance.cpp | 64 +++--------------- ...e_f32_f32_f32_compute_f16_knn_instance.cpp | 64 +++--------------- ...e_f32_f32_f32_compute_f16_mkn_instance.cpp | 64 +++--------------- ...e_f32_f32_f32_compute_f16_mnn_instance.cpp | 64 +++--------------- ...xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp | 63 +++--------------- ...xdl_c_shuffle_f32_f32_f32_knn_instance.cpp | 63 +++--------------- ...xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp | 63 +++--------------- ...xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp | 63 +++--------------- ...e_f64_f64_f64_compute_f32_kkn_instance.cpp | 64 +++--------------- ...e_f64_f64_f64_compute_f32_knn_instance.cpp | 64 +++--------------- ...e_f64_f64_f64_compute_f32_mkn_instance.cpp | 64 +++--------------- 
...e_f64_f64_f64_compute_f32_mnn_instance.cpp | 64 +++--------------- ...xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp | 63 +++--------------- ...xdl_c_shuffle_f64_f64_f64_knn_instance.cpp | 63 +++--------------- ...xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp | 63 +++--------------- ...xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp | 63 +++--------------- 108 files changed, 972 insertions(+), 5910 deletions(-) diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp index 961b78427f4..26e9a1801bc 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance = - device_contraction_kk_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_kknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + bilinear, Bilinear, 6, bf16_bf16_bf16_bf16_compute_f32_kknn, + BF16, BF16, F32, BF16, BF16_Tuple, BF16, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp index 5cd869249de..419b1ce339f 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance = - device_contraction_kn_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_knnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + bilinear, Bilinear, 6, bf16_bf16_bf16_bf16_compute_f32_knnn, + BF16, BF16, F32, BF16, BF16_Tuple, BF16, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp index aa8ad904a5a..9b6490cfda1 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance = - device_contraction_mk_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + bilinear, Bilinear, 6, bf16_bf16_bf16_bf16_compute_f32_mknn, + BF16, BF16, F32, BF16, BF16_Tuple, BF16, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp index 80b4de60607..931820ecb8c 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_compute_f32_mnnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + bilinear, Bilinear, 6, bf16_bf16_bf16_bf16_compute_f32_mnnn, + BF16, BF16, F32, BF16, BF16_Tuple, BF16, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_kknn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_kknn_instance.cpp index 77fae91ffee..35b76bb568c 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_kknn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_kknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_kknn_instance = - device_contraction_kk_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_kknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_kknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + bilinear, Bilinear, 6, bf16_bf16_bf16_bf16_kknn, + BF16, BF16, F32, BF16, BF16_Tuple, BF16, BF16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_knnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_knnn_instance.cpp index 9b8cacc5e15..7a558ca4a89 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_knnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_knnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_knnn_instance = - device_contraction_kn_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_knnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_knnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + bilinear, Bilinear, 6, bf16_bf16_bf16_bf16_knnn, + BF16, BF16, F32, BF16, BF16_Tuple, BF16, BF16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_mknn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_mknn_instance.cpp index 50a76452564..020ac2ca394 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_mknn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_mknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_mknn_instance = - device_contraction_mk_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_mknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_mknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + bilinear, Bilinear, 6, bf16_bf16_bf16_bf16_mknn, + BF16, BF16, F32, BF16, BF16_Tuple, BF16, BF16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_mnnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_mnnn_instance.cpp index 78aa99fa6e5..c2132039278 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_mnnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_mnnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_mnnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_mnnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_bf16_mnnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + bilinear, Bilinear, 6, bf16_bf16_bf16_bf16_mnnn, + BF16, BF16, F32, BF16, BF16_Tuple, BF16, BF16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp index 2342b0db676..0896074b15d 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance = - device_contraction_kk_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_kknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + bilinear, Bilinear, 6, f16_f16_f16_f16_compute_f32_kknn, + F16, F16, F32, F16, F16_Tuple, F16, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp index 130d56c5ca2..b9b7e22544e 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance = - device_contraction_kn_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_knnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + bilinear, Bilinear, 6, f16_f16_f16_f16_compute_f32_knnn, + F16, F16, F32, F16, F16_Tuple, F16, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp index 90222accc15..86affeec002 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance = - device_contraction_mk_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + bilinear, Bilinear, 6, f16_f16_f16_f16_compute_f32_mknn, + F16, F16, F32, F16, F16_Tuple, F16, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp index 9b731a95cf4..2315f611682 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_compute_f32_mnnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + bilinear, Bilinear, 6, f16_f16_f16_f16_compute_f32_mnnn, + F16, F16, F32, F16, F16_Tuple, F16, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_kknn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_kknn_instance.cpp index e738e54f069..dae7e5780a1 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_kknn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_kknn_instance.cpp @@ -1,60 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_kknn_instance = - device_contraction_kk_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_kknn_instance( - std::vector>>& instances) -{ - printf("[CK_DEBUG] f16+f16+f16+f16_kknn_instance: before add, size=%zu\n", instances.size()); - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_kknn_instance{}); - printf("[CK_DEBUG] f16+f16+f16+f16_kknn_instance: after add, size=%zu\n", instances.size()); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + bilinear, Bilinear, 6, f16_f16_f16_f16_kknn, + F16, F16, F32, F16, F16_Tuple, F16, F16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_knnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_knnn_instance.cpp index 4bc5b1684a4..319f5a87de4 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_knnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_knnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_knnn_instance = - device_contraction_kn_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_knnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_knnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + bilinear, Bilinear, 6, f16_f16_f16_f16_knnn, + F16, F16, F32, F16, F16_Tuple, F16, F16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_mknn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_mknn_instance.cpp index e320fbe11a9..03739391cdf 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_mknn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_mknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_mknn_instance = - device_contraction_mk_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_mknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_mknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + bilinear, Bilinear, 6, f16_f16_f16_f16_mknn, + F16, F16, F32, F16, F16_Tuple, F16, F16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_mnnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_mnnn_instance.cpp index bbb90a6af42..d40fcae6fff 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_mnnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_mnnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_mnnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_mnnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_f16_mnnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + bilinear, Bilinear, 6, f16_f16_f16_f16_mnnn, + F16, F16, F32, F16, F16_Tuple, F16, F16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp index b95aa0d5ba2..36e8a19263f 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance = - device_contraction_kk_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_kknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + bilinear, Bilinear, 6, f32_f32_f32_f32_compute_bf16_kknn, + F32, F32, F32, F32, F32_Tuple, F32, BF16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp index e2f62c23425..8b3d2c6420b 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance = - device_contraction_kn_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_knnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + bilinear, Bilinear, 6, f32_f32_f32_f32_compute_bf16_knnn, + F32, F32, F32, F32, F32_Tuple, F32, BF16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance.cpp index 80b6b6ecf87..7c6a8b8d83b 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance = - device_contraction_mk_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + bilinear, Bilinear, 6, f32_f32_f32_f32_compute_bf16_mknn, + F32, F32, F32, F32, F32_Tuple, F32, BF16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance.cpp index 181ad86e1bb..8b08570f6c1 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_bf16_mnnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + bilinear, Bilinear, 6, f32_f32_f32_f32_compute_bf16_mnnn, + F32, F32, F32, F32, F32_Tuple, F32, BF16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance.cpp index 514da56a0f9..881436f505b 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance = - device_contraction_kk_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_kknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + bilinear, Bilinear, 6, f32_f32_f32_f32_compute_f16_kknn, + F32, F32, F32, F32, F32_Tuple, F32, F16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance.cpp index 61dda90cbca..6b2d7b14c50 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance = - device_contraction_kn_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_knnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + bilinear, Bilinear, 6, f32_f32_f32_f32_compute_f16_knnn, + F32, F32, F32, F32, F32_Tuple, F32, F16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance.cpp index 301bde04b81..bb91b6879be 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance = - device_contraction_mk_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + bilinear, Bilinear, 6, f32_f32_f32_f32_compute_f16_mknn, + F32, F32, F32, F32, F32_Tuple, F32, F16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance.cpp index 09dbdff021e..d35107af675 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_compute_f16_mnnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + bilinear, Bilinear, 6, f32_f32_f32_f32_compute_f16_mnnn, + F32, F32, F32, F32, F32_Tuple, F32, F16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp index fe7b520219f..f56045888af 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance = - device_contraction_kk_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + bilinear, Bilinear, 6, f32_f32_f32_f32_kknn, + F32, F32, F32, F32, F32_Tuple, F32, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp index c99a1439e18..5a591fb479b 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance = - device_contraction_kn_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + bilinear, Bilinear, 6, f32_f32_f32_f32_knnn, + F32, F32, F32, F32, F32_Tuple, F32, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp index 7ae0833b19a..42010cb957b 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance = - device_contraction_mk_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + bilinear, Bilinear, 6, f32_f32_f32_f32_mknn, + F32, F32, F32, F32, F32_Tuple, F32, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp index f0cd251985d..ca015c306dc 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + bilinear, Bilinear, 6, f32_f32_f32_f32_mnnn, + F32, F32, F32, F32, F32_Tuple, F32, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp index a14b00a7f20..3254d2a5f1d 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance = - device_contraction_f64_kk_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_kknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_kk_instance, + bilinear, Bilinear, 6, f64_f64_f64_f64_compute_f32_kknn, + F64, F64, F32, F64, F64_Tuple, F64, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp index e7194022514..a2831f07608 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance = - device_contraction_f64_kn_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_knnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_kn_instance, + bilinear, Bilinear, 6, f64_f64_f64_f64_compute_f32_knnn, + F64, F64, F32, F64, F64_Tuple, F64, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp index d093671e25e..cede3aa1a4c 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance = - device_contraction_f64_mk_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_mk_instance, + bilinear, Bilinear, 6, f64_f64_f64_f64_compute_f32_mknn, + F64, F64, F32, F64, F64_Tuple, F64, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance.cpp index 3e0ac565e26..bbee01fa581 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance = - device_contraction_f64_mn_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_compute_f32_mnnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_mn_instance, + bilinear, Bilinear, 6, f64_f64_f64_f64_compute_f32_mnnn, + F64, F64, F32, F64, F64_Tuple, F64, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp index c4c8cd13d57..c6fc9eecf3a 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance = - device_contraction_f64_kk_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_kk_instance, + bilinear, Bilinear, 6, f64_f64_f64_f64_kknn, + F64, F64, F64, F64, F64_Tuple, F64, F64) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp index 7e056c48243..4c0dabed1a6 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance = - device_contraction_f64_kn_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_kn_instance, + bilinear, Bilinear, 6, f64_f64_f64_f64_knnn, + F64, F64, F64, F64, F64_Tuple, F64, F64) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp index dd11af63b46..7154fa88016 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance = - device_contraction_f64_mk_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_mk_instance, + bilinear, Bilinear, 6, f64_f64_f64_f64_mknn, + F64, F64, F64, F64, F64_Tuple, F64, F64) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp index 990e862e77c..bd24c620e30 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_bilinear/6D/device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance = - device_contraction_f64_mn_instance; - -void add_device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_bilinear_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_mn_instance, + bilinear, Bilinear, 6, f64_f64_f64_f64_mnnn, + F64, F64, F64, F64, F64_Tuple, F64, F64) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp index a3acedbcc40..a0ff8391d21 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance = - device_contraction_kk_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + scale, Scale, 2, bf16_bf16_bf16_compute_f32_kkn, + BF16, BF16, F32, BF16, Empty_Tuple, BF16, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp index c5c365ec26c..bf5a255afde 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance = - device_contraction_kn_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + scale, Scale, 2, bf16_bf16_bf16_compute_f32_knn, + BF16, BF16, F32, BF16, Empty_Tuple, BF16, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp index 58ab346942e..8c26b797a76 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance = - device_contraction_mk_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + scale, Scale, 2, bf16_bf16_bf16_compute_f32_mkn, + BF16, BF16, F32, BF16, Empty_Tuple, BF16, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp index 8c9f6fc57b9..c93b43da7bf 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + scale, Scale, 2, bf16_bf16_bf16_compute_f32_mnn, + BF16, BF16, F32, BF16, Empty_Tuple, BF16, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_kkn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_kkn_instance.cpp index c85f8cc998f..9d32d0eb459 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_kkn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_kkn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_kkn_instance = - device_contraction_kk_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_kkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_kkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + scale, Scale, 2, bf16_bf16_bf16_kkn, + BF16, BF16, F32, BF16, Empty_Tuple, BF16, BF16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_knn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_knn_instance.cpp index d4a25d40cbb..8474e996c2e 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_knn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_knn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_knn_instance = - device_contraction_kn_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_knn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_knn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + scale, Scale, 2, bf16_bf16_bf16_knn, + BF16, BF16, F32, BF16, Empty_Tuple, BF16, BF16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_mkn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_mkn_instance.cpp index 7be8a0a694e..6c8c7ac837e 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_mkn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_mkn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_mkn_instance = - device_contraction_mk_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_mkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_mkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + scale, Scale, 2, bf16_bf16_bf16_mkn, + BF16, BF16, F32, BF16, Empty_Tuple, BF16, BF16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_mnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_mnn_instance.cpp index b2a4c020e67..e971273a2f7 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_mnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_mnn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_mnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_mnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_bf16_bf16_bf16_mnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + scale, Scale, 2, bf16_bf16_bf16_mnn, + BF16, BF16, F32, BF16, Empty_Tuple, BF16, BF16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp index 9a9d3e16fb6..8026a5f3b95 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance = - device_contraction_kk_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + scale, Scale, 2, f16_f16_f16_compute_f32_kkn, + F16, F16, F32, F16, Empty_Tuple, F16, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp index d158d5eb99d..69747495467 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance = - device_contraction_kn_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + scale, Scale, 2, f16_f16_f16_compute_f32_knn, + F16, F16, F32, F16, Empty_Tuple, F16, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp index a263d0b8ca4..fb80ab9df1a 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance = - device_contraction_mk_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + scale, Scale, 2, f16_f16_f16_compute_f32_mkn, + F16, F16, F32, F16, Empty_Tuple, F16, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp index eb9fa3714e0..87f337c67f6 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + scale, Scale, 2, f16_f16_f16_compute_f32_mnn, + F16, F16, F32, F16, Empty_Tuple, F16, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_kkn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_kkn_instance.cpp index 52042dd0451..e8de33728bc 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_kkn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_kkn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_kkn_instance = - device_contraction_kk_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_kkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_kkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + scale, Scale, 2, f16_f16_f16_kkn, + F16, F16, F32, F16, Empty_Tuple, F16, F16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_knn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_knn_instance.cpp index 2b6aed8ed4f..e87816b00fc 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_knn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_knn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_knn_instance = - device_contraction_kn_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_knn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_knn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + scale, Scale, 2, f16_f16_f16_knn, + F16, F16, F32, F16, Empty_Tuple, F16, F16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_mkn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_mkn_instance.cpp index 07cbbf87c68..2e13b536f2a 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_mkn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_mkn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_mkn_instance = - device_contraction_mk_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_mkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_mkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + scale, Scale, 2, f16_f16_f16_mkn, + F16, F16, F32, F16, Empty_Tuple, F16, F16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_mnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_mnn_instance.cpp index 2cc4bfb718d..eccce81df91 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_mnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_mnn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_mnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_mnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f16_f16_f16_mnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + scale, Scale, 2, f16_f16_f16_mnn, + F16, F16, F32, F16, Empty_Tuple, F16, F16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance.cpp index 50fe1a696f1..6464ffeddc4 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance = - device_contraction_kk_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + scale, Scale, 2, f32_f32_f32_compute_bf16_kkn, + F32, F32, F32, F32, Empty_Tuple, F32, BF16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance.cpp index 6aab79f3126..26bf6075590 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance = - device_contraction_kn_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + scale, Scale, 2, f32_f32_f32_compute_bf16_knn, + F32, F32, F32, F32, Empty_Tuple, F32, BF16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance.cpp index e6f24424abd..e236ad71f4f 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance = - device_contraction_mk_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + scale, Scale, 2, f32_f32_f32_compute_bf16_mkn, + F32, F32, F32, F32, Empty_Tuple, F32, BF16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance.cpp index 60b760bfce1..3ccd1820e00 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + scale, Scale, 2, f32_f32_f32_compute_bf16_mnn, + F32, F32, F32, F32, Empty_Tuple, F32, BF16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance.cpp index 19992c96fd0..f60ef81681c 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance = - device_contraction_kk_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + scale, Scale, 2, f32_f32_f32_compute_f16_kkn, + F32, F32, F32, F32, Empty_Tuple, F32, F16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance.cpp index a13e315e381..da0ffaf8f0b 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance = - device_contraction_kn_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + scale, Scale, 2, f32_f32_f32_compute_f16_knn, + F32, F32, F32, F32, Empty_Tuple, F32, F16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance.cpp index 3b4aaa7a5b8..a1567d9c827 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance = - device_contraction_mk_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + scale, Scale, 2, f32_f32_f32_compute_f16_mkn, + F32, F32, F32, F32, Empty_Tuple, F32, F16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp index 48e190574fa..098602f203a 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + scale, Scale, 2, f32_f32_f32_compute_f16_mnn, + F32, F32, F32, F32, Empty_Tuple, F32, F16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp index 1b8bceb65de..483b4eb8690 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance = - device_contraction_kk_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + scale, Scale, 2, f32_f32_f32_kkn, + F32, F32, F32, F32, Empty_Tuple, F32, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp index a09ebae1dd9..71b17712b30 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance = - device_contraction_kn_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + scale, Scale, 2, f32_f32_f32_knn, + F32, F32, F32, F32, Empty_Tuple, F32, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp index 4172958f2af..91b6b1d9274 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance = - device_contraction_mk_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + scale, Scale, 2, f32_f32_f32_mkn, + F32, F32, F32, F32, Empty_Tuple, F32, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp index c8c9ce43480..cbba0786e29 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + scale, Scale, 2, f32_f32_f32_mnn, + F32, F32, F32, F32, Empty_Tuple, F32, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp index bb44557ba80..dcd7cf50c4f 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance = - device_contraction_f64_kk_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_kk_instance, + scale, Scale, 2, f64_f64_f64_compute_f32_kkn, + F64, F64, F32, F64, Empty_Tuple, F64, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp index 91c96bd6798..13ac1b4cbb7 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance = - device_contraction_f64_kn_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_kn_instance, + scale, Scale, 2, f64_f64_f64_compute_f32_knn, + F64, F64, F32, F64, Empty_Tuple, F64, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp index 0fe142fc59a..e012e157a7f 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance = - device_contraction_f64_mk_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_mk_instance, + scale, Scale, 2, f64_f64_f64_compute_f32_mkn, + F64, F64, F32, F64, Empty_Tuple, F64, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp index 28d337d2463..5bda2368564 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance = - device_contraction_f64_mn_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_mn_instance, + scale, Scale, 2, f64_f64_f64_compute_f32_mnn, + F64, F64, F32, F64, Empty_Tuple, F64, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp index 39e29cd3e8e..8ab00c937c1 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance = - device_contraction_f64_kk_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_kk_instance, + scale, Scale, 2, f64_f64_f64_kkn, + F64, F64, F64, F64, Empty_Tuple, F64, F64) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp index ef4dd284e52..fb33d7d7619 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance = - device_contraction_f64_kn_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_kn_instance, + scale, Scale, 2, f64_f64_f64_knn, + F64, F64, F64, F64, Empty_Tuple, F64, F64) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp index 78effae8e27..571cea261e9 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance = - device_contraction_f64_mk_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_mk_instance, + scale, Scale, 2, f64_f64_f64_mkn, + F64, F64, F64, F64, Empty_Tuple, F64, F64) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp index 465a80b1b0a..9847c021d58 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/2D/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance = - device_contraction_f64_mn_instance; - -void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_mn_instance, + scale, Scale, 2, f64_f64_f64_mnn, + F64, F64, F64, F64, Empty_Tuple, F64, F64) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp index a472f793e44..134fca49365 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance = - device_contraction_kk_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + scale, Scale, 6, bf16_bf16_bf16_compute_f32_kkn, + BF16, BF16, F32, BF16, Empty_Tuple, BF16, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp index c4bddd6c6ec..062f8468f70 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance = - device_contraction_kn_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + scale, Scale, 6, bf16_bf16_bf16_compute_f32_knn, + BF16, BF16, F32, BF16, Empty_Tuple, BF16, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp index 3a1c9c3fb91..c6b7784f279 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance = - device_contraction_mk_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + scale, Scale, 6, bf16_bf16_bf16_compute_f32_mkn, + BF16, BF16, F32, BF16, Empty_Tuple, BF16, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp index d23c0051910..30f483036a7 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + scale, Scale, 6, bf16_bf16_bf16_compute_f32_mnn, + BF16, BF16, F32, BF16, Empty_Tuple, BF16, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_kkn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_kkn_instance.cpp index 9244f6a132c..9118dba4f17 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_kkn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_kkn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_kkn_instance = - device_contraction_kk_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_kkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_kkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + scale, Scale, 6, bf16_bf16_bf16_kkn, + BF16, BF16, F32, BF16, Empty_Tuple, BF16, BF16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_knn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_knn_instance.cpp index 99e80e0e28d..713eff33cbf 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_knn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_knn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_knn_instance = - device_contraction_kn_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_knn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_knn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + scale, Scale, 6, bf16_bf16_bf16_knn, + BF16, BF16, F32, BF16, Empty_Tuple, BF16, BF16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_mkn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_mkn_instance.cpp index 77ca8c0d16b..1b78e11f70d 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_mkn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_mkn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_mkn_instance = - device_contraction_mk_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_mkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_mkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + scale, Scale, 6, bf16_bf16_bf16_mkn, + BF16, BF16, F32, BF16, Empty_Tuple, BF16, BF16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_mnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_mnn_instance.cpp index 564fe537bb4..2a70c27f207 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_mnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_mnn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_mnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_mnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_bf16_bf16_bf16_mnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + scale, Scale, 6, bf16_bf16_bf16_mnn, + BF16, BF16, F32, BF16, Empty_Tuple, BF16, BF16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp index 69f074caf01..80bc1cbe724 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance = - device_contraction_kk_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + scale, Scale, 6, f16_f16_f16_compute_f32_kkn, + F16, F16, F32, F16, Empty_Tuple, F16, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp index dbad11727c6..5564fcb64ff 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance = - device_contraction_kn_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + scale, Scale, 6, f16_f16_f16_compute_f32_knn, + F16, F16, F32, F16, Empty_Tuple, F16, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp index a53e7801ea0..19c73e48b85 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance = - device_contraction_mk_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + scale, Scale, 6, f16_f16_f16_compute_f32_mkn, + F16, F16, F32, F16, Empty_Tuple, F16, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp index 977497d387a..1acb62c960d 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + scale, Scale, 6, f16_f16_f16_compute_f32_mnn, + F16, F16, F32, F16, Empty_Tuple, F16, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_kkn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_kkn_instance.cpp index dfc187562a3..28d2d84510f 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_kkn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_kkn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_kkn_instance = - device_contraction_kk_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_kkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_kkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + scale, Scale, 6, f16_f16_f16_kkn, + F16, F16, F32, F16, Empty_Tuple, F16, F16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_knn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_knn_instance.cpp index 50d951a99c6..ba247621ff8 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_knn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_knn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_knn_instance = - device_contraction_kn_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_knn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_knn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + scale, Scale, 6, f16_f16_f16_knn, + F16, F16, F32, F16, Empty_Tuple, F16, F16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_mkn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_mkn_instance.cpp index 460c5c4b492..32d601c9b7f 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_mkn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_mkn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_mkn_instance = - device_contraction_mk_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_mkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_mkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + scale, Scale, 6, f16_f16_f16_mkn, + F16, F16, F32, F16, Empty_Tuple, F16, F16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_mnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_mnn_instance.cpp index bee17f3386b..fb66208b936 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_mnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_mnn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_mnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_mnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f16_f16_f16_mnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + scale, Scale, 6, f16_f16_f16_mnn, + F16, F16, F32, F16, Empty_Tuple, F16, F16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance.cpp index 5f737132afc..c78f64bfcae 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance = - device_contraction_kk_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + scale, Scale, 6, f32_f32_f32_compute_bf16_kkn, + F32, F32, F32, F32, Empty_Tuple, F32, BF16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance.cpp index 1dbebe89f79..fde6062baae 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance = - device_contraction_kn_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + scale, Scale, 6, f32_f32_f32_compute_bf16_knn, + F32, F32, F32, F32, Empty_Tuple, F32, BF16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance.cpp index 4c609db46aa..7d3ae3348ed 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance = - device_contraction_mk_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + scale, Scale, 6, f32_f32_f32_compute_bf16_mkn, + F32, F32, F32, F32, Empty_Tuple, F32, BF16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance.cpp index 9005335eafd..899ba7aac5a 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + scale, Scale, 6, f32_f32_f32_compute_bf16_mnn, + F32, F32, F32, F32, Empty_Tuple, F32, BF16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance.cpp index 4623b2e5d80..afc0c0a588b 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance = - device_contraction_kk_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + scale, Scale, 6, f32_f32_f32_compute_f16_kkn, + F32, F32, F32, F32, Empty_Tuple, F32, F16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance.cpp index 952ad237a89..7d084a8b45e 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance = - device_contraction_kn_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + scale, Scale, 6, f32_f32_f32_compute_f16_knn, + F32, F32, F32, F32, Empty_Tuple, F32, F16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance.cpp index 8273c319b82..821bc2798fa 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance = - device_contraction_mk_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + scale, Scale, 6, f32_f32_f32_compute_f16_mkn, + F32, F32, F32, F32, Empty_Tuple, F32, F16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp index cf22f7a729c..3fe62bb1174 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + scale, Scale, 6, f32_f32_f32_compute_f16_mnn, + F32, F32, F32, F32, Empty_Tuple, F32, F16) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp index a4659d4d904..a2945335564 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_kkn_instance = - device_contraction_kk_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_kkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_kkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kk_instance, + scale, Scale, 6, f32_f32_f32_kkn, + F32, F32, F32, F32, Empty_Tuple, F32, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp index 72adf0f03db..fa38bc2ef82 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_knn_instance = - device_contraction_kn_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_knn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_knn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_kn_instance, + scale, Scale, 6, f32_f32_f32_knn, + F32, F32, F32, F32, Empty_Tuple, F32, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp index d70c2bb4c5f..5752bc169ab 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_mkn_instance = - device_contraction_mk_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_mkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_mkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mk_instance, + scale, Scale, 6, f32_f32_f32_mkn, + F32, F32, F32, F32, Empty_Tuple, F32, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp index 7fa3458ab01..1cae73eb8a5 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_mnn_instance = - device_contraction_mn_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_mnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_mnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_mn_instance, + scale, Scale, 6, f32_f32_f32_mnn, + F32, F32, F32, F32, Empty_Tuple, F32, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp index 877545e3382..1f171a14135 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance = - device_contraction_f64_kk_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_kk_instance, + scale, Scale, 6, f64_f64_f64_compute_f32_kkn, + F64, F64, F32, F64, Empty_Tuple, F64, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp index df51431b233..66a8eae4279 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance = - device_contraction_f64_kn_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_kn_instance, + scale, Scale, 6, f64_f64_f64_compute_f32_knn, + F64, F64, F32, F64, Empty_Tuple, F64, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp index 3bbdf848658..9c5e9fd1bbc 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance = - device_contraction_f64_mk_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_mk_instance, + scale, Scale, 6, f64_f64_f64_compute_f32_mkn, + F64, F64, F32, F64, Empty_Tuple, F64, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp index 127c47c5a37..579e9559739 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp @@ -1,58 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance = - device_contraction_f64_mn_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, - device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_mn_instance, + scale, Scale, 6, f64_f64_f64_compute_f32_mnn, + F64, F64, F32, F64, Empty_Tuple, F64, F32) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp index f05a685d177..c3357a6f910 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_kkn_instance = - device_contraction_f64_kk_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_kkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_kkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_kk_instance, + scale, Scale, 6, f64_f64_f64_kkn, + F64, F64, F64, F64, Empty_Tuple, F64, F64) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp index 34bc800fcf9..447db7fab4d 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// k/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_knn_instance = - device_contraction_f64_kn_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_knn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_knn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_kn_instance, + scale, Scale, 6, f64_f64_f64_knn, + F64, F64, F64, F64, Empty_Tuple, F64, F64) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp index 180d1b52737..059689ff5e6 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/k/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_mkn_instance = - device_contraction_f64_mk_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_mkn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_mkn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_mk_instance, + scale, Scale, 6, f64_f64_f64_mkn, + F64, F64, F64, F64, Empty_Tuple, F64, F64) +// clang-format on diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp index bb6f5c66856..393b7ac6f34 100644 --- a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp @@ -1,57 +1,12 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -// This (ifndef) is a hack to use customized behavior for buffer load rather than using default -// setting Don't use this hack unless absolutely necessary! 
-// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op -#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 - -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] -// m/n/n/n are the fast changing dimension for A/B/D/E -using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_mnn_instance = - device_contraction_f64_mn_instance; - -void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_mnn_instance( - std::vector>>& instances) -{ - add_device_operation_instances( - instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_mnn_instance{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck +#include "../../contraction/contraction_instance_common.hpp" + +// Instantiate contraction device operation and register via add_device_* function. +// See contraction_instance_common.hpp for macro definition and parameter documentation. 
+// clang-format off +CK_CONTRACTION_INSTANCE(device_contraction_f64_mn_instance, + scale, Scale, 6, f64_f64_f64_mnn, + F64, F64, F64, F64, Empty_Tuple, F64, F64) +// clang-format on From 30980381af636a2fa99c568c04360689a4d3995b Mon Sep 17 00:00:00 2001 From: Aviral Goel Date: Thu, 9 Apr 2026 22:11:45 -0400 Subject: [PATCH 3/3] Add CMakeLists.txt to contraction/ header-only directory The parent CMakeLists.txt globs all subdirectories (line 218: file(GLOB dir_list LIST_DIRECTORIES true *)) and attempts to file(READ .../contraction/CMakeLists.txt) for each. Without this file, CMake fails: file failed to open for reading: .../gpu/contraction/CMakeLists.txt The contraction/ directory is header-only (contraction_instance_common.hpp). Adding a comment-only CMakeLists.txt satisfies the glob/read requirement. The directory defines no targets, so add_subdirectory() processes it harmlessly. Fixes Jenkins CI failure on PR #6325. --- .../gpu/contraction/CMakeLists.txt | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction/CMakeLists.txt diff --git a/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction/CMakeLists.txt b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction/CMakeLists.txt new file mode 100644 index 00000000000..cd0d93c5e90 --- /dev/null +++ b/projects/composablekernel/library/src/tensor_operation_instance/gpu/contraction/CMakeLists.txt @@ -0,0 +1,6 @@ +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +# This directory contains only shared header files (contraction_instance_common.hpp). +# There are no source files to compile here — the header is included by the +# contraction_bilinear/ and contraction_scale/ instance directories.