Skip to content
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
*.swp
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ find_package(HIP REQUIRED)
message(STATUS "Build with HIP ${hip_VERSION}")

## half
#find_path(HALF_INCLUDE_DIR half.hpp)
find_path(HALF_INCLUDE_DIR half.hpp)
message("HALF_INCLUDE_DIR: ${HALF_INCLUDE_DIR}")

# CMAKE_CXX_FLAGS
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
#ifndef CK_TRANSFORM_INTO_CONV_OUTPUT_HPP
#define CK_TRANSFORM_INTO_CONV_OUTPUT_HPP

#include "multi_index_transform_helper.hpp"
#include "common_header.hpp"
#include "tensor_descriptor.hpp"

namespace ck {

/*
* This functors are used to fuse convolution with some other operators. For example,
* in order to fuse conv + depth2space, the output of depth2space has to be transformed
* into the output of convolution.
*
* TODO: Use universal reference parameter in functor operators?
*/

template < index_t BlockSize>
struct TransformDepth2SpaceToConvolution_nhwc;

struct NoTransform
{
template <typename... DescArgs>
__host__ __device__ constexpr auto operator () (
const TensorDescriptor<DescArgs...>& conv_out)
{
return conv_out;
}
};

template <>
struct TransformDepth2SpaceToConvolution_nhwc<1>
{
template <typename... DescArgs>
__host__ __device__ constexpr auto operator () (
const TensorDescriptor<DescArgs...>& conv_out)
{
return conv_out;
}

};

template <index_t BlockSize>
struct TransformDepth2SpaceToConvolution_nhwc
{
template <typename... DescArgs>
__host__ __device__ constexpr auto operator () (
const TensorDescriptor<DescArgs...>& depth2space_n_hobs_wobs_c_desc)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};

const auto N = depth2space_n_hobs_wobs_c_desc.GetLength(I0);
const auto HoBs = depth2space_n_hobs_wobs_c_desc.GetLength(I1);
const auto WoBs = depth2space_n_hobs_wobs_c_desc.GetLength(I2);
const auto C = depth2space_n_hobs_wobs_c_desc.GetLength(I3);
assert(HoBs % BlockSize == 0);
assert(WoBs % BlockSize == 0);
const auto Ho = HoBs / BlockSize;
const auto Wo = WoBs / BlockSize;

const auto depth2space_n_ho_wo_b0_b1_c_desc = transform_tensor_descriptor(
depth2space_n_hobs_wobs_c_desc,
make_tuple(make_pass_through_transform(N),
make_unmerge_transform(make_tuple(Ho, BlockSize)),
make_unmerge_transform(make_tuple(Wo, BlockSize)),
make_pass_through_transform(C)),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1, 3>{}, Sequence<2, 4>{}, Sequence<5>{}));

const auto conv_out_n_ho_wo_k_desc = transform_tensor_descriptor(
depth2space_n_ho_wo_b0_b1_c_desc,
make_tuple(make_pass_through_transform(N),
make_pass_through_transform(Ho),
make_pass_through_transform(Wo),
make_merge_transform(make_tuple(BlockSize, BlockSize, C))),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3, 4, 5>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));

assert(conv_out_n_ho_wo_k_desc.GetLength(I0) == N);
assert(conv_out_n_ho_wo_k_desc.GetLength(I1)*BlockSize == HoBs);
assert(conv_out_n_ho_wo_k_desc.GetLength(I2)*BlockSize == WoBs);
assert(conv_out_n_ho_wo_k_desc.GetLength(I3) == C*BlockSize*BlockSize);

return conv_out_n_ho_wo_k_desc;
}
};

} // namespace ck
#endif

Original file line number Diff line number Diff line change
Expand Up @@ -197,14 +197,14 @@ struct TensorDescriptor
printf("{");
printf("TensorDescriptor, ");
static_for<0, ntransform_, 1>{}([&](auto i) {
printf("transforms: ");
printf("\ntransforms: \n");
transforms_[i].Print();
printf("LowerDimensionIds:");
printf("\nLowerDimensionIds:\n");
LowerDimensionIdss{}.At(i).Print();
printf("UpperDimensionIds:");
printf("\nUpperDimensionIds:\n");
UpperDimensionIdss{}.At(i).Print();
});
printf("}");
printf("}\n");

VisibleDimensionIds::Print();
}
Expand Down
3 changes: 3 additions & 0 deletions host/driver_offline/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,16 @@ set(CONV_FWD_DRIVER_OFFLINE_SOURCE src/conv_fwd_driver_offline.cpp)
set(CONV_BWD_DRIVER_OFFLINE_SOURCE src/conv_bwd_driver_offline.cpp)
set(CONV_WRW_DRIVER_OFFLINE_SOURCE src/conv_wrw_driver_offline.cpp)
set(GEMM_DRIVER_OFFLINE_SOURCE src/gemm_driver_offline.cpp)
set(DEPTH2SPACE_CONV src/conv_and_depth2space_driver_offline.cpp)

add_executable(conv_fwd_driver_offline ${CONV_FWD_DRIVER_OFFLINE_SOURCE})
add_executable(conv_bwd_driver_offline ${CONV_BWD_DRIVER_OFFLINE_SOURCE})
add_executable(conv_wrw_driver_offline ${CONV_WRW_DRIVER_OFFLINE_SOURCE})
add_executable(gemm_driver_offline ${GEMM_DRIVER_OFFLINE_SOURCE})
add_executable(depth2space ${DEPTH2SPACE_CONV})

target_link_libraries(conv_fwd_driver_offline PRIVATE host_tensor)
target_link_libraries(conv_bwd_driver_offline PRIVATE host_tensor)
target_link_libraries(conv_wrw_driver_offline PRIVATE host_tensor)
target_link_libraries(gemm_driver_offline PRIVATE host_tensor)
target_link_libraries(depth2space PRIVATE host_tensor)
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#include "device.hpp"
#include "host_tensor.hpp"
#include "transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp"
#include "transform_into_conv_output.hpp"
#include "driver_gemm_dlops_v1r3.hpp"

template <typename TInWei,
Expand All @@ -13,11 +14,12 @@ template <typename TInWei,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
typename InRightPads,
typename TransformToConvOutput=ck::NoTransform>
void device_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk(
const InLengths& in_n_hi_wi_c_lengths,
const WeiLengths& wei_k_y_x_c_lengths,
const OutLengths& out_n_ho_wo_k_lengths,
const OutLengths& out_lengths,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
Expand Down Expand Up @@ -46,7 +48,24 @@ void device_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk(

const auto in_n_hi_wi_c_desc = make_naive_tensor_descriptor_packed(in_n_hi_wi_c_lengths);
const auto wei_k_y_x_c_desc = make_naive_tensor_descriptor_packed(wei_k_y_x_c_lengths);
const auto out_n_ho_wo_k_desc = make_naive_tensor_descriptor_packed(out_n_ho_wo_k_lengths);
// const auto out_n_ho_wo_k_desc = make_naive_tensor_descriptor_packed(out_n_ho_wo_k_lengths);
const auto out_n_ho_wo_k_desc = TransformToConvOutput{}(make_naive_tensor_descriptor_packed(out_lengths));

// check tensor shape of convolution output
const auto N = in_n_hi_wi_c_lengths[I0];
const auto Hi = in_n_hi_wi_c_lengths[I1];
const auto Wi = in_n_hi_wi_c_lengths[I2];
const auto K = wei_k_y_x_c_lengths[I0];
const auto Y = wei_k_y_x_c_lengths[I1];
const auto X = wei_k_y_x_c_lengths[I2];
const index_t YEff = (Y - 1) * conv_dilations[I0] + 1;
const index_t XEff = (X - 1) * conv_dilations[I1] + 1;
const auto Ho = (Hi + in_left_pads[I0] + in_right_pads[I0] - YEff) / conv_strides[I0] + 1;
const auto Wo = (Wi + in_left_pads[I1] + in_right_pads[I1] - XEff) / conv_strides[I1] + 1;
assert(N == out_n_ho_wo_k_desc.GetLength(I0));
assert(Ho == out_n_ho_wo_k_desc.GetLength(I1));
assert(Wo == out_n_ho_wo_k_desc.GetLength(I2));
assert(K == out_n_ho_wo_k_desc.GetLength(I3));

#if 0
// [M, N, K0, K1] = [128, 128, 8, 1] for fp32
Expand Down Expand Up @@ -250,15 +269,15 @@ void device_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk(
nrepeat);

{
const auto N = out_n_ho_wo_k_lengths[I0];
const auto K = out_n_ho_wo_k_lengths[I3];
// const auto N = in_n_hi_wi_c_lengths[I0];
// const auto K = wei_k_y_x_c_lengths[I0];
const auto C = wei_k_y_x_c_lengths[I3];

const auto Ho = out_n_ho_wo_k_lengths[I1];
const auto Wo = out_n_ho_wo_k_lengths[I2];

const auto Y = wei_k_y_x_c_lengths[I1];
const auto X = wei_k_y_x_c_lengths[I2];
// const auto Ho = out_n_ho_wo_k_lengths[I1];
// const auto Wo = out_n_ho_wo_k_lengths[I2];
//
// const auto Y = wei_k_y_x_c_lengths[I1];
// const auto X = wei_k_y_x_c_lengths[I2];

float perf = static_cast<float>(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#include "device.hpp"
#include "host_tensor.hpp"
#include "transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp"
#include "transform_into_conv_output.hpp"
#include "driver_gemm_xdlops_v2r3.hpp"

template <typename TInWei,
Expand All @@ -13,11 +14,12 @@ template <typename TInWei,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
typename InRightPads,
typename TransformToConvOutput=ck::NoTransform>
void device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk(
const InLengths& in_n_hi_wi_c_lengths,
const WeiLengths& wei_k_y_x_c_lengths,
const OutLengths& out_n_ho_wo_k_lengths,
const OutLengths& out_lengths,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
Expand Down Expand Up @@ -46,7 +48,23 @@ void device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk(

const auto in_n_hi_wi_c_desc = make_naive_tensor_descriptor_packed(in_n_hi_wi_c_lengths);
const auto wei_k_y_x_c_desc = make_naive_tensor_descriptor_packed(wei_k_y_x_c_lengths);
const auto out_n_ho_wo_k_desc = make_naive_tensor_descriptor_packed(out_n_ho_wo_k_lengths);
const auto out_n_ho_wo_k_desc = TransformToConvOutput{}(make_naive_tensor_descriptor_packed(out_lengths));

// check tensor shape of convolution output
const auto N = in_n_hi_wi_c_lengths[I0];
const auto Hi = in_n_hi_wi_c_lengths[I1];
const auto Wi = in_n_hi_wi_c_lengths[I2];
const auto K = wei_k_y_x_c_lengths[I0];
const auto Y = wei_k_y_x_c_lengths[I1];
const auto X = wei_k_y_x_c_lengths[I2];
const index_t YEff = (Y - 1) * conv_dilations[I0] + 1;
const index_t XEff = (X - 1) * conv_dilations[I1] + 1;
const auto Ho = (Hi + in_left_pads[I0] + in_right_pads[I0] - YEff) / conv_strides[I0] + 1;
const auto Wo = (Wi + in_left_pads[I1] + in_right_pads[I1] - XEff) / conv_strides[I1] + 1;
assert(N == out_n_ho_wo_k_desc.GetLength(I0));
assert(Ho == out_n_ho_wo_k_desc.GetLength(I1));
assert(Wo == out_n_ho_wo_k_desc.GetLength(I2));
assert(K == out_n_ho_wo_k_desc.GetLength(I3));

#if 0
// [M, N, K0, K1] = [256, 128, 4, 4] for fp32
Expand Down Expand Up @@ -331,15 +349,15 @@ void device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk(
nrepeat);

{
const auto N = out_n_ho_wo_k_lengths[I0];
const auto K = out_n_ho_wo_k_lengths[I3];
// const auto N = out_n_ho_wo_k_lengths[I0];
// const auto K = out_n_ho_wo_k_lengths[I3];
const auto C = wei_k_y_x_c_lengths[I3];

const auto Ho = out_n_ho_wo_k_lengths[I1];
const auto Wo = out_n_ho_wo_k_lengths[I2];

const auto Y = wei_k_y_x_c_lengths[I1];
const auto X = wei_k_y_x_c_lengths[I2];
// const auto Ho = out_n_ho_wo_k_lengths[I1];
// const auto Wo = out_n_ho_wo_k_lengths[I2];
//
// const auto Y = wei_k_y_x_c_lengths[I1];
// const auto X = wei_k_y_x_c_lengths[I2];

float perf = static_cast<float>((std::size_t(2) * N * K * Ho * Wo * C * Y * X)) /
(std::size_t(1000) * 1000 * 1000) / ave_time;
Expand Down
Loading