ROCm · j4yan · Sep 20, 2021 · Sep 20, 2021 · Sep 20, 2021 · Sep 21, 2021
@@ -0,0 +1 @@
+*.swp
@@ -39,7 +39,7 @@ find_package(HIP REQUIRED)
 message(STATUS "Build with HIP ${hip_VERSION}")
 
 ## half
-#find_path(HALF_INCLUDE_DIR half.hpp)
+find_path(HALF_INCLUDE_DIR half.hpp)
 message("HALF_INCLUDE_DIR: ${HALF_INCLUDE_DIR}")
 
 # CMAKE_CXX_FLAGS

@@ -0,0 +1,93 @@
+#ifndef CK_TRANSFORM_INTO_CONV_OUTPUT_HPP
+#define CK_TRANSFORM_INTO_CONV_OUTPUT_HPP
+
+#include "multi_index_transform_helper.hpp"
+#include "common_header.hpp"
+#include "tensor_descriptor.hpp"
+
+namespace ck {
+
+/*
+ * This functors are used to fuse convolution with some other operators. For example, 
+ * in order to fuse conv + depth2space, the output of depth2space has to be transformed
+ * into the output of convolution.
+ *
+ * TODO: Use universal reference parameter in functor operators?
+ */
+
+template < index_t BlockSize>
+struct TransformDepth2SpaceToConvolution_nhwc;
+
+struct NoTransform
+{
+    template <typename... DescArgs>
+    __host__ __device__ constexpr auto operator () (
+    const TensorDescriptor<DescArgs...>& conv_out)
+    {
+        return conv_out;
+    }
+};
+
+template <>
+struct TransformDepth2SpaceToConvolution_nhwc<1>
+{
+    template <typename... DescArgs>
+    __host__ __device__ constexpr auto operator () (
+    const TensorDescriptor<DescArgs...>& conv_out)
+    {
+        return conv_out;
+    }
+
+};
+
+template <index_t BlockSize>
+struct TransformDepth2SpaceToConvolution_nhwc
+{
+    template <typename... DescArgs>
+    __host__ __device__ constexpr auto operator () (
+    const TensorDescriptor<DescArgs...>& depth2space_n_hobs_wobs_c_desc)
+    {
+        constexpr auto I0 = Number<0>{};
+        constexpr auto I1 = Number<1>{};
+        constexpr auto I2 = Number<2>{};
+        constexpr auto I3 = Number<3>{};
+
+        const auto N = depth2space_n_hobs_wobs_c_desc.GetLength(I0);
+        const auto HoBs = depth2space_n_hobs_wobs_c_desc.GetLength(I1);
+        const auto WoBs = depth2space_n_hobs_wobs_c_desc.GetLength(I2);
+        const auto C = depth2space_n_hobs_wobs_c_desc.GetLength(I3);
+        assert(HoBs % BlockSize == 0);
+        assert(WoBs % BlockSize == 0);
+        const auto Ho = HoBs / BlockSize;
+        const auto Wo = WoBs / BlockSize;
+
+        const auto depth2space_n_ho_wo_b0_b1_c_desc = transform_tensor_descriptor(
+            depth2space_n_hobs_wobs_c_desc,
+            make_tuple(make_pass_through_transform(N),
+                       make_unmerge_transform(make_tuple(Ho, BlockSize)),
+                       make_unmerge_transform(make_tuple(Wo, BlockSize)),
+                       make_pass_through_transform(C)),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+            make_tuple(Sequence<0>{}, Sequence<1, 3>{}, Sequence<2, 4>{}, Sequence<5>{}));
+
+        const auto conv_out_n_ho_wo_k_desc = transform_tensor_descriptor(
+            depth2space_n_ho_wo_b0_b1_c_desc,
+            make_tuple(make_pass_through_transform(N),
+                       make_pass_through_transform(Ho),
+                       make_pass_through_transform(Wo),
+                       make_merge_transform(make_tuple(BlockSize, BlockSize, C))),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3, 4, 5>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
+
+        assert(conv_out_n_ho_wo_k_desc.GetLength(I0) == N);
+        assert(conv_out_n_ho_wo_k_desc.GetLength(I1)*BlockSize == HoBs);
+        assert(conv_out_n_ho_wo_k_desc.GetLength(I2)*BlockSize == WoBs);
+        assert(conv_out_n_ho_wo_k_desc.GetLength(I3) == C*BlockSize*BlockSize);
+
+        return conv_out_n_ho_wo_k_desc;
+    }
+};
+
+} // namespace ck
+#endif
+
@@ -197,14 +197,14 @@ struct TensorDescriptor
         printf("{");
         printf("TensorDescriptor, ");
         static_for<0, ntransform_, 1>{}([&](auto i) {
-            printf("transforms: ");
+            printf("\ntransforms: \n");
             transforms_[i].Print();
-            printf("LowerDimensionIds:");
+            printf("\nLowerDimensionIds:\n");
             LowerDimensionIdss{}.At(i).Print();
-            printf("UpperDimensionIds:");
+            printf("\nUpperDimensionIds:\n");
             UpperDimensionIdss{}.At(i).Print();
         });
-        printf("}");
+        printf("}\n");
 
         VisibleDimensionIds::Print();
     }

@@ -15,13 +15,16 @@ set(CONV_FWD_DRIVER_OFFLINE_SOURCE src/conv_fwd_driver_offline.cpp)
 set(CONV_BWD_DRIVER_OFFLINE_SOURCE src/conv_bwd_driver_offline.cpp)
 set(CONV_WRW_DRIVER_OFFLINE_SOURCE src/conv_wrw_driver_offline.cpp)
 set(GEMM_DRIVER_OFFLINE_SOURCE src/gemm_driver_offline.cpp)
+set(DEPTH2SPACE_CONV src/conv_and_depth2space_driver_offline.cpp)
 
 add_executable(conv_fwd_driver_offline ${CONV_FWD_DRIVER_OFFLINE_SOURCE})
 add_executable(conv_bwd_driver_offline ${CONV_BWD_DRIVER_OFFLINE_SOURCE})
 add_executable(conv_wrw_driver_offline ${CONV_WRW_DRIVER_OFFLINE_SOURCE})
 add_executable(gemm_driver_offline ${GEMM_DRIVER_OFFLINE_SOURCE})
+add_executable(depth2space ${DEPTH2SPACE_CONV})
 
 target_link_libraries(conv_fwd_driver_offline PRIVATE host_tensor)
 target_link_libraries(conv_bwd_driver_offline PRIVATE host_tensor)
 target_link_libraries(conv_wrw_driver_offline PRIVATE host_tensor)
 target_link_libraries(gemm_driver_offline PRIVATE host_tensor)
+target_link_libraries(depth2space PRIVATE host_tensor)
@@ -2,6 +2,7 @@
 #include "device.hpp"
 #include "host_tensor.hpp"
 #include "transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp"
+#include "transform_into_conv_output.hpp"
 #include "driver_gemm_dlops_v1r3.hpp"
 
 template <typename TInWei,
@@ -13,11 +14,12 @@ template <typename TInWei,
           typename ConvStrides,
           typename ConvDilations,
           typename InLeftPads,
-          typename InRightPads>
+          typename InRightPads,
+          typename TransformToConvOutput=ck::NoTransform>
 void device_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk(
     const InLengths& in_n_hi_wi_c_lengths,
     const WeiLengths& wei_k_y_x_c_lengths,
-    const OutLengths& out_n_ho_wo_k_lengths,
+    const OutLengths& out_lengths,
     const ConvStrides& conv_strides,
     const ConvDilations& conv_dilations,
     const InLeftPads& in_left_pads,
@@ -46,7 +48,24 @@ void device_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk(
 
     const auto in_n_hi_wi_c_desc  = make_naive_tensor_descriptor_packed(in_n_hi_wi_c_lengths);
     const auto wei_k_y_x_c_desc   = make_naive_tensor_descriptor_packed(wei_k_y_x_c_lengths);
-    const auto out_n_ho_wo_k_desc = make_naive_tensor_descriptor_packed(out_n_ho_wo_k_lengths);
+    // const auto out_n_ho_wo_k_desc = make_naive_tensor_descriptor_packed(out_n_ho_wo_k_lengths);
+    const auto out_n_ho_wo_k_desc = TransformToConvOutput{}(make_naive_tensor_descriptor_packed(out_lengths));
+
+    // check tensor shape of convolution output
+    const auto N = in_n_hi_wi_c_lengths[I0];
+    const auto Hi = in_n_hi_wi_c_lengths[I1];
+    const auto Wi = in_n_hi_wi_c_lengths[I2];
+    const auto K = wei_k_y_x_c_lengths[I0];
+    const auto Y = wei_k_y_x_c_lengths[I1];
+    const auto X = wei_k_y_x_c_lengths[I2];
+    const index_t YEff = (Y - 1) * conv_dilations[I0] + 1;
+    const index_t XEff = (X - 1) * conv_dilations[I1] + 1;
+    const auto Ho = (Hi + in_left_pads[I0] + in_right_pads[I0] - YEff) / conv_strides[I0] + 1;
+    const auto Wo = (Wi + in_left_pads[I1] + in_right_pads[I1] - XEff) / conv_strides[I1] + 1;
+    assert(N == out_n_ho_wo_k_desc.GetLength(I0));
+    assert(Ho == out_n_ho_wo_k_desc.GetLength(I1));
+    assert(Wo == out_n_ho_wo_k_desc.GetLength(I2));
+    assert(K == out_n_ho_wo_k_desc.GetLength(I3));
 
 #if 0
     // [M, N, K0, K1] = [128, 128, 8, 1] for fp32
@@ -250,15 +269,15 @@ void device_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk(
             nrepeat);
 
         {
-            const auto N = out_n_ho_wo_k_lengths[I0];
-            const auto K = out_n_ho_wo_k_lengths[I3];
+            // const auto N = in_n_hi_wi_c_lengths[I0];
+            // const auto K = wei_k_y_x_c_lengths[I0];
             const auto C = wei_k_y_x_c_lengths[I3];
 
-            const auto Ho = out_n_ho_wo_k_lengths[I1];
-            const auto Wo = out_n_ho_wo_k_lengths[I2];
-
-            const auto Y = wei_k_y_x_c_lengths[I1];
-            const auto X = wei_k_y_x_c_lengths[I2];
+            // const auto Ho = out_n_ho_wo_k_lengths[I1];
+            // const auto Wo = out_n_ho_wo_k_lengths[I2];
+            //
+            // const auto Y = wei_k_y_x_c_lengths[I1];
+            // const auto X = wei_k_y_x_c_lengths[I2];
 
             float perf = static_cast<float>(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
                          (std::size_t(1000) * 1000 * 1000) / ave_time;

@@ -2,6 +2,7 @@
 #include "device.hpp"
 #include "host_tensor.hpp"
 #include "transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp"
+#include "transform_into_conv_output.hpp"
 #include "driver_gemm_xdlops_v2r3.hpp"
 
 template <typename TInWei,
@@ -13,11 +14,12 @@ template <typename TInWei,
           typename ConvStrides,
           typename ConvDilations,
           typename InLeftPads,
-          typename InRightPads>
+          typename InRightPads,
+          typename TransformToConvOutput=ck::NoTransform>
 void device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk(
     const InLengths& in_n_hi_wi_c_lengths,
     const WeiLengths& wei_k_y_x_c_lengths,
-    const OutLengths& out_n_ho_wo_k_lengths,
+    const OutLengths& out_lengths,
     const ConvStrides& conv_strides,
     const ConvDilations& conv_dilations,
     const InLeftPads& in_left_pads,
@@ -46,7 +48,23 @@ void device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk(
 
     const auto in_n_hi_wi_c_desc  = make_naive_tensor_descriptor_packed(in_n_hi_wi_c_lengths);
     const auto wei_k_y_x_c_desc   = make_naive_tensor_descriptor_packed(wei_k_y_x_c_lengths);
-    const auto out_n_ho_wo_k_desc = make_naive_tensor_descriptor_packed(out_n_ho_wo_k_lengths);
+    const auto out_n_ho_wo_k_desc = TransformToConvOutput{}(make_naive_tensor_descriptor_packed(out_lengths));
+
+    // check tensor shape of convolution output
+    const auto N = in_n_hi_wi_c_lengths[I0];
+    const auto Hi = in_n_hi_wi_c_lengths[I1];
+    const auto Wi = in_n_hi_wi_c_lengths[I2];
+    const auto K = wei_k_y_x_c_lengths[I0];
+    const auto Y = wei_k_y_x_c_lengths[I1];
+    const auto X = wei_k_y_x_c_lengths[I2];
+    const index_t YEff = (Y - 1) * conv_dilations[I0] + 1;
+    const index_t XEff = (X - 1) * conv_dilations[I1] + 1;
+    const auto Ho = (Hi + in_left_pads[I0] + in_right_pads[I0] - YEff) / conv_strides[I0] + 1;
+    const auto Wo = (Wi + in_left_pads[I1] + in_right_pads[I1] - XEff) / conv_strides[I1] + 1;
+    assert(N == out_n_ho_wo_k_desc.GetLength(I0));
+    assert(Ho == out_n_ho_wo_k_desc.GetLength(I1));
+    assert(Wo == out_n_ho_wo_k_desc.GetLength(I2));
+    assert(K == out_n_ho_wo_k_desc.GetLength(I3));
 
 #if 0
     // [M, N, K0, K1] = [256, 128, 4, 4] for fp32
@@ -331,15 +349,15 @@ void device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk(
               nrepeat);
 
         {
-            const auto N = out_n_ho_wo_k_lengths[I0];
-            const auto K = out_n_ho_wo_k_lengths[I3];
+            // const auto N = out_n_ho_wo_k_lengths[I0];
+            // const auto K = out_n_ho_wo_k_lengths[I3];
             const auto C = wei_k_y_x_c_lengths[I3];
 
-            const auto Ho = out_n_ho_wo_k_lengths[I1];
-            const auto Wo = out_n_ho_wo_k_lengths[I2];
-
-            const auto Y = wei_k_y_x_c_lengths[I1];
-            const auto X = wei_k_y_x_c_lengths[I2];
+            // const auto Ho = out_n_ho_wo_k_lengths[I1];
+            // const auto Wo = out_n_ho_wo_k_lengths[I2];
+            //
+            // const auto Y = wei_k_y_x_c_lengths[I1];
+            // const auto X = wei_k_y_x_c_lengths[I2];
 
             float perf = static_cast<float>((std::size_t(2) * N * K * Ho * Wo * C * Y * X)) /
                          (std::size_t(1000) * 1000 * 1000) / ave_time;