-
Notifications
You must be signed in to change notification settings - Fork 300
NHWC conv 2d: fwd bfp16/int8, Device level tuning and host API #73
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from 7 commits
Commits
Show all changes
18 commits
Select commit
Hold shift + click to select a range
4297ee1
add fwd bf16 conv
ltqin 7a11971
change tunning parametor
ltqin 3b6d956
add int8 for conv fwd
ltqin 7c0da10
remove comments
ltqin 8438d93
change tunning parametor for int8
ltqin 4fb8be7
change init int8 example
ltqin 01f13b7
add test for conv2d fwd
ltqin d49e71c
Merge branch 'develop' into ck_conv_fwd_bf16
ltqin 525cb46
change device operation file pos because merge develop
ltqin feda990
fwd int8 use reference
ltqin 2647dd6
test_conv_fwd use reference
ltqin 430d5c2
add braket for if statement
ltqin a6eca54
Merge branch 'develop' into ck_conv_fwd_bf16
ltqin d9a1f3d
rename fwd example name
ltqin 47fb3d8
Merge remote-tracking branch 'origin/develop' into ck_conv_fwd_bf16
206d5e5
remove StaticBufferOfVectorTypeV2
dfb8024
Merge remote-tracking branch 'origin/develop' into ck_conv_fwd_bf16
6af131f
tweak example
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
109 changes: 109 additions & 0 deletions
109
device_operation/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp
Large diffs are not rendered by default.
Oops, something went wrong.
109 changes: 109 additions & 0 deletions
109
device_operation/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,57 @@ | ||
| # Instructions for the ```conv2d_fwd_xdl_int8``` Example | ||
|
|
||
| ## Docker script | ||
| ```bash | ||
| docker run \ | ||
| -it \ | ||
| --rm \ | ||
| --privileged \ | ||
| --group-add sudo \ | ||
| -w /root/workspace \ | ||
| -v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \ | ||
| rocm/tensorflow:rocm4.3.1-tf2.6-dev \ | ||
| /bin/bash | ||
| ``` | ||
|
|
||
| ## Build ```conv2d_fwd_xdl_int8``` | ||
| ```bash | ||
| mkdir build && cd build | ||
| ``` | ||
|
|
||
| ```bash | ||
| # Need to specify target ID, example below is gfx908 | ||
| cmake \ | ||
| -D BUILD_DEV=OFF \ | ||
| -D CMAKE_BUILD_TYPE=Release \ | ||
| -D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \ | ||
| -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ | ||
| -D CMAKE_PREFIX_PATH=/opt/rocm \ | ||
| .. | ||
| ``` | ||
|
|
||
| ```bash | ||
| make -j conv2d_fwd_xdl_int8 | ||
| ``` | ||
|
|
||
| ## Run ```conv2d_fwd_xdl_int8``` | ||
| ```bash | ||
| #arg1: verification (0=no, 1=yes) | ||
| #arg2: initialization (0=no init, 1=integer value, 2=decimal value) | ||
| #arg3: run kernel # of times (>1) | ||
| #arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx | ||
| ./example/conv2d_fwd_xdl_int8 0 1 5 | ||
| ``` | ||
|
|
||
| Result (MI100 @ 1087 MHz, 133.5 TFlops peak FP16) | ||
| ``` | ||
| in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192} | ||
| wei_k_c_y_x: dim 4, lengths {256, 192, 3, 3}, strides {1728, 1, 576, 192} | ||
| out_n_k_ho_wo: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256} | ||
| arg.a_grid_desc_k0_m_k1_{216, 165888, 8} | ||
| arg.b_grid_desc_k0_n_k1_{216, 256, 8} | ||
| arg.c_grid_desc_m_n_{ 165888, 256} | ||
| launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1} | ||
| Warm up | ||
| Start running 5 times... | ||
| Perf: 1.43206 ms, 102.486 TFlops, 232.947 GB/s | ||
| ``` |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,280 @@ | ||
| #include <iostream> | ||
| #include <numeric> | ||
| #include <initializer_list> | ||
| #include <cstdlib> | ||
| #include <stdlib.h> | ||
| #include <half.hpp> | ||
| #include "config.hpp" | ||
| #include "print.hpp" | ||
| #include "device.hpp" | ||
| #include "host_tensor.hpp" | ||
| #include "host_tensor_generator.hpp" | ||
| #include "device_tensor.hpp" | ||
| #include "tensor_layout.hpp" | ||
| #include "device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp" | ||
| #include "element_wise_operation.hpp" | ||
|
|
||
| using InDataType = int8_t; /* element type of the input tensor built in main() */ | ||
| using WeiDataType = int8_t; /* element type of the weight tensor built in main() */ | ||
| using OutDataType = int8_t; /* element type of the host and device output tensors */ | ||
| using AccDataType = int32_t; /* presumably the accumulation type; not referenced in the visible code -- TODO confirm */ | ||
|
|
||
| template <ck::index_t... Is> | ||
| using S = ck::Sequence<Is...>; /* shorthand for ck::Sequence, used in the instance definition below */ | ||
|
|
||
| using InLayout = ck::tensor_layout::convolution::NHWC; /* layout tag for the input; main() uses it to pick host strides */ | ||
| using WeiLayout = ck::tensor_layout::convolution::KYXC; /* layout tag for the weights */ | ||
| using OutLayout = ck::tensor_layout::convolution::NHWK; /* layout tag for the output */ | ||
|
|
||
| using InElementOp = ck::tensor_operation::element_wise::PassThrough; /* element-wise op passed for the input */ | ||
| using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; /* element-wise op passed for the weights */ | ||
| using OutElementOp = ck::tensor_operation::element_wise::PassThrough; /* element-wise op passed for the output */ | ||
|
|
||
| using PassThrough = ck::tensor_operation::element_wise::PassThrough; /* same type as the three aliases above; used in the instance definition */ | ||
|
|
||
| static constexpr auto ConvFwdDefault = | ||
| ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default; /* specialization selector passed to the instance below */ | ||
|
|
||
| // clang-format off | ||
| using DeviceConvFwdInstance = ck::tensor_operation::device:: /* concrete device convolution type instantiated in main() */ | ||
| DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 128, 128, 128, 4, 16, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>; /* NOTE(review): the integer and S<...> arguments are presumably tile/tuning parameters; their meaning is defined by the class template, which is not visible here */ | ||
| // clang-format on | ||
|
|
||
| /* Host-side reference for the 2-D forward convolution: for every output element it sums in * wei products and stores the result in `out`. */ template <typename TIn, | ||
| typename TWei, | ||
| typename TOut, | ||
| typename InElementOp, | ||
| typename WeiElementOp, | ||
| typename OutElementOp> | ||
| void host_verify(const Tensor<TIn>& in, /* input, indexed as (n, c, hi, wi) */ | ||
| const Tensor<TWei>& wei, /* weights, indexed as (k, c, y, x) */ | ||
| Tensor<TOut>& out, /* output, indexed as (n, k, ho, wo); written in place */ | ||
| const std::vector<ck::index_t>& conv_strides, /* [0] scales ho, [1] scales wo */ | ||
| const std::vector<ck::index_t>& conv_dilations, /* [0] scales y, [1] scales x */ | ||
| const std::vector<ck::index_t>& in_left_pads, /* [0]/[1] are subtracted from the row/column index */ | ||
| const std::vector<ck::index_t>&, /* unnamed and unused; main() passes the right pads here */ | ||
| const InElementOp& in_element_op, /* applied to each input value after widening to int32_t */ | ||
| const WeiElementOp& wei_element_op, /* applied to each weight value after widening to int32_t */ | ||
| const OutElementOp& out_element_op) /* called as out_element_op(v2, v) with the accumulated sum v */ | ||
| { | ||
| auto f_nchw = [&](auto n, auto k, auto ho, auto wo) { /* computes one output element */ | ||
| int32_t v = 0; /* 32-bit accumulator for the products */ | ||
| for(int c = 0; c < wei.mDesc.GetLengths()[1]; ++c) /* weight length [1]: channels */ | ||
| { | ||
| for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y) /* weight length [2]: filter rows */ | ||
| { | ||
| int hi = ho * conv_strides[0] + y * conv_dilations[0] - in_left_pads[0]; /* input row read by this filter tap */ | ||
| for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x) /* weight length [3]: filter columns */ | ||
| { | ||
| int wi = wo * conv_strides[1] + x * conv_dilations[1] - in_left_pads[1]; /* input column read by this filter tap */ | ||
| if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 && /* taps outside the input extent contribute nothing */ | ||
| wi < in.mDesc.GetLengths()[3]) | ||
| { | ||
| v += in_element_op(static_cast<const int32_t>(in(n, c, hi, wi))) * | ||
| wei_element_op(static_cast<const int32_t>(wei(k, c, y, x))); | ||
| } | ||
| } | ||
| } | ||
| } | ||
| int32_t v2 = out(n, k, ho, wo); /* current output value, handed to out_element_op together with the sum */ | ||
|
|
||
| out_element_op(v2, v); /* presumably writes the result into v2 -- operator definition not visible here */ | ||
|
|
||
| out(n, k, ho, wo) = v2; /* stored back; converted to the tensor's element type on assignment */ | ||
| }; | ||
|
|
||
| make_ParallelTensorFunctor(f_nchw, /* runs f_nchw over the four output lengths */ | ||
| out.mDesc.GetLengths()[0], | ||
| out.mDesc.GetLengths()[1], | ||
| out.mDesc.GetLengths()[2], | ||
| out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency()); /* argument is presumably the worker-thread count -- helper not visible here */ | ||
| } | ||
|
|
||
| int main(int argc, char* argv[]) /* int8 NHWC conv2d forward example: parse arguments, build host tensors, run the device kernel, print timing, optionally verify against host_verify() */ | ||
| { | ||
| bool do_verification = 0; | ||
| int init_method = 0; | ||
| int nrepeat = 5; | ||
|
|
||
| // Conv shape | ||
| ck::index_t N = 128; | ||
| ck::index_t K = 256; | ||
| ck::index_t C = 192; | ||
| ck::index_t Y = 3; | ||
| ck::index_t X = 3; | ||
| ck::index_t Hi = 71; | ||
| ck::index_t Wi = 71; | ||
| ck::index_t conv_stride_h = 2; | ||
| ck::index_t conv_stride_w = 2; | ||
| ck::index_t conv_dilation_h = 1; | ||
| ck::index_t conv_dilation_w = 1; | ||
| ck::index_t in_left_pad_h = 1; | ||
| ck::index_t in_left_pad_w = 1; | ||
| ck::index_t in_right_pad_h = 1; | ||
| ck::index_t in_right_pad_w = 1; | ||
|
|
||
| if(argc == 4) /* three control arguments only; the shape keeps the defaults above */ | ||
| { | ||
| do_verification = std::stoi(argv[1]); | ||
| init_method = std::stoi(argv[2]); | ||
| nrepeat = std::stoi(argv[3]); | ||
| } | ||
| else if(argc == 19) /* control arguments followed by the full 15-value convolution shape */ | ||
| { | ||
| do_verification = std::stoi(argv[1]); | ||
| init_method = std::stoi(argv[2]); | ||
| nrepeat = std::stoi(argv[3]); | ||
|
|
||
| N = std::stoi(argv[4]); | ||
| K = std::stoi(argv[5]); | ||
| C = std::stoi(argv[6]); | ||
| Y = std::stoi(argv[7]); | ||
| X = std::stoi(argv[8]); | ||
| Hi = std::stoi(argv[9]); | ||
| Wi = std::stoi(argv[10]); | ||
| conv_stride_h = std::stoi(argv[11]); | ||
| conv_stride_w = std::stoi(argv[12]); | ||
| conv_dilation_h = std::stoi(argv[13]); | ||
| conv_dilation_w = std::stoi(argv[14]); | ||
| in_left_pad_h = std::stoi(argv[15]); | ||
| in_left_pad_w = std::stoi(argv[16]); | ||
| in_right_pad_h = std::stoi(argv[17]); | ||
| in_right_pad_w = std::stoi(argv[18]); | ||
| } | ||
| else /* any other argument count: print usage and stop */ | ||
| { | ||
| printf("arg1: verification (0=no, 1=yes)\n"); | ||
| printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); | ||
| printf("arg3: run kernel # of times (>1)\n"); | ||
| printf("arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, " | ||
| "RightPx\n"); | ||
| exit(0); /* NOTE(review): exits with status 0 even though the arguments were not accepted */ | ||
| } | ||
|
|
||
| const ck::index_t YEff = (Y - 1) * conv_dilation_h + 1; /* filter height after dilation */ | ||
| const ck::index_t XEff = (X - 1) * conv_dilation_w + 1; /* filter width after dilation */ | ||
|
|
||
| const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1; /* output height */ | ||
| const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1; /* output width */ | ||
|
|
||
| const std::vector<ck::index_t> conv_filter_strides{{conv_stride_h, conv_stride_w}}; | ||
| const std::vector<ck::index_t> conv_filter_dilations{{conv_dilation_h, conv_dilation_w}}; | ||
| const std::vector<ck::index_t> input_left_pads{{in_left_pad_h, in_left_pad_w}}; | ||
| const std::vector<ck::index_t> input_right_pads{{in_right_pad_h, in_right_pad_w}}; | ||
|
|
||
| // tensor layout | ||
| auto f_host_tensor_descriptor = [](std::size_t N_, /* lengths are always given in (N, C, H, W) order; the layout tag only selects the strides */ | ||
| std::size_t C_, | ||
| std::size_t H, | ||
| std::size_t W, | ||
| auto layout) { | ||
| if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value || | ||
| ck::is_same<decltype(layout), ck::tensor_layout::convolution::KCYX>::value || | ||
| ck::is_same<decltype(layout), ck::tensor_layout::convolution::NKHW>::value) | ||
| { | ||
| return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}), | ||
| std::vector<std::size_t>({C_ * H * W, H * W, W, 1})); /* last dimension (W) has stride 1 */ | ||
| } | ||
| else if constexpr(ck::is_same<decltype(layout), | ||
| ck::tensor_layout::convolution::NHWC>::value || | ||
| ck::is_same<decltype(layout), | ||
| ck::tensor_layout::convolution::KYXC>::value || | ||
| ck::is_same<decltype(layout), | ||
| ck::tensor_layout::convolution::NHWK>::value) | ||
| { | ||
| return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}), | ||
| std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_})); /* second dimension (C) has stride 1 */ | ||
| } | ||
| }; | ||
|
|
||
| Tensor<InDataType> in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi, InLayout{})); | ||
| Tensor<WeiDataType> wei_k_c_y_x(f_host_tensor_descriptor(K, C, Y, X, WeiLayout{})); | ||
| Tensor<OutDataType> out_n_k_ho_wo_host_result( | ||
| f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{})); | ||
| Tensor<OutDataType> out_n_k_ho_wo_device_result( | ||
| f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{})); | ||
|
|
||
| std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl; | ||
| std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl; | ||
| std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo_host_result.mDesc << std::endl; | ||
|
|
||
| switch(init_method) /* the usage text describes 0 as no init, 1 as integer values, 2 as decimal values */ | ||
| { | ||
| case 0: break; | ||
| case 1: | ||
| in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-1, 1}); | ||
| wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-1, 1}); | ||
| break; | ||
| default: /* every other value, not only 2, takes this path */ | ||
| in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{0, 1}); | ||
| wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-1, 1}); | ||
| } | ||
|
|
||
| DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpace()); | ||
| DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_k_c_y_x.mDesc.GetElementSpace()); | ||
| DeviceMem out_device_buf(sizeof(OutDataType) * | ||
| out_n_k_ho_wo_device_result.mDesc.GetElementSpace()); | ||
|
|
||
| in_device_buf.ToDevice(in_n_c_hi_wi.mData.data()); | ||
| wei_device_buf.ToDevice(wei_k_c_y_x.mData.data()); | ||
|
|
||
| // do GEMM | ||
| auto conv = DeviceConvFwdInstance{}; | ||
| auto invoker = conv.MakeInvoker(); | ||
| auto argument = conv.MakeArgument(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()), | ||
| static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()), | ||
| static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()), | ||
| N, | ||
| K, | ||
| C, | ||
| std::vector<ck::index_t>{{Hi, Wi}}, | ||
| std::vector<ck::index_t>{{Y, X}}, | ||
| std::vector<ck::index_t>{{Ho, Wo}}, | ||
| conv_filter_strides, | ||
| conv_filter_dilations, | ||
| input_left_pads, | ||
| input_right_pads, | ||
| InElementOp{}, | ||
| WeiElementOp{}, | ||
| OutElementOp{}); | ||
|
|
||
| if(!conv.IsSupportedArgument(argument)) /* refuse to launch when the instance rejects this problem */ | ||
| { | ||
| throw std::runtime_error( | ||
| "wrong! device_conv with the specified compilation parameters does " | ||
| "not support this Conv problem"); | ||
| } | ||
|
|
||
| float ave_time = invoker.Run(argument, nrepeat); /* printed below with the unit "ms" */ | ||
|
|
||
| std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X; /* two operations per multiply-accumulate term */ | ||
|
|
||
| std::size_t num_btype = sizeof(InDataType) * static_cast<std::size_t>(N) * C * Hi * Wi + /* widen before multiplying so the element counts are not computed in ck::index_t, which could overflow for large shapes */ | ||
| sizeof(WeiDataType) * static_cast<std::size_t>(K) * C * Y * X + | ||
| sizeof(OutDataType) * static_cast<std::size_t>(N) * K * Ho * Wo; | ||
|
|
||
| float tflops = static_cast<float>(flop) / 1.E9 / ave_time; /* operations / 1e9 / milliseconds */ | ||
|
|
||
| float gb_per_sec = num_btype / 1.E6 / ave_time; /* bytes / 1e6 / milliseconds */ | ||
|
|
||
| std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s" | ||
| << std::endl; | ||
|
|
||
| if(do_verification) /* compare the device output with the host reference */ | ||
| { | ||
| host_verify(in_n_c_hi_wi, | ||
| wei_k_c_y_x, | ||
| out_n_k_ho_wo_host_result, | ||
| conv_filter_strides, | ||
| conv_filter_dilations, | ||
| input_left_pads, | ||
| input_right_pads, | ||
| InElementOp{}, | ||
| WeiElementOp{}, | ||
| OutElementOp{}); | ||
|
|
||
| out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data()); /* copy the kernel output back to the host tensor */ | ||
|
|
||
| check_error(out_n_k_ho_wo_host_result, out_n_k_ho_wo_device_result); | ||
| } | ||
| } | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.