Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
463c347
Tiny fix in dynamic_buffer.hpp to support vectorized AtomicAdd for do…
qianfengz May 16, 2022
ebc2afb
Update to host layer and host reduction
qianfengz May 16, 2022
3fb2acd
Merge and remove reduction kernels
qianfengz May 16, 2022
d841314
Merge and remove reduction device interfaces and update pooling devic…
qianfengz May 16, 2022
d7baf1a
Merge and remove useless reduction device instances
qianfengz May 16, 2022
6c0f5de
Update to reduction profiler and reduction ctests
qianfengz May 16, 2022
329e4d4
Update to reduction and pooling examples and add one reduction example
qianfengz May 16, 2022
f094490
Change to reduction examples to let them testable by ctest
qianfengz May 17, 2022
e41a98a
Add explicit pass checking for reduction and pooling examples
qianfengz May 18, 2022
1d1435b
Explicit assignment of tensor shapes in example reduce_blockwise_two_…
qianfengz May 19, 2022
41673f8
Use atomic_add to replace atomicAdd and add atomic_add for double type
qianfengz May 19, 2022
033e2a6
Add reduce ctest support for double data type
qianfengz May 19, 2022
6095712
Replace to_int_vector() by using c++ std::vector::assign()
qianfengz May 19, 2022
d5af70d
Keep DeviceReduceThreadWise separated from DeviceReduceBlockWise
qianfengz May 20, 2022
11a087e
Merge DeviceReduceBlockWise and DeviceReduceMultiBlockAtomicAdd into …
qianfengz May 21, 2022
75e3ef6
Merge branch 'develop' into reduce_overhaul_pr
qianfengz May 21, 2022
f5fb1d6
Add GetAtomicOperationZeroValue() support for AtomicMax
qianfengz May 22, 2022
7956bed
Add example for computing LayerNorm mean and meansquare
May 22, 2022
d70bef0
Refactor the pool2d_fwd example and add example for float type testing
qianfengz May 23, 2022
b52b65a
Tiny change to reduce example README.md
qianfengz May 23, 2022
bda9440
Merge branch 'reduce_overhaul_pr' into lnorm_example
qianfengz May 23, 2022
ac90c36
Merge branch 'develop' into lnorm_example
qianfengz May 25, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion example/13_pool2d_fwd/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
add_example_executable(example_pool2d_fwd pool2d_fwd.cpp)
add_example_executable(example_pool2d_fwd_fp16 pool2d_fwd_fp16.cpp)
add_example_executable(example_pool2d_fwd_fp32 pool2d_fwd_fp32.cpp)

27 changes: 24 additions & 3 deletions example/13_pool2d_fwd/README.md
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
# Instructions for ```example_pool2d_fwd``` Example
# Instructions for ```example_pool2d_fwd``` Examples

## Run ```example_pool2d_fwd```
## Run ```example_pool2d_fwd_fp16```
```bash
#arg1: verification (0=no, 1=yes)
#arg2: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
#arg3: time kernel (0=no, 1=yes)
#arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, RightPx
./bin/example_pool2d_fwd 1 1 1
./bin/example_pool2d_fwd_fp16 1 1 1
```

Result
Expand All @@ -18,3 +18,24 @@ Warm up 1 time
Start running 10 times...
Perf: 0.397436 ms, 1.44252 TFlops, 783.713 GB/s
```

## Run ```example_pool2d_fwd_fp32```
```bash
#arg1: verification (0=no, 1=yes)
#arg2: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
#arg3: time kernel (0=no, 1=yes)
#arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, RightPx
./bin/example_pool2d_fwd_fp32 1 1 1
```


Result
```
./bin/example_pool2d_fwd_fp32 1 1 1
in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192}
out_n_c_ho_wo: dim 4, lengths {128, 192, 36, 36}, strides {248832, 1, 6912, 192}
launch_and_time_kernel: grid_dim {124416, 1, 1}, block_dim {64, 1, 1}
Warm up 1 time
Start running 10 times...
Perf: 1.01823 ms, 0.563045 TFlops, 611.8 GB/s
```
Original file line number Diff line number Diff line change
@@ -1,8 +1,4 @@
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>

#include "check_err.hpp"
#include "config.hpp"
Expand All @@ -13,44 +9,13 @@
#include "host_reduce_util.hpp"
#include "device_tensor.hpp"
#include "tensor_layout.hpp"
#include "reduction_operator.hpp"
#include "reduction_enums.hpp"
#include "device_pool2d_fwd_nhwc_nhwc.hpp"

using InDataType = ck::half_t;
using OutDataType = ck::half_t;
using AccDataType = float;

using IndexDataType = int32_t;

using InLayout = ck::tensor_layout::convolution::NHWC;
using OutLayout = ck::tensor_layout::convolution::NHWC;

#if 1
static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
#else
static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
#endif

static constexpr bool OutputIndex = false;
static constexpr bool PropagateNan = false;

using DevicePoolFwdInstance =
ck::tensor_operation::device::DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C<
InDataType, // InDataType
OutDataType, // OutDataType
AccDataType, // AccDataType
ReduceOpId,
OutputIndex,
64, // BlockSize
64, // ReduceMThreadClusterSize
1, // ReduceKThreadClusterSize
4, // ReduceMThreadSliceSize
1, // ReduceKThreadSliceSize
4>; // InSrcOutDstVectorSize

template <typename InDataType,
typename OutDataType,
typename AccDataType,
typename IndexDataType,
ck::ReduceTensorOp ReduceOpId,
bool PropagateNan,
bool OutputIndex>
Expand Down Expand Up @@ -147,68 +112,46 @@ static void pool_host_verify(const Tensor<InDataType>& in,
};
}

int main(int argc, char* argv[])
template <typename InDataType,
typename OutDataType,
typename AccDataType,
typename IndexDataType,
typename InLayout,
typename OutLayout,
ck::ReduceTensorOp ReduceOpId,
bool PropagateNan,
bool OutputIndex>
bool pool_test(bool do_verification,
int init_method,
bool time_kernel,
ck::index_t N,
ck::index_t C,
ck::index_t Y,
ck::index_t X,
ck::index_t Hi,
ck::index_t Wi,
ck::index_t window_stride_h,
ck::index_t window_stride_w,
ck::index_t in_left_pad_h,
ck::index_t in_left_pad_w,
ck::index_t in_right_pad_h,
ck::index_t in_right_pad_w)
{
using namespace ck::host_reduce;

bool do_verification;
int init_method;
bool time_kernel;

// Pool shape
ck::index_t N = 128;
ck::index_t C = 192;
ck::index_t Y = 3;
ck::index_t X = 3;
ck::index_t Hi = 71;
ck::index_t Wi = 71;
ck::index_t window_stride_h = 2;
ck::index_t window_stride_w = 2;
ck::index_t in_left_pad_h = 1;
ck::index_t in_left_pad_w = 1;
ck::index_t in_right_pad_h = 1;
ck::index_t in_right_pad_w = 1;

if(argc == 1)
{
do_verification = true;
init_method = 1;
time_kernel = true;
}
else if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = static_cast<bool>(std::stoi(argv[3]));
}
else if(argc == 16)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = static_cast<bool>(std::stoi(argv[3]));

N = std::stoi(argv[4]);
C = std::stoi(argv[5]);
Y = std::stoi(argv[6]);
X = std::stoi(argv[7]);
Hi = std::stoi(argv[8]);
Wi = std::stoi(argv[9]);
window_stride_h = std::stoi(argv[10]);
window_stride_w = std::stoi(argv[11]);
in_left_pad_h = std::stoi(argv[12]);
in_left_pad_w = std::stoi(argv[13]);
in_right_pad_h = std::stoi(argv[14]);
in_right_pad_w = std::stoi(argv[15]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: time kernel (0=no, 1=yes)\n");
printf("arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
exit(0);
}
using DevicePoolFwdInstance =
ck::tensor_operation::device::DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C<
InDataType, // InDataType
OutDataType, // OutDataType
AccDataType, // AccDataType
ReduceOpId,
OutputIndex,
64, // BlockSize
64, // ReduceMThreadClusterSize
1, // ReduceKThreadClusterSize
4, // ReduceMThreadSliceSize
1, // ReduceKThreadSliceSize
4>; // InSrcOutDstVectorSize

const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - Y) / window_stride_h + 1;
const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - X) / window_stride_w + 1;
Expand Down Expand Up @@ -302,6 +245,7 @@ int main(int argc, char* argv[])
pool_host_verify<InDataType,
OutDataType,
AccDataType,
IndexDataType,
ReduceOpId,
PropagateNan,
OutputIndex>(in_n_c_hi_wi,
Expand All @@ -325,5 +269,5 @@ int main(int argc, char* argv[])
};
}

return (pass ? 0 : 1);
}
return (pass);
};
116 changes: 116 additions & 0 deletions example/13_pool2d_fwd/pool2d_fwd_fp16.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
#include <iostream>
#include <cstdlib>

#include "config.hpp"
#include "tensor_layout.hpp"
#include "reduction_enums.hpp"

#include "pool2d_fwd_common.hpp"

// Compile-time configuration of this example. Every name below is forwarded
// to pool_test<...>() as a template argument from main().

// Element types: ck::half_t for the input and output tensors, float for
// AccDataType (presumably the type used while accumulating -- confirm in
// pool2d_fwd_common.hpp).
using InDataType = ck::half_t;
using OutDataType = ck::half_t;
using AccDataType = float;

// Integer type forwarded as IndexDataType; its exact use is defined by
// pool_test / pool_host_verify, not in this file.
using IndexDataType = int32_t;

// Both the input and the output tensor use the NHWC layout tag.
using InLayout = ck::tensor_layout::convolution::NHWC;
using OutLayout = ck::tensor_layout::convolution::NHWC;

// Manual compile-time switch for the reduction operation: leave `#if 1` to
// select MAX, change it to `#if 0` to select AVG.
#if 1
static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
#else
static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
#endif

// Boolean options forwarded unchanged to pool_test; both are disabled here.
static constexpr bool OutputIndex = false;
static constexpr bool PropagateNan = false;

/// Entry point of the pool2d forward example configured by the file-level
/// aliases (InDataType, OutDataType, AccDataType, ...).
///
/// Accepted command lines:
///   - no arguments:  verification on, init_method 1, timing on, default shape
///   - 3 arguments:   verification, init_method, time_kernel
///   - 15 arguments:  the three above followed by
///                    N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, RightPx
///
/// @return 0 when pool_test() reports success; 1 when it reports failure or
///         when the number of arguments is not one of the accepted forms.
int main(int argc, char* argv[])
{
    // Defaults used when the program is started without arguments.
    bool do_verification = true;
    int init_method      = 1;
    bool time_kernel     = true;

    // Pool shape
    ck::index_t N               = 128;
    ck::index_t C               = 192;
    ck::index_t Y               = 3;
    ck::index_t X               = 3;
    ck::index_t Hi              = 71;
    ck::index_t Wi              = 71;
    ck::index_t window_stride_h = 2;
    ck::index_t window_stride_w = 2;
    ck::index_t in_left_pad_h   = 1;
    ck::index_t in_left_pad_w   = 1;
    ck::index_t in_right_pad_h  = 1;
    ck::index_t in_right_pad_w  = 1;

    if(argc == 4 || argc == 16)
    {
        // The first three arguments have the same meaning in both forms.
        do_verification = static_cast<bool>(std::stoi(argv[1]));
        init_method     = std::stoi(argv[2]);
        time_kernel     = static_cast<bool>(std::stoi(argv[3]));

        if(argc == 16)
        {
            N               = std::stoi(argv[4]);
            C               = std::stoi(argv[5]);
            Y               = std::stoi(argv[6]);
            X               = std::stoi(argv[7]);
            Hi              = std::stoi(argv[8]);
            Wi              = std::stoi(argv[9]);
            window_stride_h = std::stoi(argv[10]);
            window_stride_w = std::stoi(argv[11]);
            in_left_pad_h   = std::stoi(argv[12]);
            in_left_pad_w   = std::stoi(argv[13]);
            in_right_pad_h  = std::stoi(argv[14]);
            in_right_pad_w  = std::stoi(argv[15]);
        }
    }
    else if(argc != 1)
    {
        printf("arg1: verification (0=no, 1=yes)\n");
        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
        printf("arg3: time kernel (0=no, 1=yes)\n");
        printf("arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, "
               "RightPx\n");

        // An unsupported argument count is a usage error: report failure so
        // that a mis-invoked run (e.g. from ctest) is not counted as a pass.
        return 1;
    }

    const bool pass = pool_test<InDataType,
                                OutDataType,
                                AccDataType,
                                IndexDataType,
                                InLayout,
                                OutLayout,
                                ReduceOpId,
                                PropagateNan,
                                OutputIndex>(do_verification,
                                             init_method,
                                             time_kernel,
                                             N,
                                             C,
                                             Y,
                                             X,
                                             Hi,
                                             Wi,
                                             window_stride_h,
                                             window_stride_w,
                                             in_left_pad_h,
                                             in_left_pad_w,
                                             in_right_pad_h,
                                             in_right_pad_w);

    return (pass ? 0 : 1);
}
Loading