Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
463c347
Tiny fix in dynamic_buffer.hpp to support vectorized AtomicAdd for do…
qianfengz May 16, 2022
ebc2afb
Update to host layer and host reduction
qianfengz May 16, 2022
3fb2acd
Merge and remove reduction kernels
qianfengz May 16, 2022
d841314
Merge and remove reduction device interfaces and update pooling devic…
qianfengz May 16, 2022
d7baf1a
Merge and remove useless reduction device instances
qianfengz May 16, 2022
6c0f5de
Update to reduction profiler and reduction ctests
qianfengz May 16, 2022
329e4d4
Update to reduction and pooling examples and add one reduction example
qianfengz May 16, 2022
f094490
Change to reduction examples to let them testable by ctest
qianfengz May 17, 2022
e41a98a
Add explicit pass checking for reduction and pooling examples
qianfengz May 18, 2022
1d1435b
Explicit assignment of tensor shapes in example reduce_blockwise_two_…
qianfengz May 19, 2022
41673f8
Use atomic_add to repace atomicAdd and add atomic_add for double type
qianfengz May 19, 2022
033e2a6
Add reduce ctest support for double data type
qianfengz May 19, 2022
6095712
Replace to_int_vector() by using c++ std::vector::assign()
qianfengz May 19, 2022
d5af70d
Keep DeviceReduceThreadWise separated from DeviceReduceBlockWise
qianfengz May 20, 2022
11a087e
Merge DeviceReduceBlockWise and DeviceReduceMultiBlockAtomicAdd into …
qianfengz May 21, 2022
75e3ef6
Merge branch 'develop' into reduce_overhaul_pr
qianfengz May 21, 2022
f5fb1d6
Add GetAtomicOperationZeroValue() support for AtomicMax
qianfengz May 22, 2022
b52b65a
Tiny change to reduce example README.md
qianfengz May 23, 2022
e0af138
Merge branch 'develop' into reduce_overhaul_pr
qianfengz May 24, 2022
616e0a3
Fix some tiny issues due to branch merging
qianfengz May 24, 2022
a0df2e7
Revoke previous change in dynamic_buffer.hpp and add atomic_add for d…
qianfengz May 24, 2022
8a604a4
Add reduce multiblock_atomic_add instances for fp64 to verify vectori…
qianfengz May 24, 2022
c43158e
Renaming
qianfengz May 24, 2022
a57cec4
Clean the header includings in device_reduce instances header files
qianfengz May 24, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion example/12_reduce/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
add_example_executable(example_reduce_blockwise reduce_blockwise.cpp -D 16,64,32,960 -v 1 1 10)
add_example_executable(example_reduce_blockwise reduce_blockwise.cpp)
Comment thread
rosenrodt marked this conversation as resolved.
add_example_executable(example_reduce_blockwise_two_call reduce_blockwise_two_call.cpp)
41 changes: 28 additions & 13 deletions example/12_reduce/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,23 +5,38 @@
# -D <xxx> : input 4-d tensor lengths
# -v <x> : verification (0=no, 1=yes)
#arg1: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
#arg2: run kernel # of times (>1)
./bin/example_reduce_blockwise -D 16,64,32,960 -v 1 1 10
#arg2: time kernel (0=no, 1=yes)
./bin/example_reduce_blockwise -D 16,64,32,960 -v 1 1 1
```

Result
```
./bin/example_reduce_blockwise -D 16,64,32,960 -v 1 1 1
launch_and_time_kernel: grid_dim {240, 1, 1}, block_dim {256, 1, 1}
Warm up
Start running 3 times...
Perf: 0.23536 ms, 267.32 GB/s, DeviceReduceBlockWise<256,M_C4_S1,K_C64_S1,InSrcVectorDim_0_InSrcVectorSize_1_OutDstVectorSize_1>
error: 0
max_diff: 0, 529, 529
root@dc-smc-18:/data/composable_kernel/Build3# bin/example_reduce_blockwise -D 16,64,32,960 -v 1 1 10
launch_and_time_kernel: grid_dim {240, 1, 1}, block_dim {256, 1, 1}
Warm up
Warm up 1 time
Start running 10 times...
Perf: 0.282592 ms, 222.641 GB/s, DeviceReduceBlockWise<256,M_C4_S1,K_C64_S1,InSrcVectorDim_0_InSrcVectorSize_1_OutDstVectorSize_1>
```

# Instructions for ```example_reduce_blockwise_two_call```

## Run ```example_reduce_blockwise_two_call```
```bash
#arg1: verification (0=no, 1=yes(
#arg2: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
#arg3: time kernel (0=no, 1=yes)
./bin/example_reduce_blockwise_two_call 1 2 1


Result
```
./bin/example_reduce_blockwise_two_call 1 2 1
launch_and_time_kernel: grid_dim {204800, 1, 1}, block_dim {256, 1, 1}
Warm up 1 time
Start running 10 times...
launch_and_time_kernel: grid_dim {6400, 1, 1}, block_dim {256, 1, 1}
Warm up 1 time
Start running 10 times...
Perf: 0.23392 ms, 268.966 GB/s, DeviceReduceBlockWise<256,M_C4_S1,K_C64_S1,InSrcVectorDim_0_InSrcVectorSize_1_OutDstVectorSize_1>
error: 0
max_diff: 0, 528, 528
Perf: 2.1791 ms, 771.42 GB/s, DeviceReduceBlockWise<256,M_C32_S1,K_C8_S1,InSrcVectorDim_1_InSrcVectorSize_1_OutDstVectorSize_1> => DeviceReduceBlockWise<256,M_C256_S1,K_C1_S1,InSrcVectorDim_1_InSrcVectorSize_1_OutDstVectorSize_1>
```

188 changes: 71 additions & 117 deletions example/12_reduce/reduce_blockwise.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
#include "host_tensor_generator.hpp"
#include "device_tensor.hpp"
#include "device_base.hpp"
#include "device_reduce_blockwise.hpp"
#include "host_reduce_util.hpp"
#include "device_reduce_multiblock.hpp"
#include "host_common_util.hpp"
#include "host_reduction.hpp"

#include "reduction_enums.hpp"
Expand All @@ -30,120 +30,78 @@ constexpr int Rank = 4;
constexpr int NumReduceDim = 3;

constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::NORM2;
constexpr NanPropagation NanOpt = NanPropagation::PROPAGATE_NAN;
constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? false : true;
constexpr ReduceTensorIndices IndicesOpt = ReduceTensorIndices::NO_INDICES;
constexpr bool PropagateNan = true;
constexpr bool OutputIndex = false;

using ReduceOperation = typename reduce_binary_operator<AccDataType, ReduceOpId>::opType;
using InElementwiseOperation =
typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation;
using AccElementwiseOperation =
typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::AccElementwiseOperation;

using DeviceReduceInstance = DeviceReduceBlockWise<InDataType,
AccDataType,
OutDataType,
Rank,
NumReduceDim,
ReduceOperation,
InElementwiseOperation,
AccElementwiseOperation,
PropagateNan,
false,
256,
4,
64,
1,
1,
0,
1,
1>;
using DeviceReduceInstance = DeviceReduceMultiBlock<InDataType,
AccDataType,
OutDataType,
Rank,
NumReduceDim,
ReduceOperation,
InElementwiseOperation,
AccElementwiseOperation,
InMemoryDataOperationEnum::Set,
PropagateNan,
OutputIndex,
false, // HaveIndexInputIfOutputIndex
256,
4,
64,
1,
1,
0,
1,
1>;

static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'},
{"scales", required_argument, nullptr, 'S'},
{"verify", required_argument, nullptr, 'v'},
{"help", no_argument, nullptr, '?'},
{nullptr, 0, nullptr, 0}};

class SimpleAppArgs
{
template <typename T>
static T getSingleValueFromString(const std::string& valueStr)
{
std::istringstream iss(valueStr);

T ret;

iss >> ret;

return (ret);
};

template <typename T>
static std::vector<T> getTypeValuesFromString(const char* cstr_values)
{
std::string valuesStr(cstr_values);

std::vector<T> values;
std::size_t pos = 0;
std::size_t new_pos;

new_pos = valuesStr.find(',', pos);
while(new_pos != std::string::npos)
{
const std::string sliceStr = valuesStr.substr(pos, new_pos - pos);

T val = getSingleValueFromString<T>(sliceStr);

values.push_back(val);

pos = new_pos + 1;
new_pos = valuesStr.find(',', pos);
};

std::string sliceStr = valuesStr.substr(pos);
T val = getSingleValueFromString<T>(sliceStr);

values.push_back(val);

return (values);
};

private:
int option_index = 0;

public:
std::vector<size_t> inLengths;
std::vector<float> scales;
std::vector<size_t> inLengths = {16, 64, 32, 960};
std::vector<float> scales = {1.0f, 0.0f};

bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
bool time_kernel = true;

public:
void show_usage(const char* cmd)
{
std::cout << "Usage of " << cmd << std::endl;
std::cout << "--inLengths or -D, comma separated list of input tensor dimension lengths"
<< std::endl;
std::cout << "--scales or -S, comma separated two float values for alpha and beta"
<< std::endl;
std::cout << "--verify or -v, 1/0 to indicate whether to verify the reduction result by "
"comparing with the host-based reduction"
<< std::endl;
std::cout << "Arg1 -- init method (0=no init, 1=single integer value, 2=scope integer "
"value, 3=decimal value)"
<< std::endl;
std::cout << "Arg2 -- time kernel (0=n0, 1=yes)" << std::endl;
std::cout << "Arg2 -- time kernel (0=no, 1=yes)" << std::endl;
};

int processArgs(int argc, char* argv[])
{
using ck::host_common::getTypeValuesFromString;

int ch;

while(1)
{
ch = getopt_long(argc, argv, "D:S:v:l:", long_options, &option_index);
ch = getopt_long(argc, argv, "D:v:l:", long_options, &option_index);
if(ch == -1)
break;
switch(ch)
Expand All @@ -154,12 +112,6 @@ class SimpleAppArgs

inLengths = getTypeValuesFromString<size_t>(optarg);
break;
case 'S':
if(!optarg)
throw std::runtime_error("Invalid option format!");

scales = getTypeValuesFromString<float>(optarg);
break;
case 'v':
if(!optarg)
throw std::runtime_error("Invalid option format!");
Expand All @@ -181,7 +133,7 @@ class SimpleAppArgs
throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!");

init_method = std::atoi(argv[optind++]);
time_kernel = std::atoi(argv[optind]);
time_kernel = static_cast<bool>(std::atoi(argv[optind]));

if(scales.empty())
{
Expand All @@ -202,16 +154,16 @@ int main(int argc, char* argv[])

SimpleAppArgs args;

if(args.processArgs(argc, argv) < 0)
return (-1);
if(argc > 1)
{
if(args.processArgs(argc, argv) < 0)
return (-1);
};

constexpr bool op_support_indices =
(ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
ReduceOpId == ReduceTensorOp::AMAX);

constexpr bool NeedIndices =
(op_support_indices && (IndicesOpt != ReduceTensorIndices::NO_INDICES));

// if input is half type, no reason to use float for indiced reduction operation and must use
// float for non-indiced reduction operation for accuracy
constexpr bool invalid_reduce_1 =
Expand All @@ -225,8 +177,7 @@ int main(int argc, char* argv[])
(op_support_indices && !std::is_same<AccDataType, float>::value);

// indices option can only be used when it is really needed
constexpr bool invalid_reduce_3 =
(!op_support_indices && IndicesOpt != ReduceTensorIndices::NO_INDICES);
constexpr bool invalid_reduce_3 = (!op_support_indices && OutputIndex);

constexpr bool invalid_reduce = (invalid_reduce_1 || invalid_reduce_2 || invalid_reduce_3);

Expand Down Expand Up @@ -294,9 +245,9 @@ int main(int argc, char* argv[])
if(beta != 0.0f)
out_dev.ToDevice(out.mData.data());

size_t indicesSizeInBytes = NeedIndices ? out.mDesc.GetElementSize() * sizeof(int32_t) : 0;
size_t indicesSizeInBytes = OutputIndex ? out.mDesc.GetElementSize() * sizeof(int32_t) : 0;

DeviceMem out_indices_dev(indicesSizeInBytes);
DeviceMem out_index_dev(indicesSizeInBytes);

if(args.do_verification)
{
Expand All @@ -307,38 +258,39 @@ int main(int argc, char* argv[])
Rank,
NumReduceDim,
PropagateNan,
NeedIndices>
OutputIndex>
hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);

hostReduce.Run(
alpha, in.mData.data(), beta, out_ref.mData.data(), out_indices_ref.mData.data());
};

const auto i_inLengths = to_int_vector(args.inLengths);
const auto i_inStrides = to_int_vector(inStrides);
const auto i_outLengths = to_int_vector(outLengths);
const auto i_outStrides = to_int_vector(outStrides);
std::vector<ck::index_t> i_inLengths;
std::vector<ck::index_t> i_inStrides;
std::vector<ck::index_t> i_outLengths;
std::vector<ck::index_t> i_outStrides;

i_inLengths.assign(args.inLengths.begin(), args.inLengths.end());
i_inStrides.assign(inStrides.begin(), inStrides.end());
i_outLengths.assign(outLengths.begin(), outLengths.end());
i_outStrides.assign(outStrides.begin(), outStrides.end());

auto reduce = DeviceReduceInstance{};

auto wsSizeInBytes = reduce.GetWorkspaceSizeInBytes(i_inLengths, reduceDims);

DeviceMem ws_dev(wsSizeInBytes);

auto argument_ptr =
reduce.MakeArgumentPointer(i_inLengths,
i_inStrides,
i_outLengths,
i_outStrides,
reduceDims,
alpha,
beta,
in_dev.GetDeviceBuffer(),
out_dev.GetDeviceBuffer(),
out_indices_dev.GetDeviceBuffer(),
ws_dev.GetDeviceBuffer(),
InElementwiseOperation{static_cast<int>(reduce_total_length)},
AccElementwiseOperation{static_cast<int>(reduce_total_length)});
auto argument_ptr = reduce.MakeArgumentPointer(
i_inLengths,
i_inStrides,
i_outLengths,
i_outStrides,
reduceDims,
alpha,
beta,
in_dev.GetDeviceBuffer(),
nullptr,
out_dev.GetDeviceBuffer(),
out_index_dev.GetDeviceBuffer(),
InElementwiseOperation{static_cast<int32_t>(reduce_total_length)},
AccElementwiseOperation{static_cast<int32_t>(reduce_total_length)});

if(!reduce.IsSupportedArgument(argument_ptr.get()))
{
Expand All @@ -362,16 +314,18 @@ int main(int argc, char* argv[])
<< std::endl;

bool pass = true;

if(args.do_verification)
{
out_dev.FromDevice(out.mData.data());
pass &= ck::utils::check_err(out.mData, out_ref.mData);
pass = pass && ck::utils::check_err(out.mData, out_ref.mData);

if(NeedIndices)
if(OutputIndex)
{
out_indices_dev.FromDevice(out_indices.mData.data());
pass &= ck::utils::check_err(out_indices.mData, out_indices_ref.mData);
out_index_dev.FromDevice(out_indices.mData.data());
pass = pass && ck::utils::check_err(out_indices.mData, out_indices_ref.mData);
};
};
return pass ? 0 : 1;

return (pass ? 0 : 1);
}
Loading