Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
61054dd
Use thread cluster descriptor and explicit M_K 2d descriptor to simpl…
qianfengz Mar 4, 2022
106951c
Change by replacing ReduceDims by NumReduceDims as Device Reduce inte…
qianfengz Mar 6, 2022
896e2af
Rename the folder name for the pool2d and reduce examples
qianfengz Mar 7, 2022
7fea393
Update to reduction test scripts
qianfengz Mar 7, 2022
e27fc75
Add Readme for pool2d_fwd and reduce_blockwise examples
qianfengz Mar 7, 2022
6b91757
Add support for int8_t reduction (ADD/AVG, MIN/MAX/AMAX)
qianfengz Mar 7, 2022
a2fbd87
Tiny fix in reduce profiler and tiny update in reduce testing scripts
qianfengz Mar 7, 2022
0e197b8
Merge branch 'pr82-followup' into ck_reduce_int8_bp16
qianfengz Mar 7, 2022
5881bf8
Tiny fix in testing script profile_reduce_no_index.sh
qianfengz Mar 8, 2022
5357c36
Tiny fix in testing script profile_reduce_no_index.sh
qianfengz Mar 8, 2022
7398cef
Merge branch 'develop' into pr82-followup
qianfengz Mar 9, 2022
d2ec785
Add support for bfp16 reduction (using bhalf_t = ushort)
qianfengz Mar 9, 2022
55ff757
Tiny fix in amd_buffer_addressing.hpp
qianfengz Mar 9, 2022
ab45ae0
Tiny change in script/profile_reduce_with_index.sh
qianfengz Mar 9, 2022
f95b23c
Use AccDataType for Beta value and use element_wise::PassThrough
qianfengz Mar 10, 2022
1600461
Use type_convert for type converting in host layer reduction
qianfengz Mar 10, 2022
9327afb
Renaming and refining in Reduction profiler/device layer/examples
qianfengz Mar 10, 2022
a29ccd5
Renaming and refining in Reduction profiler/device layer/examples
qianfengz Mar 10, 2022
c6e55e8
Renaming all NumReduceDims to NumReduceDim
qianfengz Mar 10, 2022
c5d051d
Fix the leaked type_convert in ThreadwiseTensorSliceTransfer_v2
qianfengz Mar 10, 2022
5801348
Update to testing scripts to add bf16 support
qianfengz Mar 10, 2022
aec51ed
Align the files for int8/bfloat16 with the re-organized directory tree
qianfengz Mar 10, 2022
5fd206d
Merge branch 'develop' into ck_reduce_int8_bp16
qianfengz Mar 11, 2022
50fc7dd
added more static_assert
j4yan Mar 11, 2022
6a0afa5
Remove buggy tunable configurations defined in device_reduce_instance…
qianfengz Mar 11, 2022
48a931d
Add static_assert to give compile-time warning for incorrect thread s…
qianfengz Mar 11, 2022
43c8b6d
minor change
j4yan Mar 11, 2022
60a65c1
Refine and fix (in GetWorkspaceSizeInBytes of MultiBlockPartialReduce…
qianfengz Mar 13, 2022
f15e568
Tiny renaming in gridwise_2d_reduction_multiblock_partial_reduce.hpp
qianfengz Mar 13, 2022
ad71fa5
Tiny fix in script/profile_reduce_no_index.sh
qianfengz Mar 13, 2022
fd72e6e
Refine in DeviceReduce layer with regard to using NumInvariantDim/Num…
qianfengz Mar 13, 2022
1763be6
Generic renaming in host reduction and DeviceReduce layer
qianfengz Mar 13, 2022
959dc4c
Add support for 4-d all dimension reduction in the profiler and add_d…
qianfengz Mar 13, 2022
d86e66f
Merge branch 'develop' into ck_reduce_int8_bp16
qianfengz Mar 13, 2022
47e41fb
Merge branch 'add_more_static_assert_to_threadwise_copy' into ck_redu…
qianfengz Mar 15, 2022
bccfe3b
Use multi-thread and simplification for host Reduction implementation
qianfengz Mar 19, 2022
f41118e
Add ctest for reduction
qianfengz Mar 21, 2022
12647ee
Update to clarify the using of data init method in produce_reduce/exa…
qianfengz Mar 21, 2022
987668c
Update to the reduce CTest executables to enable default testing beha…
qianfengz Mar 21, 2022
3306ac4
Merge branch 'develop' into ck_reduce_int8_bp16
qianfengz Mar 22, 2022
47504ea
Renaming
qianfengz Mar 22, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion example/12_reduce/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ cmake \
```bash
# -D <xxx> : input 4-d tensor lengths
# -v <x> : verification (0=no, 1=yes)
#arg1: initialization (0=no init, 1=integer value, 2=decimal value)
#arg1: initialization (0=no init, 1=single integer value, 2=integer values within a range, 3=decimal value)
#arg2: run kernel # of times (>1)
./bin/reduce_blockwise -D 16,64,32,960 -v 1 1 10
```
Expand Down
55 changes: 35 additions & 20 deletions example/12_reduce/reduce_blockwise.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,21 +13,21 @@
#include "device_base.hpp"
#include "device_reduce_blockwise.hpp"
#include "host_reduce_util.hpp"
#include "host_generic_reduction.hpp"
#include "host_reduction.hpp"

#include "reduction_enums.hpp"
#include "reduction_operator_mapping.hpp"

using namespace ck;
using namespace ck::tensor_operation::device;

using InDataType = half_float::half;
using OutDataType = half_float::half;
using InDataType = ck::half_t;
using OutDataType = ck::half_t;
using AccDataType = float;

using kInDataType = ck::half_t;
using kOutDataType = ck::half_t;
using kAccDataType = float;
using HostInDataType = half_float::half;
using HostOutDataType = half_float::half;
using HostAccDataType = float;

constexpr int Rank = 4;
constexpr int NumReduceDim = 3;
Expand All @@ -43,9 +43,9 @@ using InElementwiseOperation =
using AccElementwiseOperation =
typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::AccElementwiseOperation;

using DeviceReduceInstance = DeviceReduceBlockWise<kInDataType,
kAccDataType,
kOutDataType,
using DeviceReduceInstance = DeviceReduceBlockWise<InDataType,
AccDataType,
OutDataType,
Rank,
NumReduceDim,
ReduceOperation,
Expand Down Expand Up @@ -135,6 +135,10 @@ class SimpleAppArgs
std::cout << "--verify or -v, 1/0 to indicate whether to verify the reduction result by "
"comparing with the host-based reduction"
<< std::endl;
std::cout << "Arg1 -- init method (0=no init, 1=single integer value, 2=scope integer "
"value, 3=decimal value)"
<< std::endl;
std::cout << "Arg2 -- number of repeats to run the kernel" << std::endl;
};

int processArgs(int argc, char* argv[])
Expand Down Expand Up @@ -263,20 +267,21 @@ int main(int argc, char* argv[])
{
switch(args.init_method)
{
case 0:
in.GenerateTensorValue(GeneratorTensor_1<InDataType>{}, num_thread);
case 0: break;
case 1:
in.GenerateTensorValue(GeneratorTensor_1<InDataType>{1}, num_thread);
if(beta != 0.0f)
out_ref.GenerateTensorValue(GeneratorTensor_1<InDataType>{}, num_thread);
out_ref.GenerateTensorValue(GeneratorTensor_1<InDataType>{1}, num_thread);
break;
case 1:
case 2:
in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}, num_thread);
if(beta != 0.0f)
out_ref.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}, num_thread);
break;
default:
in.GenerateTensorValue(GeneratorTensor_2<InDataType>{1, 5}, num_thread);
in.GenerateTensorValue(GeneratorTensor_3<InDataType>{-5.0, 5.0}, num_thread);
if(beta != 0.0f)
out_ref.GenerateTensorValue(GeneratorTensor_2<InDataType>{1, 5}, num_thread);
out_ref.GenerateTensorValue(GeneratorTensor_3<InDataType>{-5.0, 5.0}, num_thread);
}

if(beta != 0.0f)
Expand All @@ -293,17 +298,27 @@ int main(int argc, char* argv[])
if(beta != 0.0f)
out_dev.ToDevice(out.mData.data());

size_t indicesSizeInBytes = NeedIndices ? out.mDesc.GetElementSize() * sizeof(int) : 0;
size_t indicesSizeInBytes = NeedIndices ? out.mDesc.GetElementSize() * sizeof(int32_t) : 0;

DeviceMem out_indices_dev(indicesSizeInBytes);

if(args.do_verification)
{
ReductionHost<InDataType, AccDataType, OutDataType, ReduceOpId, PropagateNan, NeedIndices>
ReductionHost<HostInDataType,
HostAccDataType,
HostOutDataType,
ReduceOpId,
Rank,
NumReduceDim,
PropagateNan,
NeedIndices>
hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);

hostReduce.Run(
alpha, in.mData.data(), beta, out_ref.mData.data(), out_indices_ref.mData.data());
hostReduce.Run(alpha,
reinterpret_cast<const HostInDataType*>(in.mData.data()),
beta,
reinterpret_cast<HostOutDataType*>(out_ref.mData.data()),
out_indices_ref.mData.data());
};

const auto i_inLengths = to_int_vector(args.inLengths);
Expand All @@ -313,7 +328,7 @@ int main(int argc, char* argv[])

auto reduce = DeviceReduceInstance{};

auto wsSizeInBytes = reduce.GetWorkspaceSizeInBytes(i_inLengths);
auto wsSizeInBytes = reduce.GetWorkspaceSizeInBytes(i_inLengths, reduceDims);

DeviceMem ws_dev(wsSizeInBytes);

Expand Down
2 changes: 1 addition & 1 deletion example/13_pool2d_fwd/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ cmake \
## Run ```pool2d_fwd```
```bash
#arg1: verification (0=no, 1=yes)
#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
#arg2: initialization (0=no init, 1=single integer value, 2=integer values within a range, 3=decimal value)
#arg3: run kernel # of times (>1)
#arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, RightPx
./example/pool2d_fwd 1 1 10
Expand Down
5 changes: 3 additions & 2 deletions example/13_pool2d_fwd/pool2d_fwd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -236,8 +236,9 @@ int main(int argc, char* argv[])
switch(init_method)
{
case 0: break;
case 1: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}); break;
default: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
case 1: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_1<InDataType>{1}); break;
case 2: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}); break;
default: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{-5.0, 5.0});
}

DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpace());
Expand Down
18 changes: 10 additions & 8 deletions include/ck/tensor_operation/gpu/device/device_reduce.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,11 @@ namespace device {
template <typename InElementwiseOperation, typename AccElementwiseOperation>
struct DeviceReduce : public BaseOperator
{
virtual size_t GetWorkspaceSizeInBytes(const std::vector<int>& inLengths)
virtual long_index_t GetWorkspaceSizeInBytes(const std::vector<int> inLengths,
const std::vector<int> reduceDims)
{
(void)inLengths;
(void)reduceDims;

return (0);
};
Expand All @@ -32,19 +34,19 @@ struct DeviceReduce : public BaseOperator
};

virtual std::unique_ptr<BaseArgument>
MakeArgumentPointer(const std::vector<int>& inLengths,
const std::vector<int>& inStrides,
const std::vector<int>& outLengths,
const std::vector<int>& outStrides,
const std::vector<int>& reduceDims,
MakeArgumentPointer(const std::vector<int> inLengths,
const std::vector<int> inStrides,
const std::vector<int> outLengths,
const std::vector<int> outStrides,
const std::vector<int> reduceDims,
float alpha,
float beta,
const void* in_dev,
void* out_dev,
void* out_indices_dev,
void* workspace_dev,
const InElementwiseOperation& in_elementwise_op,
const AccElementwiseOperation& acc_elementwise_op) = 0;
const InElementwiseOperation in_elementwise_op,
const AccElementwiseOperation acc_elementwise_op) = 0;

virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};
Expand Down
Loading