Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 30 additions & 19 deletions example/12_reduce/reduce_blockwise.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,11 @@ constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::NORM2;
constexpr bool PropagateNan = true;
constexpr bool OutputIndex = false;

using ReduceOperation = typename reduce_binary_operator<AccDataType, ReduceOpId>::opType;
using ReduceOperation = typename reduce_binary_operator<ReduceOpId>::opType;
using InElementwiseOperation =
typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation;
typename reduce_unary_operator<ReduceOpId, true, true>::InElementwiseOperation;
using AccElementwiseOperation =
typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::AccElementwiseOperation;
typename reduce_unary_operator<ReduceOpId, true, true>::AccElementwiseOperation;

using DeviceReduceInstance = DeviceReduceMultiBlock<InDataType,
AccDataType,
Expand Down Expand Up @@ -247,6 +247,13 @@ int main(int argc, char* argv[])

DeviceMem out_index_dev(indicesSizeInBytes);

InElementwiseOperation in_elementwise_op;
AccElementwiseOperation acc_elementwise_op;

std::tie(in_elementwise_op, acc_elementwise_op) =
reduce_unary_operator<ReduceOpId, true, true>::GetElementwiseOperator(
static_cast<int32_t>(reduce_total_length));

if(args.do_verification)
{
ReductionHost<InDataType,
Expand All @@ -261,8 +268,13 @@ int main(int argc, char* argv[])
OutputIndex>
hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);

hostReduce.Run(
alpha, in.mData.data(), beta, out_ref.mData.data(), out_indices_ref.mData.data());
hostReduce.Run(alpha,
in.mData.data(),
beta,
out_ref.mData.data(),
out_indices_ref.mData.data(),
in_elementwise_op,
acc_elementwise_op);
};

std::vector<ck::index_t> i_inLengths;
Expand All @@ -277,20 +289,19 @@ int main(int argc, char* argv[])

auto reduce = DeviceReduceInstance{};

auto argument_ptr = reduce.MakeArgumentPointer(
i_inLengths,
i_inStrides,
i_outLengths,
i_outStrides,
reduceDims,
alpha,
beta,
in_dev.GetDeviceBuffer(),
nullptr,
out_dev.GetDeviceBuffer(),
out_index_dev.GetDeviceBuffer(),
InElementwiseOperation{static_cast<int32_t>(reduce_total_length)},
AccElementwiseOperation{static_cast<int32_t>(reduce_total_length)});
auto argument_ptr = reduce.MakeArgumentPointer(i_inLengths,
i_inStrides,
i_outLengths,
i_outStrides,
reduceDims,
alpha,
beta,
in_dev.GetDeviceBuffer(),
nullptr,
out_dev.GetDeviceBuffer(),
out_index_dev.GetDeviceBuffer(),
in_elementwise_op,
acc_elementwise_op);

if(!reduce.IsSupportedArgument(argument_ptr.get()))
{
Expand Down
77 changes: 44 additions & 33 deletions example/12_reduce/reduce_blockwise_two_call.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,13 +31,13 @@ constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::NORM2;
constexpr bool PropagateNan = true;
constexpr bool OutputIndex = false;

using ReduceOperation = typename reduce_binary_operator<AccDataType, ReduceOpId>::opType;
using ReduceOperation = typename reduce_binary_operator<ReduceOpId>::opType;
using InElementwiseOperation =
typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation;
typename reduce_unary_operator<ReduceOpId, true, true>::InElementwiseOperation;
using AccElementwiseOperation =
typename reduce_unary_operator<AccDataType, ReduceOpId, true, true>::AccElementwiseOperation;
typename reduce_unary_operator<ReduceOpId, true, true>::AccElementwiseOperation;

using PassThroughOp = tensor_operation::element_wise::UnaryIdentic<AccDataType, AccDataType>;
using PassThroughOp = tensor_operation::element_wise::PassThrough;

using DeviceReduceInstance_1 = DeviceReduceMultiBlock<InOutDataType,
AccDataType,
Expand Down Expand Up @@ -184,6 +184,13 @@ int main(int argc, char* argv[])
if(beta != 0.0f)
out_dev.ToDevice(out.mData.data());

InElementwiseOperation in_elementwise_op;
AccElementwiseOperation acc_elementwise_op;

std::tie(in_elementwise_op, acc_elementwise_op) =
reduce_unary_operator<ReduceOpId, true, true>::GetElementwiseOperator(
static_cast<int32_t>(reduce_total_length));

if(do_verify)
{
ReductionHost<InOutDataType,
Expand All @@ -198,7 +205,13 @@ int main(int argc, char* argv[])
OutputIndex>
hostReduce(in_1.mDesc, out_ref.mDesc, invariantDims, reduceDims);

hostReduce.Run(alpha, in_1.mData.data(), beta, out_ref.mData.data(), nullptr);
hostReduce.Run(alpha,
in_1.mData.data(),
beta,
out_ref.mData.data(),
nullptr,
in_elementwise_op,
acc_elementwise_op);
};

std::vector<ck::index_t> i_inLengths_1;
Expand All @@ -217,20 +230,19 @@ int main(int argc, char* argv[])

auto reduce_1 = DeviceReduceInstance_1{};

auto argument_ptr_1 = reduce_1.MakeArgumentPointer(
i_inLengths_1,
i_inStrides_1,
i_inLengths_2,
i_inStrides_2,
reduceDims_1,
1.0f,
0.0f,
in_1_dev.GetDeviceBuffer(),
nullptr,
in_2_dev.GetDeviceBuffer(),
nullptr,
InElementwiseOperation{static_cast<int32_t>(reduce_total_length)},
PassThroughOp{});
auto argument_ptr_1 = reduce_1.MakeArgumentPointer(i_inLengths_1,
i_inStrides_1,
i_inLengths_2,
i_inStrides_2,
reduceDims_1,
1.0f,
0.0f,
in_1_dev.GetDeviceBuffer(),
nullptr,
in_2_dev.GetDeviceBuffer(),
nullptr,
in_elementwise_op,
PassThroughOp{});

if(!reduce_1.IsSupportedArgument(argument_ptr_1.get()))
{
Expand All @@ -243,20 +255,19 @@ int main(int argc, char* argv[])

auto reduce_2 = DeviceReduceInstance_2{};

auto argument_ptr_2 = reduce_2.MakeArgumentPointer(
i_inLengths_2,
i_inStrides_2,
i_outLengths,
i_outStrides,
reduceDims_2,
alpha,
beta,
in_2_dev.GetDeviceBuffer(),
nullptr,
out_dev.GetDeviceBuffer(),
nullptr,
PassThroughOp{},
AccElementwiseOperation{static_cast<int32_t>(reduce_total_length)});
auto argument_ptr_2 = reduce_2.MakeArgumentPointer(i_inLengths_2,
i_inStrides_2,
i_outLengths,
i_outStrides,
reduceDims_2,
alpha,
beta,
in_2_dev.GetDeviceBuffer(),
nullptr,
out_dev.GetDeviceBuffer(),
nullptr,
PassThroughOp{},
acc_elementwise_op);

if(!reduce_2.IsSupportedArgument(argument_ptr_2.get()))
{
Expand Down
19 changes: 9 additions & 10 deletions example/13_pool2d_fwd/pool2d_fwd_common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,24 +31,23 @@ static void pool_host_verify(const Tensor<InDataType>& in,
const std::array<ck::index_t, 2>& in_left_pads,
const std::array<ck::index_t, 2>& /*in_right_pads*/)
{
const int32_t divider = window_spatial_lengths[0] * window_spatial_lengths[1];
const int32_t reduceLength = window_spatial_lengths[0] * window_spatial_lengths[1];

using ReduceOperation = typename ck::reduce_binary_operator<AccDataType, ReduceOpId>::opType;
using InElementwiseOperation = typename ck::
reduce_unary_operator<AccDataType, ReduceOpId, true, true>::InElementwiseOperation;
using AccElementwiseOperation = typename ck::
reduce_unary_operator<AccDataType, ReduceOpId, true, true>::AccElementwiseOperation;
using ReduceOperation = typename ck::reduce_binary_operator<ReduceOpId>::opType;

const InElementwiseOperation in_elementwise_op(divider);
const AccElementwiseOperation acc_elementwise_op(divider);
auto elementwise_ops =
ck::reduce_unary_operator<ReduceOpId, true, true>::GetElementwiseOperator(reduceLength);

auto in_elementwise_op = std::get<0>(elementwise_ops);
auto acc_elementwise_op = std::get<1>(elementwise_ops);

if constexpr(!OutputIndex)
{
using Accumulation =
ck::detail::AccumulateWithNanCheck<PropagateNan, ReduceOperation, AccDataType>;

auto f_nchw = [&](auto n, auto c, auto ho, auto wo) {
auto accuVal = ReduceOperation::GetIdentityValue();
auto accuVal = ReduceOperation::template GetIdentityValue<AccDataType>();

for(ck::index_t y = 0; y < window_spatial_lengths[0]; ++y)
{
Expand Down Expand Up @@ -86,7 +85,7 @@ static void pool_host_verify(const Tensor<InDataType>& in,
AccDataType,
IndexDataType>;
auto f_nchw = [&](auto n, auto c, auto ho, auto wo) {
auto accuVal = ReduceOperation::GetIdentityValue();
auto accuVal = ReduceOperation::template GetIdentityValue<AccDataType>();
IndexDataType accuIndex = 0;

for(ck::index_t y = 0; y < window_spatial_lengths[0]; ++y)
Expand Down
13 changes: 8 additions & 5 deletions example/16_gemm_reduce/gemm_reduce_xdl_max_fp16.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,8 @@ using CLayout = ck::tensor_layout::gemm::RowMajor;
using AElementOp = ck::tensor_operation::element_wise::PassThrough;
using BElementOp = ck::tensor_operation::element_wise::PassThrough;
using CElementOp = ck::tensor_operation::element_wise::PassThrough;
using DsReduceOp = ck::Tuple<ck::reduce::Max<ReduceAccDataType>>;
using DsElementOp = ck::Tuple<
ck::tensor_operation::element_wise::UnaryIdentic<ReduceAccDataType, ReduceAccDataType, false>>;
using DsReduceOp = ck::Tuple<ck::reduce::Max>;
using DsElementOp = ck::Tuple<ck::tensor_operation::element_wise::PassThrough>;
using DGlobalMemOp =
ck::InMemoryDataOperationEnumSequence<ck::InMemoryDataOperationEnum::AtomicMax>;

Expand Down Expand Up @@ -236,10 +235,14 @@ int main(int argc, char* argv[])

for(int m = 0; m < M; ++m)
{
ReduceAccDataType d_acc = d_reduce_op.GetIdentityValue();
ReduceAccDataType d_acc = d_reduce_op.GetIdentityValue<ReduceAccDataType>();

for(int n = 0; n < N; ++n)
d_reduce_op(d_acc, c_m_n_host_result(m, n));
{
ReduceAccDataType curr_val =
ck::type_convert<ReduceAccDataType>(c_m_n_host_result(m, n));
d_reduce_op(d_acc, curr_val);
};

d_m_host_result(m) = d_acc;
}
Expand Down
21 changes: 9 additions & 12 deletions example/16_gemm_reduce/gemm_reduce_xdl_mean_squaremean_fp16.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,18 +41,15 @@ using CLayout = ck::tensor_layout::gemm::RowMajor;
using AElementOp = ck::tensor_operation::element_wise::PassThrough;
using BElementOp = ck::tensor_operation::element_wise::PassThrough;
using CElementOp = ck::tensor_operation::element_wise::PassThrough;
using D0ReduceOp = ck::reduce::Add<ReduceAccDataType>;
using D1ReduceOp = ck::reduce::Add<ReduceAccDataType>;
using D0ReduceOp = ck::reduce::Add;
using D1ReduceOp = ck::reduce::Add;
using DxsReduceOp = ck::Tuple<D0ReduceOp, D1ReduceOp>;

using UnaryIdenticElementOp =
ck::tensor_operation::element_wise::UnaryIdentic<ReduceAccDataType, ReduceAccDataType, false>;
using UnaryDivElementOp =
ck::tensor_operation::element_wise::UnaryIdentic<ReduceAccDataType, ReduceAccDataType, true>;
using UnarySquareElementOp =
ck::tensor_operation::element_wise::UnarySquare<ReduceAccDataType, ReduceAccDataType, false>;
using DxsInElementOp = ck::Tuple<UnaryIdenticElementOp, UnarySquareElementOp>;
using DxsOutElementOp = ck::Tuple<UnaryDivElementOp, UnaryDivElementOp>;
using UnaryIdenticElementOp = ck::tensor_operation::element_wise::PassThrough;
using UnaryDivElementOp = ck::tensor_operation::element_wise::UnaryDivide;
using UnarySquareElementOp = ck::tensor_operation::element_wise::UnarySquare;
using DxsInElementOp = ck::Tuple<UnaryIdenticElementOp, UnarySquareElementOp>;
using DxsOutElementOp = ck::Tuple<UnaryDivElementOp, UnaryDivElementOp>;

using DGlobalMemOp =
ck::InMemoryDataOperationEnumSequence<ck::InMemoryDataOperationEnum::AtomicAdd,
Expand Down Expand Up @@ -261,8 +258,8 @@ int main(int argc, char* argv[])

for(int m = 0; m < M; ++m)
{
float d0_acc = d0_reduce_op.GetIdentityValue();
float d1_acc = d1_reduce_op.GetIdentityValue();
float d0_acc = d0_reduce_op.GetIdentityValue<float>();

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Originally using D0ReduceOp = ck::reduce::Add<ReduceAccDataType>;

So now it should be float d0_acc = d0_reduce_op.GetIdentityValue<ReduceAccDataType>();

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, this example need a hack with regard to the using of "AccDataType"

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Corrected in ba0ce

float d1_acc = d1_reduce_op.GetIdentityValue<float>();
Comment thread
asroy marked this conversation as resolved.
Outdated

for(int n = 0; n < N; ++n)
{
Expand Down
18 changes: 8 additions & 10 deletions example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,16 +39,14 @@ using CLayout = ck::tensor_layout::gemm::RowMajor;
using AElementOp = ck::tensor_operation::element_wise::PassThrough;
using BElementOp = ck::tensor_operation::element_wise::PassThrough;
using CElementOp = ck::tensor_operation::element_wise::PassThrough;
using D0ReduceOp = ck::reduce::Add<ReduceAccDataType>;
using D1ReduceOp = ck::reduce::Add<ReduceAccDataType>;
using D0ReduceOp = ck::reduce::Add;
using D1ReduceOp = ck::reduce::Add;
using DxsReduceOp = ck::Tuple<D0ReduceOp, D1ReduceOp>;

using UnaryIdenticElementOp =
ck::tensor_operation::element_wise::UnaryIdentic<ReduceAccDataType, ReduceAccDataType, false>;
using UnarySquareElementOp =
ck::tensor_operation::element_wise::UnarySquare<ReduceAccDataType, ReduceAccDataType, false>;
using DxsInElementOp = ck::Tuple<UnaryIdenticElementOp, UnarySquareElementOp>;
using DxsOutElementOp = ck::Tuple<UnaryIdenticElementOp, UnaryIdenticElementOp>;
using UnaryIdenticElementOp = ck::tensor_operation::element_wise::PassThrough;
using UnarySquareElementOp = ck::tensor_operation::element_wise::UnarySquare;
using DxsInElementOp = ck::Tuple<UnaryIdenticElementOp, UnarySquareElementOp>;
using DxsOutElementOp = ck::Tuple<UnaryIdenticElementOp, UnaryIdenticElementOp>;

using DGlobalMemOp =
ck::InMemoryDataOperationEnumSequence<ck::InMemoryDataOperationEnum::AtomicAdd,
Expand Down Expand Up @@ -259,8 +257,8 @@ int main(int argc, char* argv[])
{
for(int m = 0; m < M; ++m)
{
float d0_acc = d0_reduce_op.GetIdentityValue();
float d1_acc = d1_reduce_op.GetIdentityValue();
float d0_acc = d0_reduce_op.GetIdentityValue<float>();

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

float d0_acc = d0_reduce_op.GetIdentityValue<ReduceAccDataType>();

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Corrected in ba0ce

float d1_acc = d1_reduce_op.GetIdentityValue<float>();

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

float d1_acc = d1_reduce_op.GetIdentityValue<ReduceAccDataType>();

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Corrected in ba0ce


for(int n = 0; n < N; ++n)
{
Expand Down
19 changes: 8 additions & 11 deletions example/21_gemm_layernorm/gemm_layernorm_xdl_fp16.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,17 +45,14 @@ using CLayout = ck::tensor_layout::gemm::RowMajor;
using AElementOp = ck::tensor_operation::element_wise::PassThrough;
using BElementOp = ck::tensor_operation::element_wise::PassThrough;
using CElementOp = ck::tensor_operation::element_wise::PassThrough;
using ReduceSumOp = ck::reduce::Add<ReduceAccDataType>;
using ReduceSumOp = ck::reduce::Add;
using DxsReduceOp = ck::Tuple<ReduceSumOp, ReduceSumOp>;

using UnaryIdenticElementOp =
ck::tensor_operation::element_wise::UnaryIdentic<ReduceAccDataType, ReduceAccDataType, false>;
using UnaryDivElementOp =
ck::tensor_operation::element_wise::UnaryIdentic<ReduceAccDataType, ReduceAccDataType, true>;
using UnarySquareElementOp =
ck::tensor_operation::element_wise::UnarySquare<ReduceAccDataType, ReduceAccDataType, false>;
using DxsInElementOp = ck::Tuple<UnaryIdenticElementOp, UnarySquareElementOp>;
using DxsOutElementOp = ck::Tuple<UnaryDivElementOp, UnaryDivElementOp>;
using UnaryIdenticElementOp = ck::tensor_operation::element_wise::PassThrough;
using UnaryDivElementOp = ck::tensor_operation::element_wise::UnaryDivide;
using UnarySquareElementOp = ck::tensor_operation::element_wise::UnarySquare;
using DxsInElementOp = ck::Tuple<UnaryIdenticElementOp, UnarySquareElementOp>;
using DxsOutElementOp = ck::Tuple<UnaryDivElementOp, UnaryDivElementOp>;

using DxsGlobalMemOp =
ck::InMemoryDataOperationEnumSequence<ck::InMemoryDataOperationEnum::AtomicAdd,
Expand Down Expand Up @@ -157,8 +154,8 @@ void host_gemm_layernorm(Tensor<LayerNormOutDataType>& out_m_n,
auto reduceSumOpInst = ReduceSumOp{};
for(int m = 0; m < M; ++m)
{
float mean_acc = reduceSumOpInst.GetIdentityValue();
float square_mean_acc = reduceSumOpInst.GetIdentityValue();
float mean_acc = reduceSumOpInst.GetIdentityValue<float>();

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

float mean_acc = reduceSumOpInst.GetIdentityValue<ReduceAccDataType>();

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Corrected in ba0ce

float square_mean_acc = reduceSumOpInst.GetIdentityValue<float>();

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

float square_mean_acc = reduceSumOpInst.GetIdentityValue<ReduceAccDataType>();

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Corrected in ba0ce


for(int n = 0; n < N; ++n)
{
Expand Down
Loading