-
Notifications
You must be signed in to change notification settings - Fork 294
elementwise op #238
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
elementwise op #238
Changes from all commits
a61f34f
c262612
b456d5e
0d26477
4af77e1
492da45
ecdfe96
0f84025
06e52d9
7d44e78
b7a82d2
83f7531
c4d610b
091eb1f
d7dca21
14dc827
c291be4
b2c1698
467bfbe
a36fa10
389cab4
8bbb4f8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,3 @@ | ||
# One example executable per source file: 2D broadcast add, 1D elementwise add, 4D elementwise add.
# NOTE(review): add_example_executable is presumably a project-defined CMake helper (not visible here) — confirm.
| add_example_executable(example_broadcast_add_2d broadcast_add_2d.cpp) | ||
| add_example_executable(example_elementwise_add_1d elementwise_add_1d.cpp) | ||
| add_example_executable(example_elementwise_add_4d elementwise_add_4d.cpp) |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,132 @@ | ||
| #include <iostream> | ||
| #include <cstdlib> | ||
| #include "check_err.hpp" | ||
| #include "config.hpp" | ||
| #include "device.hpp" | ||
| #include "host_tensor.hpp" | ||
| #include "host_tensor_generator.hpp" | ||
|
|
||
| #include "device_tensor.hpp" | ||
| #include "binary_element_wise_operation.hpp" | ||
| #include "device_binary_elementwise.hpp" | ||
|
|
||
// Short aliases for the two scalar types this example uses: ck::half_t and float.
| using F16 = ck::half_t; | ||
| using F32 = float; | ||
|
|
||
// Element types: both inputs (A, B) and the output (C) are stored as F16, while
// EltwiseComputeDataType (F32) is handed to the device op and to the host
// reference as the type the operation is evaluated in.
| using ABDataType = F16; | ||
| using CDataType = F16; | ||
| using EltwiseComputeDataType = F32; | ||
|
|
||
// Binary functor passed to both the device op and the host reference.
| using Add = ck::tensor_operation::binary_element_wise::Add; | ||
|
|
||
// Device operation exercised by this example.
// NOTE(review): the trailing `2, 8` are presumably the tensor rank and a
// per-access vector width — confirm against DeviceBinaryElementwise.
| using DeviceElementwiseAddInstance = ck::tensor_operation::device:: | ||
| DeviceBinaryElementwise<ABDataType, ABDataType, CDataType, EltwiseComputeDataType, Add, 2, 8>; | ||
|
|
||
/// Writes `value` into `dst`, converting it to the destination's own type.
///
/// `Dst` is deduced from the lvalue being assigned, which yields the output
/// element type without a separate type trait.
template <typename Dst, typename Src>
void store_broadcast2d_result(Dst& dst, const Src& value)
{
    dst = static_cast<Dst>(value);
}

/// Host-side reference for a 2D binary operation with a broadcast 1D operand.
///
/// For every (m, n) with 0 <= m < M and 0 <= n < N this reads A(m, n) and one
/// element of B, converts both to ComputeDataType, calls
/// `functor(result, a, b)` and stores `result`, converted to C's element type,
/// into C(m, n).
///
/// @tparam HostTensorA     Type of A; must be callable as A(m, n).
/// @tparam HostTensorB     Type of B; must be callable as B(i).
/// @tparam HostTensorC     Type of C; C(m, n) must yield an assignable lvalue.
/// @tparam ComputeDataType Type in which the functor is evaluated.
/// @tparam Functor         Callable invoked as functor(result, a, b); it is
///                         expected to write its output into `result`.
/// @tparam broadcastDim    0: B is indexed by the column index n, so the same
///                         B values are reused for every row m.
///                         1: B is indexed by the row index m.
///                         Any other value is rejected at compile time.
/// @param C       Output tensor.
/// @param A       First input tensor.
/// @param B       Broadcast input tensor.
/// @param M       Extent of the first index.
/// @param N       Extent of the second index.
/// @param functor Binary operation to apply.
template <typename HostTensorA,
          typename HostTensorB,
          typename HostTensorC,
          typename ComputeDataType,
          typename Functor,
          int broadcastDim>
void host_broadcast2D(
    HostTensorC& C, const HostTensorA& A, const HostTensorB& B, int M, int N, Functor functor)
{
    // Without this check every value other than 0 silently behaved like 1.
    static_assert(broadcastDim == 0 || broadcastDim == 1,
                  "host_broadcast2D: broadcastDim must be 0 or 1");

    for(int m = 0; m < M; ++m)
    {
        for(int n = 0; n < N; ++n)
        {
            // broadcastDim selects which of the two indices addresses B.
            const int b_index = (broadcastDim == 0) ? n : m;

            ComputeDataType a_val  = static_cast<ComputeDataType>(A(m, n));
            ComputeDataType b_val  = static_cast<ComputeDataType>(B(b_index));
            ComputeDataType result = 0;
            functor(result, a_val, b_val);

            store_broadcast2d_result(C(m, n), result);
        }
    }
}
|
|
||
| int main() | ||
| { | ||
| bool do_verification = true; | ||
| bool time_kernel = false; | ||
|
|
||
| ck::index_t M = 1024; | ||
| ck::index_t N = 1024; | ||
| ck::index_t Stride = 1024; | ||
|
|
||
| auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) { | ||
| return HostTensorDescriptor(std::vector<std::size_t>({len}), | ||
| std::vector<std::size_t>({stride})); | ||
| }; | ||
|
|
||
| auto f_host_tensor_descriptor2d = [](std::size_t row, std::size_t col, std::size_t stride) { | ||
| return HostTensorDescriptor(std::vector<std::size_t>({row, col}), | ||
| std::vector<std::size_t>({stride, 1})); | ||
| }; | ||
|
|
||
| Tensor<ABDataType> a_m_n(f_host_tensor_descriptor2d(M, N, Stride)); | ||
|
|
||
| Tensor<ABDataType> b_n(f_host_tensor_descriptor1d(N, 1)); | ||
|
|
||
| Tensor<CDataType> c_m_n(f_host_tensor_descriptor2d(M, N, Stride)); | ||
|
|
||
| a_m_n.GenerateTensorValue(GeneratorTensor_3<ABDataType>{0.0, 1.0}); | ||
| b_n.GenerateTensorValue(GeneratorTensor_3<ABDataType>{0.0, 1.0}); | ||
|
|
||
| DeviceMem a_m_n_device_buf(sizeof(ABDataType) * a_m_n.mDesc.GetElementSpace()); | ||
| DeviceMem b_n_device_buf(sizeof(ABDataType) * b_n.mDesc.GetElementSpace()); | ||
| DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n.mDesc.GetElementSpace()); | ||
|
|
||
| a_m_n_device_buf.ToDevice(a_m_n.mData.data()); | ||
| b_n_device_buf.ToDevice(b_n.mData.data()); | ||
|
|
||
| auto broadcastAdd = DeviceElementwiseAddInstance{}; | ||
| auto argument = broadcastAdd.MakeArgumentPointer(a_m_n_device_buf.GetDeviceBuffer(), | ||
| b_n_device_buf.GetDeviceBuffer(), | ||
| c_m_n_device_buf.GetDeviceBuffer(), | ||
| {M, N}, | ||
| {Stride, 1}, | ||
| {0, 1}, // broadcast in first dimension | ||
| {Stride, 1}, | ||
| Add{}); | ||
|
|
||
| if(!broadcastAdd.IsSupportedArgument(argument.get())) | ||
| { | ||
| throw std::runtime_error("The runtime parameters seems not supported by the " | ||
| "DeviceBinaryElementwise_2D instance, exiting!"); | ||
| }; | ||
|
|
||
| auto broadcastAdd_invoker_ptr = broadcastAdd.MakeInvokerPointer(); | ||
| float ave_time = | ||
| broadcastAdd_invoker_ptr->Run(argument.get(), StreamConfig{nullptr, time_kernel}); | ||
|
|
||
| std::cout << "Perf: " << ave_time << " ms" << std::endl; | ||
|
|
||
| bool pass = true; | ||
| if(do_verification) | ||
| { | ||
| c_m_n_device_buf.FromDevice(c_m_n.mData.data()); | ||
| Tensor<CDataType> host_c_m_n(f_host_tensor_descriptor2d(M, N, Stride)); | ||
|
|
||
| host_broadcast2D<Tensor<ABDataType>, | ||
| Tensor<ABDataType>, | ||
| Tensor<CDataType>, | ||
| EltwiseComputeDataType, | ||
| Add, | ||
| 0>(host_c_m_n, a_m_n, b_n, M, N, Add{}); | ||
|
|
||
| pass &= ck::utils::check_err( | ||
| c_m_n.mData, host_c_m_n.mData, "Error: Incorrect results d1", 1e-3, 1e-3); | ||
| } | ||
|
|
||
| return pass ? 0 : 1; | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,110 @@ | ||
| #include <iostream> | ||
| #include <cstdlib> | ||
| #include "check_err.hpp" | ||
| #include "config.hpp" | ||
| #include "device.hpp" | ||
| #include "host_tensor.hpp" | ||
| #include "host_tensor_generator.hpp" | ||
|
|
||
| #include "device_tensor.hpp" | ||
| #include "binary_element_wise_operation.hpp" | ||
| #include "device_binary_elementwise.hpp" | ||
|
|
||
// Short aliases for the two scalar types this example uses: ck::half_t and float.
| using F16 = ck::half_t; | ||
| using F32 = float; | ||
|
|
||
// Element types: the inputs (A, B) and the output (C) are stored as F16, while
// EltwiseComputeDataType (F32) is handed to the device op and to the host
// reference as the type the operation is evaluated in.
| using ABDataType = F16; | ||
| using CDataType = F16; | ||
| using EltwiseComputeDataType = F32; | ||
|
|
||
// Binary functor passed to both the device op and the host reference.
| using Add = ck::tensor_operation::binary_element_wise::Add; | ||
|
|
||
// Device operation exercised by this example.
// NOTE(review): the trailing `1, 8` are presumably the tensor rank and a
// per-access vector width — confirm against DeviceBinaryElementwise.
| using DeviceElementwiseAddInstance = ck::tensor_operation::device:: | ||
| DeviceBinaryElementwise<ABDataType, ABDataType, CDataType, EltwiseComputeDataType, Add, 1, 8>; | ||
|
|
||
/// Assigns `value` to `dst` after converting it to the destination's type,
/// which is deduced from the lvalue being written.
template <typename Dst, typename Src>
void store_elementwise1d_result(Dst& dst, const Src& value)
{
    dst = static_cast<Dst>(value);
}

/// Host-side reference for a 1D binary elementwise operation.
///
/// For each i in [0, M) the operands A(i) and B(i) are converted to
/// ComputeDataType, `functor(result, a, b)` is invoked, and `result` is
/// converted to C's element type and written to C(i).
///
/// @tparam ComputeDataType Type in which the functor is evaluated.
/// @tparam Functor         Callable invoked as functor(result, a, b); it is
///                         expected to write its output into `result`.
/// @param C       Output tensor; C(i) must yield an assignable lvalue.
/// @param A       First input tensor, read through A(i).
/// @param B       Second input tensor, read through B(i).
/// @param M       Number of elements to process.
/// @param functor Binary operation to apply.
template <typename HostTensorA,
          typename HostTensorB,
          typename HostTensorC,
          typename ComputeDataType,
          typename Functor>
void host_elementwise1D(
    HostTensorC& C, const HostTensorA& A, const HostTensorB& B, int M, Functor functor)
{
    for(int i = 0; i < M; ++i)
    {
        ComputeDataType lhs = static_cast<ComputeDataType>(A(i));
        ComputeDataType rhs = static_cast<ComputeDataType>(B(i));
        ComputeDataType out = 0;

        functor(out, lhs, rhs);
        store_elementwise1d_result(C(i), out);
    }
}
|
|
||
| int main() | ||
| { | ||
| bool do_verification = true; | ||
| bool time_kernel = false; | ||
|
|
||
| ck::index_t M = 1024; | ||
|
|
||
| auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) { | ||
| return HostTensorDescriptor(std::vector<std::size_t>({len}), | ||
| std::vector<std::size_t>({stride})); | ||
| }; | ||
|
|
||
| Tensor<ABDataType> a_m(f_host_tensor_descriptor1d(M, 1)); | ||
| Tensor<ABDataType> b_m(f_host_tensor_descriptor1d(M, 1)); | ||
| Tensor<ABDataType> c_m(f_host_tensor_descriptor1d(M, 1)); | ||
|
|
||
| a_m.GenerateTensorValue(GeneratorTensor_3<ABDataType>{0.0, 1.0}); | ||
| b_m.GenerateTensorValue(GeneratorTensor_3<ABDataType>{0.0, 1.0}); | ||
|
|
||
| DeviceMem a_m_device_buf(sizeof(ABDataType) * a_m.mDesc.GetElementSpace()); | ||
| DeviceMem b_m_device_buf(sizeof(ABDataType) * b_m.mDesc.GetElementSpace()); | ||
| DeviceMem c_m_device_buf(sizeof(CDataType) * c_m.mDesc.GetElementSpace()); | ||
|
|
||
| a_m_device_buf.ToDevice(a_m.mData.data()); | ||
| b_m_device_buf.ToDevice(b_m.mData.data()); | ||
|
|
||
| auto broadcastAdd = DeviceElementwiseAddInstance{}; | ||
| auto argument = broadcastAdd.MakeArgumentPointer(a_m_device_buf.GetDeviceBuffer(), | ||
| b_m_device_buf.GetDeviceBuffer(), | ||
| c_m_device_buf.GetDeviceBuffer(), | ||
| {M}, | ||
| {1}, | ||
| {1}, | ||
| {1}, | ||
| Add{}); | ||
|
|
||
| if(!broadcastAdd.IsSupportedArgument(argument.get())) | ||
| { | ||
| throw std::runtime_error("The runtime parameters seems not supported by the " | ||
| "DeviceBinaryElementwise_2D instance, exiting!"); | ||
| }; | ||
|
|
||
| auto broadcastAdd_invoker_ptr = broadcastAdd.MakeInvokerPointer(); | ||
| float ave_time = | ||
| broadcastAdd_invoker_ptr->Run(argument.get(), StreamConfig{nullptr, time_kernel}); | ||
|
|
||
| std::cout << "Perf: " << ave_time << " ms" << std::endl; | ||
|
|
||
| bool pass = true; | ||
| if(do_verification) | ||
| { | ||
| c_m_device_buf.FromDevice(c_m.mData.data()); | ||
| Tensor<CDataType> host_c_m(f_host_tensor_descriptor1d(M, 1)); | ||
|
|
||
| host_elementwise1D<Tensor<ABDataType>, | ||
| Tensor<ABDataType>, | ||
| Tensor<CDataType>, | ||
| EltwiseComputeDataType, | ||
| Add>(host_c_m, a_m, b_m, M, Add{}); | ||
|
|
||
| pass &= ck::utils::check_err( | ||
| c_m.mData, host_c_m.mData, "Error: Incorrect results d1", 1e-3, 1e-3); | ||
| } | ||
|
|
||
| return pass ? 0 : 1; | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,113 @@ | ||
| #include <iostream> | ||
| #include <cstdlib> | ||
| #include "check_err.hpp" | ||
| #include "config.hpp" | ||
| #include "device.hpp" | ||
| #include "host_tensor.hpp" | ||
| #include "host_tensor_generator.hpp" | ||
| #include "host_utility.hpp" | ||
|
|
||
| #include "device_tensor.hpp" | ||
| #include "binary_element_wise_operation.hpp" | ||
| #include "device_binary_elementwise.hpp" | ||
|
|
||
// Short aliases for the two scalar types this example uses: ck::half_t and float.
| using F16 = ck::half_t; | ||
| using F32 = float; | ||
|
|
||
// Element types: the inputs (A, B) and the output (C) are stored as F16, while
// EltwiseComputeDataType (F32) is handed to the device op and to the host
// reference as the type the operation is evaluated in.
| using ABDataType = F16; | ||
| using CDataType = F16; | ||
| using EltwiseComputeDataType = F32; | ||
|
|
||
// Binary functor passed to both the device op and the host reference.
| using Add = ck::tensor_operation::binary_element_wise::Add; | ||
|
|
||
// Device operation exercised by this example.
// NOTE(review): the trailing `4, 8` are presumably the tensor rank and a
// per-access vector width — confirm against DeviceBinaryElementwise.
| using DeviceElementwiseAddInstance = ck::tensor_operation::device:: | ||
| DeviceBinaryElementwise<ABDataType, ABDataType, CDataType, EltwiseComputeDataType, Add, 4, 8>; | ||
|
|
||
/// Writes `value` into `dst`, converting it to the destination's own type.
///
/// `Dst` is deduced from the lvalue being assigned, which yields the output
/// element type without a separate type trait.
template <typename Dst, typename Src>
void store_elementwise4d_result(Dst& dst, const Src& value)
{
    dst = static_cast<Dst>(value);
}

/// Host-side reference for a 4D binary elementwise operation.
///
/// Visits every index (n, c, h, w) inside `shape`, converts A(n, c, h, w) and
/// B(n, c, h, w) to ComputeDataType, calls `functor(result, a, b)` and stores
/// `result`, converted to C's element type, into C(n, c, h, w).
///
/// @tparam ComputeDataType Type in which the functor is evaluated.
/// @tparam Functor         Callable invoked as functor(result, a, b); it is
///                         expected to write its output into `result`.
/// @param C       Output tensor; C(n, c, h, w) must yield an assignable lvalue.
/// @param A       First input tensor.
/// @param B       Second input tensor.
/// @param shape   The four extents, in index order; only the first four
///                entries are read.
/// @param functor Binary operation to apply.
/// @throws std::out_of_range if `shape` has fewer than four entries.
template <typename HostTensorA,
          typename HostTensorB,
          typename HostTensorC,
          typename ComputeDataType,
          typename Functor>
void host_elementwise4D(HostTensorC& C,
                        const HostTensorA& A,
                        const HostTensorB& B,
                        const std::vector<std::size_t>& shape,
                        Functor functor)
{
    // Read the extents once. at() reports a too-short shape as an exception
    // instead of indexing past the end of the vector.
    const std::size_t len_n = shape.at(0);
    const std::size_t len_c = shape.at(1);
    const std::size_t len_h = shape.at(2);
    const std::size_t len_w = shape.at(3);

    for(std::size_t n = 0; n < len_n; ++n)
    {
        for(std::size_t c = 0; c < len_c; ++c)
        {
            for(std::size_t h = 0; h < len_h; ++h)
            {
                for(std::size_t w = 0; w < len_w; ++w)
                {
                    ComputeDataType a_val = static_cast<ComputeDataType>(A(n, c, h, w));
                    ComputeDataType b_val = static_cast<ComputeDataType>(B(n, c, h, w));
                    ComputeDataType c_val = 0;
                    functor(c_val, a_val, b_val);

                    store_elementwise4d_result(C(n, c, h, w), c_val);
                }
            }
        }
    }
}
|
|
||
| int main() | ||
| { | ||
| bool do_verification = true; | ||
| bool time_kernel = false; | ||
|
|
||
| std::vector<std::size_t> nchw = {4, 16, 32, 32}; | ||
|
|
||
| Tensor<ABDataType> a_m(nchw); | ||
| Tensor<ABDataType> b_m(nchw); | ||
| Tensor<ABDataType> c_m(nchw); | ||
|
|
||
| a_m.GenerateTensorValue(GeneratorTensor_3<ABDataType>{0.0, 1.0}); | ||
| b_m.GenerateTensorValue(GeneratorTensor_3<ABDataType>{0.0, 1.0}); | ||
|
|
||
| DeviceMem a_m_device_buf(sizeof(ABDataType) * a_m.mDesc.GetElementSpace()); | ||
| DeviceMem b_m_device_buf(sizeof(ABDataType) * b_m.mDesc.GetElementSpace()); | ||
| DeviceMem c_m_device_buf(sizeof(CDataType) * c_m.mDesc.GetElementSpace()); | ||
|
|
||
| a_m_device_buf.ToDevice(a_m.mData.data()); | ||
| b_m_device_buf.ToDevice(b_m.mData.data()); | ||
|
|
||
| auto broadcastAdd = DeviceElementwiseAddInstance{}; | ||
| auto argument = broadcastAdd.MakeArgumentPointer( | ||
| a_m_device_buf.GetDeviceBuffer(), | ||
| b_m_device_buf.GetDeviceBuffer(), | ||
| c_m_device_buf.GetDeviceBuffer(), | ||
| ck::convert_vector_element_type<std::size_t, ck::index_t>(nchw), | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. does
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good suggestion. |
||
| ck::convert_vector_element_type<std::size_t, ck::index_t>(a_m.mDesc.GetStrides()), | ||
| ck::convert_vector_element_type<std::size_t, ck::index_t>(b_m.mDesc.GetStrides()), | ||
| ck::convert_vector_element_type<std::size_t, ck::index_t>(c_m.mDesc.GetStrides()), | ||
| Add{}); | ||
|
|
||
| if(!broadcastAdd.IsSupportedArgument(argument.get())) | ||
| { | ||
| throw std::runtime_error("The runtime parameters seems not supported by the " | ||
| "DeviceBinaryElementwise_2D instance, exiting!"); | ||
| }; | ||
|
|
||
| auto broadcastAdd_invoker_ptr = broadcastAdd.MakeInvokerPointer(); | ||
| float ave_time = | ||
| broadcastAdd_invoker_ptr->Run(argument.get(), StreamConfig{nullptr, time_kernel}); | ||
|
|
||
| std::cout << "Perf: " << ave_time << " ms" << std::endl; | ||
|
|
||
| bool pass = true; | ||
| if(do_verification) | ||
| { | ||
| c_m_device_buf.FromDevice(c_m.mData.data()); | ||
| Tensor<CDataType> host_c_m(nchw); | ||
|
|
||
| host_elementwise4D<Tensor<ABDataType>, | ||
| Tensor<ABDataType>, | ||
| Tensor<CDataType>, | ||
| EltwiseComputeDataType, | ||
| Add>(host_c_m, a_m, b_m, nchw, Add{}); | ||
|
|
||
| pass &= ck::utils::check_err( | ||
| c_m.mData, host_c_m.mData, "Error: Incorrect results d1", 1e-3, 1e-3); | ||
| } | ||
|
|
||
| return pass ? 0 : 1; | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Typo, CDataType
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I sent another PR
#242