Skip to content

Commit

Permalink
[NPU] enable async copy and add wait before sync operation (PaddlePad…
Browse files Browse the repository at this point in the history
…dle#31956)

* enable async copy and  add wait before sync operation

* remove unneccessary wait

* add FillNpuTensorWithConstant

* refine

* fix fill_constant

* make TensorFromVector/TensorToVector sync
  • Loading branch information
zhiqiu committed Apr 15, 2021
1 parent 7c05f46 commit 8077a33
Show file tree
Hide file tree
Showing 7 changed files with 78 additions and 38 deletions.
32 changes: 16 additions & 16 deletions paddle/fluid/framework/tensor_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -159,11 +159,15 @@ void TensorFromVector(const std::vector<T>& src,
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
// NOTE(zhiqiu): Becareful that aclrtMemcpyAsync is different from
// cudaMemcpyAsync.
// cudaMemcpyAsync is actually "sync" between cpu <-> gpu.
// aclrtMemcpyAsync is really "async" between cpu <-> npu.
// Since vector is on cpu, I think this function should be a "sync" operation,
// so pass nullptr as stream to memory::Copy().
else if (platform::is_npu_place(dst_place)) { // NOLINT
memory::Copy(
BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, src_place,
src_ptr, size,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
src_place, src_ptr, size, nullptr);
}
#endif
}
Expand Down Expand Up @@ -202,10 +206,8 @@ inline void TensorFromVector(const std::vector<bool>& src,
#endif
#ifdef PADDLE_WITH_ASCEND_CL
else if (platform::is_npu_place(dst_place)) { // NOLINT
memory::Copy(
BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, src_place,
src_ptr, size,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
src_place, src_ptr, size, nullptr);
}
#endif
delete[] array;
Expand Down Expand Up @@ -265,10 +267,9 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx,
#endif
#ifdef PADDLE_WITH_ASCEND_CL
else if (platform::is_npu_place(src.place())) { // NOLINT
memory::Copy(
dst_place, dst_ptr, BOOST_GET_CONST(platform::NPUPlace, src.place()),
src_ptr, size,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
memory::Copy(dst_place, dst_ptr,
BOOST_GET_CONST(platform::NPUPlace, src.place()), src_ptr,
size, nullptr);
}
#endif
}
Expand Down Expand Up @@ -301,10 +302,9 @@ inline void TensorToVector(const Tensor& src,
#endif
#ifdef PADDLE_WITH_ASCEND_CL
else if (platform::is_npu_place(src.place())) { // NOLINT
memory::Copy(
dst_place, dst_ptr, BOOST_GET_CONST(platform::NPUPlace, src.place()),
src_ptr, size,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
memory::Copy(dst_place, dst_ptr,
BOOST_GET_CONST(platform::NPUPlace, src.place()), src_ptr,
size, nullptr);
}
#endif
for (unsigned int i = 0; i < src.numel(); i++) {
Expand Down
29 changes: 17 additions & 12 deletions paddle/fluid/memory/memcpy.cc
Original file line number Diff line number Diff line change
Expand Up @@ -207,19 +207,19 @@ void Copy<platform::NPUPlace, platform::CPUPlace>(platform::NPUPlace dst_place,

platform::SetNPUDeviceId(dst_place.device);

// NOTE(ascendrc): NPU memcpy async from host to device is a "real" async,
// which is different from CUDA. In Paddle, when async is called, "sync"
// is run actually, which means Paddle doesn't fully supported async.
// TODO(ascendrc): Support NPU memcpy async for better performance.
stream = nullptr;

VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place << " by thream(" << stream << ")";

if (stream) {
platform::RecordEvent record_event("NpuMemcpyAsync:CPU->NPU");
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, stream);
} else {
// On NPU, async operation after sync operation is ok, while sync operation
// after async is not ok, since the async operation may not done.
// So, its needed to do wait before sync operation.
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();

platform::RecordEvent record_event("NpuMemcpySync:CPU->NPU");
platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE);
}
Expand All @@ -235,19 +235,16 @@ void Copy<platform::CPUPlace, platform::NPUPlace>(platform::CPUPlace dst_place,

platform::SetNPUDeviceId(src_place.device);

// NOTE(ascendrc): NPU memcpy async from device to host is a "real" async,
// which is different from CUDA. In Paddle, when async is called, "sync"
// is run actually, which means Paddle doesn't fully supported async.
// TODO(ascendrc): Support NPU memcpy async for better performance.
stream = nullptr;

VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place << " by thream(" << stream << ")";

if (stream) {
platform::RecordEvent record_event("NpuMemcpyAsync:NPU->CPU");
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, stream);
} else {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();

platform::RecordEvent record_event("GpuMemcpySync:NPU->CPU");
platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST);
}
Expand All @@ -270,6 +267,10 @@ void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place,
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE,
stream);
} else {
platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance();
static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();

platform::RecordEvent record_event("NpuMemcpySync(same_npu):NPU->NPU");
platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE);
}
Expand All @@ -284,6 +285,10 @@ void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place,
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE,
stream);
} else {
platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance();
static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();

platform::RecordEvent record_event("NpuMemcpyPeerSync:NPU->NPU");
platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE);
}
Expand Down
6 changes: 0 additions & 6 deletions paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx,
TensorFromVector(init_y, ctx, tensor_y);
tensor_y->Resize({10, 10});

ctx.Wait();

auto place = ctx.GetPlace();
auto out = scope->Var("Out");
auto tensor_out = out->GetMutable<f::LoDTensor>();
Expand All @@ -74,7 +72,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx,
{{"Out", {"Out"}}}, attrs);

op->Run(*scope, place);
ctx.Wait();

std::vector<T> out_vec;
TensorToVector(*tensor_out, ctx, &out_vec);
Expand Down Expand Up @@ -122,8 +119,6 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx,
TensorFromVector(init_dout, ctx, tensor_dout);
tensor_dout->Resize({2, 3, 5});

ctx.Wait();

// run
f::AttributeMap attrs;
auto op = f::OpRegistry::CreateOp(
Expand All @@ -132,7 +127,6 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx,

auto place = ctx.GetPlace();
op->Run(*scope, place);
ctx.Wait();

std::vector<T> dx_vec;
TensorToVector(*tensor_dx, ctx, &dx_vec);
Expand Down
3 changes: 1 addition & 2 deletions paddle/fluid/operators/fill_constant_op_npu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,7 @@ class FillConstantNPUKernel : public framework::OpKernel<T> {

Tensor tensor_tmp(data_type);
tensor_tmp.mutable_data<T>({1}, ctx.GetPlace());
std::vector<T> init = {value};
TensorFromVector(init, ctx.device_context(), &tensor_tmp);
FillNpuTensorWithConstant<T>(&tensor_tmp, value);

out_var->mutable_data<T>(shape, place);
auto runner = NpuOpRunner("FillD", {tensor_tmp}, {*out_var},
Expand Down
7 changes: 5 additions & 2 deletions paddle/fluid/operators/npu_op_runner.cc
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,10 @@ aclFormat ConvertToNpuFormat(DataLayout layout) {
return iter->second;
}

aclrtStream GetCurrentNPUStream() {
int device_id = platform::GetCurrentNPUDeviceId();
aclrtStream GetCurrentNPUStream(int device_id) {
if (device_id == -1) {
device_id = platform::GetCurrentNPUDeviceId();
}
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto *dev_ctx = static_cast<platform::NPUDeviceContext *>(
pool.Get(platform::NPUPlace(device_id)));
Expand Down Expand Up @@ -299,5 +301,6 @@ void NpuOpRunner::Run(aclrtStream stream) {
VLOG(4) << "after aclopCompileAndExecute: " << ret;
PADDLE_ENFORCE_NPU_SUCCESS(ret);
}

} // namespace operators
} // namespace paddle
38 changes: 38 additions & 0 deletions paddle/fluid/operators/npu_op_runner.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,44 @@ class NpuOpRunner {

aclDataType ConvertToNpuDtype(framework::proto::VarType::Type dtype);

aclrtStream GetCurrentNPUStream(int device_id = -1);

template <typename T>
void FillNpuTensorWithConstant(Tensor *tensor, T val) {
PADDLE_ENFORCE_EQ(
tensor->IsInitialized(), true,
platform::errors::InvalidArgument("The tensor should be initialized."));
PADDLE_ENFORCE_EQ(
platform::is_npu_place(tensor->place()), true,
platform::errors::InvalidArgument("The tensor should be on NPUPlace."));
// do async for better performance
if (typeid(float) == typeid(T) || typeid(platform::float16) == typeid(T)) {
Tensor tmp(tensor->type());
tmp.Resize(tensor->dims());
tmp.mutable_data<T>(tensor->place());
auto stream = GetCurrentNPUStream(
BOOST_GET_CONST(platform::NPUPlace, tensor->place()).device);
platform::NPUMemsetAsync(tmp.data<void>(), 0, tmp.numel() * sizeof(T),
stream);
auto runner = NpuOpRunner("Power", {tmp}, {*tensor},
{{"power", static_cast<float>(1)},
{"scale", static_cast<float>(0)},
{"shift", static_cast<float>(val)}});
runner.Run(stream);
} else {
T *array = new T[tensor->numel()];
for (unsigned int i = 0; i < tensor->numel(); ++i) {
array[i] = static_cast<T>(val);
}
std::vector<T> vec(tensor->numel(), static_cast<T>(val));
// do sync copy
memory::Copy(BOOST_GET_CONST(platform::NPUPlace, tensor->place()),
tensor->data<void>(), platform::CPUPlace(), array,
tensor->numel() * sizeof(T), nullptr);
delete[] array;
}
}

} // namespace operators
} // namespace paddle
#endif
1 change: 1 addition & 0 deletions paddle/fluid/platform/device_context.cc
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,7 @@ NPUDeviceContext::~NPUDeviceContext() {
void NPUDeviceContext::Wait() const {
platform::RecordEvent record_event("NPUDeviceContext/wait");
NPUDeviceGuard guard(place_.device);
VLOG(4) << "NPU context Wait";
PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeDevice());
}

Expand Down

0 comments on commit 8077a33

Please sign in to comment.