[NPU] Support npu save load (PaddlePaddle#31893)
* support save load for NPU

* add save load npu unittest

* support np.array transform in NPU

* fix errors

* delete dygraph in unittest

* add Wait

* fix unittest

* fix review comment

* fix unittest problem

* fix little problem
pangyoki authored and zhiqiu committed Apr 15, 2021
1 parent 1d699b2 commit 1ef2b93
Showing 9 changed files with 355 additions and 40 deletions.
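In short, this commit teaches the static-graph save/load stack to stage NPU tensors through host memory: TensorToStream/TensorFromStream in tensor_util.cc, NPU kernel registrations for the save/load/save_combine/load_combine operators, an NPU branch in TensorToPyArray, and NPUPlace handling in io.py. A minimal sketch of the workflow this enables, assuming a Paddle build compiled with Ascend NPU support (PADDLE_WITH_ASCEND_CL); the program, sizes, and paths are illustrative:

import paddle

paddle.enable_static()
place = paddle.NPUPlace(0)  # available only in NPU-enabled builds

main_prog = paddle.static.Program()
startup_prog = paddle.static.Program()
with paddle.static.program_guard(main_prog, startup_prog):
    x = paddle.static.data(name="x", shape=[None, 10], dtype="float32")
    out = paddle.static.nn.fc(x, size=1)

exe = paddle.static.Executor(place)
exe.run(startup_prog)

# Saving stages each NPU parameter through a host buffer (TensorToStream).
paddle.static.save(main_prog, "./npu_model")
# Loading reads into a CPU tensor, copies it to NPUPlace, and waits on the
# device context before returning (TensorFromStream).
paddle.static.load(main_prog, "./npu_model", exe)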
49 changes: 43 additions & 6 deletions paddle/fluid/framework/tensor_util.cc
@@ -822,6 +822,29 @@ void TensorToStream(std::ostream& os, const Tensor& tensor,
#else
PADDLE_THROW(platform::errors::Unimplemented(
"XPUPlace is not supported when not compiled with XPU"));
#endif
} else if (platform::is_npu_place(tensor.place())) {
#ifdef PADDLE_WITH_ASCEND_CL
constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB
std::unique_ptr<char[]> buf(new char[kBufSize]);
auto& npu_dev_ctx =
static_cast<const platform::NPUDeviceContext&>(dev_ctx);
platform::CPUPlace cpu;
uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
while (size != 0) {
size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
memory::Copy(cpu, buf.get(),
BOOST_GET_CONST(platform::NPUPlace, tensor.place()),
reinterpret_cast<const void*>(data), size_to_write,
npu_dev_ctx.stream());
npu_dev_ctx.Wait();
os.write(buf.get(), size_to_write);
data += size_to_write;
size -= size_to_write;
}
#else
PADDLE_THROW(platform::errors::Unimplemented(
"NPUPlace is not supported when not compiled with NPU"));
#endif
} else {
os.write(static_cast<const char*>(data_ptr),
@@ -877,9 +900,10 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
auto ctx = platform::CPUDeviceContext();
size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
platform::is_xpu_place(dev_ctx.GetPlace())) {
platform::is_xpu_place(dev_ctx.GetPlace()) ||
platform::is_npu_place(dev_ctx.GetPlace())) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_XPU)
defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_ASCEND_CL)
Tensor cpu_tensor;
cpu_tensor.Resize(framework::make_ddim(shape));
framework::VisitDataType(
@@ -888,13 +912,19 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
is.read(static_cast<char*>(buf), size);
auto dst_place = dev_ctx.GetPlace();
framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor);
if (platform::is_npu_place(dev_ctx.GetPlace())) {
dev_ctx.Wait();
}
#else
if (platform::is_gpu_place(dev_ctx.GetPlace())) {
PADDLE_THROW(platform::errors::Unimplemented(
"CUDAPlace is not supported when not compiled with CUDA"));
} else {
} else if (platform::is_xpu_place(dev_ctx.GetPlace())) {
PADDLE_THROW(platform::errors::Unimplemented(
"XPUPlace is not supported when not compiled with XPU"));
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"NPUPlace is not supported when not compiled with NPU"));
}
#endif
} else {
@@ -935,9 +965,10 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
auto ctx = platform::CPUDeviceContext();
size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
platform::is_xpu_place(dev_ctx.GetPlace())) {
platform::is_xpu_place(dev_ctx.GetPlace()) ||
platform::is_npu_place(dev_ctx.GetPlace())) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_XPU)
defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_ASCEND_CL)
Tensor cpu_tensor;
cpu_tensor.Resize(framework::make_ddim(dims));
framework::VisitDataType(
@@ -946,13 +977,19 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
is.read(static_cast<char*>(buf), size);
auto dst_place = dev_ctx.GetPlace();
framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor);
if (platform::is_npu_place(dev_ctx.GetPlace())) {
dev_ctx.Wait();
}
#else
if (platform::is_gpu_place(dev_ctx.GetPlace())) {
PADDLE_THROW(platform::errors::Unimplemented(
"CUDAPlace is not supported when not compiled with CUDA"));
} else {
} else if (platform::is_xpu_place(dev_ctx.GetPlace())) {
PADDLE_THROW(platform::errors::Unimplemented(
"XPUPlace is not supported when not compiled with XPU"));
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"NPUPlace is not supported when not compiled with NPU"));
}
#endif
} else {
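The new TensorToStream branch streams device memory to disk in 64 MB chunks: each pass copies at most kBufSize bytes from NPUPlace into a reusable host buffer, calls Wait() so the asynchronous copy on the NPU stream has landed, and only then writes the buffer. A schematic Python rendering of the same chunking loop; copy_from_device is a hypothetical stand-in for memory::Copy plus the stream wait, not a real Paddle API:

KBUF_SIZE = 64 * 1024 * 1024  # mirrors kBufSize in tensor_util.cc

def tensor_to_stream(copy_from_device, total_size, out_stream):
    # copy_from_device(offset, nbytes) must return nbytes of host-resident
    # bytes, i.e. it blocks until the device-to-host copy has completed.
    offset, remaining = 0, total_size
    while remaining > 0:
        size_to_write = min(KBUF_SIZE, remaining)  # never overrun the buffer
        out_stream.write(copy_from_device(offset, size_to_write))
        offset += size_to_write
        remaining -= size_to_write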
25 changes: 25 additions & 0 deletions paddle/fluid/operators/load_combine_op_npu.cc
@@ -0,0 +1,25 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/load_combine_op.h"

namespace ops = paddle::operators;

REGISTER_OP_NPU_KERNEL(
load_combine,
ops::LoadCombineOpKernel<paddle::platform::NPUDeviceContext, float>,
ops::LoadCombineOpKernel<paddle::platform::NPUDeviceContext, double>,
ops::LoadCombineOpKernel<paddle::platform::NPUDeviceContext, int>,
ops::LoadCombineOpKernel<paddle::platform::NPUDeviceContext, int8_t>,
ops::LoadCombineOpKernel<paddle::platform::NPUDeviceContext, int64_t>);
24 changes: 24 additions & 0 deletions paddle/fluid/operators/load_op_npu.cc
@@ -0,0 +1,24 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/load_op.h"

namespace ops = paddle::operators;

REGISTER_OP_NPU_KERNEL(
load, ops::LoadOpKernel<paddle::platform::NPUDeviceContext, float>,
ops::LoadOpKernel<paddle::platform::NPUDeviceContext, double>,
ops::LoadOpKernel<paddle::platform::NPUDeviceContext, int>,
ops::LoadOpKernel<paddle::platform::NPUDeviceContext, int8_t>,
ops::LoadOpKernel<paddle::platform::NPUDeviceContext, int64_t>);
24 changes: 24 additions & 0 deletions paddle/fluid/operators/save_combine_op_npu.cc
@@ -0,0 +1,24 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/save_combine_op.h"

namespace ops = paddle::operators;

REGISTER_OP_NPU_KERNEL(
save_combine,
ops::SaveCombineOpKernel<paddle::platform::NPUDeviceContext, float>,
ops::SaveCombineOpKernel<paddle::platform::NPUDeviceContext, double>,
ops::SaveCombineOpKernel<paddle::platform::NPUDeviceContext, int>,
ops::SaveCombineOpKernel<paddle::platform::NPUDeviceContext, int64_t>);
28 changes: 28 additions & 0 deletions paddle/fluid/operators/save_op_npu.cc
@@ -0,0 +1,28 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/save_op.h"
#include "paddle/fluid/platform/float16.h"

namespace ops = paddle::operators;

REGISTER_OP_NPU_KERNEL(
save, ops::SaveOpKernel<paddle::platform::NPUDeviceContext, float>,
ops::SaveOpKernel<paddle::platform::NPUDeviceContext, double>,
ops::SaveOpKernel<paddle::platform::NPUDeviceContext, int>,
ops::SaveOpKernel<paddle::platform::NPUDeviceContext, uint8_t>,
ops::SaveOpKernel<paddle::platform::NPUDeviceContext, int8_t>,
ops::SaveOpKernel<paddle::platform::NPUDeviceContext, int64_t>,
ops::SaveOpKernel<paddle::platform::NPUDeviceContext,
paddle::platform::float16>);
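Together, these four files register NPU kernels for the classic save/load and save_combine/load_combine operators (note that save additionally covers uint8_t and float16). A hedged sketch of exercising them through the old-style fluid.io interface, assuming an NPU build and a main_prog like the one in the sketch near the top:

import paddle
import paddle.fluid as fluid

paddle.enable_static()
exe = fluid.Executor(paddle.NPUPlace(0))  # NPU builds only

# save_params/load_params emit per-variable save/load ops, which now
# dispatch to the NPU kernels registered above.
fluid.io.save_params(exe, "./params_dir", main_program=main_prog)
fluid.io.load_params(exe, "./params_dir", main_program=main_prog)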
31 changes: 30 additions & 1 deletion paddle/fluid/pybind/tensor_py.h
@@ -663,6 +663,7 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor,
}
bool is_gpu_tensor = platform::is_gpu_place(tensor.place());
bool is_xpu_tensor = platform::is_xpu_place(tensor.place());
bool is_npu_tensor = platform::is_npu_place(tensor.place());
const auto &tensor_dims = tensor.dims();
auto tensor_dtype = tensor.type();
size_t sizeof_dtype = framework::SizeOfType(tensor_dtype);
@@ -681,7 +682,7 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor,

std::string py_dtype_str = details::TensorDTypeToPyDTypeStr(tensor.type());

if (!is_gpu_tensor && !is_xpu_tensor) {
if (!is_gpu_tensor && !is_xpu_tensor && !is_npu_tensor) {
if (!need_deep_copy) {
auto base = py::cast(std::move(tensor));
return py::array(py::dtype(py_dtype_str.c_str()), py_dims, py_strides,
@@ -749,6 +750,34 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor,
PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot use CUDAPlace in CPU only version, "
"Please recompile or reinstall Paddle with CUDA support."));
#endif
} else if (is_npu_tensor) {
#ifdef PADDLE_WITH_ASCEND_CL
py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides);
PADDLE_ENFORCE_EQ(py_arr.writeable(), true,
platform::errors::InvalidArgument(
"PyArray is not writable, in which case memory leak "
"or double free would occur"));
PADDLE_ENFORCE_EQ(
py_arr.owndata(), true,
platform::errors::InvalidArgument(
"PyArray does not own data, in which case memory leak "
"or double free would occur"));

size_t copy_bytes = sizeof_dtype * numel;
auto p = BOOST_GET_CONST(platform::NPUPlace, tensor.place());
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &ctx = *pool.Get(tensor.place());
paddle::memory::Copy(
platform::CPUPlace(), py_arr.mutable_data(), p, tensor_buf_ptr,
copy_bytes,
reinterpret_cast<const platform::NPUDeviceContext &>(ctx).stream());
ctx.Wait();
return py_arr;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot use NPUPlace in CPU/GPU/XPU version, "
"Please recompile or reinstall Paddle with NPU support."));
#endif
}
PADDLE_THROW(platform::errors::Unimplemented("Place is not supported"));
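The new is_npu_tensor branch is what lets np.array() consume an NPU-resident tensor: it allocates a writable, data-owning numpy buffer, issues an NPU-to-CPU memory::Copy on the device stream, and waits on the context before handing the array back. A small sketch, assuming an NPU build:

import numpy as np
import paddle

paddle.enable_static()
t = paddle.static.global_scope().var("w").get_tensor()
t.set(np.ones([2, 3], dtype="float32"), paddle.NPUPlace(0))

arr = np.array(t)  # hits the is_npu_tensor path in TensorToPyArray
assert arr.shape == (2, 3)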
8 changes: 8 additions & 0 deletions python/paddle/fluid/io.py
@@ -2041,6 +2041,10 @@ def set_var(var, ndarray):
p = paddle.fluid.core.Place()
p.set_place(t._place())
place = paddle.fluid.XPUPlace(p.xpu_device_id())
elif p.is_npu_place():
p = paddle.fluid.core.Place()
p.set_place(t._place())
place = paddle.fluid.NPUPlace(p.npu_device_id())
else:
p = paddle.fluid.core.Place()
p.set_place(t._place())
@@ -2335,6 +2339,10 @@ def set_program_state(program, state_dict):
p = paddle.fluid.core.Place()
p.set_place(ten_place)
py_place = paddle.fluid.XPUPlace(p.xpu_device_id())
elif ten_place.is_npu_place():
p = paddle.fluid.core.Place()
p.set_place(ten_place)
py_place = paddle.fluid.NPUPlace(p.npu_device_id())

ten.set(new_para_np, py_place)

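With these two branches, set_var and set_program_state can restore checkpoints onto NPU devices: when the destination tensor lives on an NPU, the host-side numpy array is set through paddle.fluid.NPUPlace(p.npu_device_id()). A hedged sketch, assuming an NPU build and a checkpoint written by the example near the top:

import paddle
import paddle.fluid as fluid

paddle.enable_static()
# load_program_state reads the checkpoint into host numpy arrays;
# set_program_state then resolves each parameter's place (now possibly
# an NPUPlace) before calling tensor.set(...).
state = fluid.io.load_program_state("./npu_model")
fluid.io.set_program_state(main_prog, state)  # main_prog: the saved Program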
108 changes: 108 additions & 0 deletions python/paddle/fluid/tests/unittests/npu/test_save_load_npu.py
@@ -0,0 +1,108 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import unittest
import sys
sys.path.append("..")
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.nn import Embedding
import paddle.fluid.framework as framework
from paddle.fluid.optimizer import Adam
from paddle.fluid.dygraph.base import to_variable
from test_imperative_base import new_program_scope
from paddle.fluid.executor import global_scope
import numpy as np
import six
import pickle
import os
import errno
from test_static_save_load import *

paddle.enable_static()


@unittest.skipIf(not paddle.is_compiled_with_npu(),
"core is not compiled with NPU")
class TestNPUSaveLoadBase(TestSaveLoadBase):
def set_place(self):
return fluid.CPUPlace() if not core.is_compiled_with_npu(
) else paddle.NPUPlace(0)


@unittest.skipIf(not paddle.is_compiled_with_npu(),
"core is not compiled with NPU")
class TestNPUSaveLoadPartial(TestSaveLoadPartial):
def set_place(self):
return fluid.CPUPlace() if not core.is_compiled_with_npu(
) else paddle.NPUPlace(0)


@unittest.skipIf(not paddle.is_compiled_with_npu(),
"core is not compiled with NPU")
class TestNPUSaveLoadSetStateDict(TestSaveLoadSetStateDict):
def set_place(self):
return fluid.CPUPlace() if not core.is_compiled_with_npu(
) else paddle.NPUPlace(0)


@unittest.skipIf(not paddle.is_compiled_with_npu(),
"core is not compiled with NPU")
class TestNPUProgramStatePartial(TestProgramStatePartial):
def set_place(self):
return fluid.CPUPlace() if not core.is_compiled_with_npu(
) else paddle.NPUPlace(0)


@unittest.skipIf(not paddle.is_compiled_with_npu(),
"core is not compiled with NPU")
class TestNPULoadFromOldInterface(TestLoadFromOldInterface):
def set_place(self):
return fluid.CPUPlace() if not core.is_compiled_with_npu(
) else paddle.NPUPlace(0)


@unittest.skipIf(not paddle.is_compiled_with_npu(),
"core is not compiled with NPU")
class TestNPULoadFromOldInterfaceSingleFile(TestLoadFromOldInterfaceSingleFile):
def set_place(self):
return fluid.CPUPlace() if not core.is_compiled_with_npu(
) else paddle.NPUPlace(0)


@unittest.skipIf(not paddle.is_compiled_with_npu(),
"core is not compiled with NPU")
class TestNPUProgramStateOldSave(TestProgramStateOldSave):
def setUp(self):
self.test_dygraph = False

def set_place(self):
return fluid.CPUPlace() if not core.is_compiled_with_npu(
) else paddle.NPUPlace(0)


@unittest.skipIf(not paddle.is_compiled_with_npu(),
"core is not compiled with NPU")
class TestNPUProgramStateOldSaveSingleModel(TestProgramStateOldSaveSingleModel):
def set_place(self):
return fluid.CPUPlace() if not core.is_compiled_with_npu(
) else paddle.NPUPlace(0)


if __name__ == '__main__':
paddle.enable_static()
unittest.main()
[The diff for the ninth changed file did not load.]
