Commit 3701d55

author Leo Chen <[email protected]> 1617782111 +0800
committer liuwei88 <[email protected]> 1617789509 +0800

[NPU] support CANN 20.3 (PaddlePaddle#32044)

* fix compile problem on CANN 20.3
* fix ut
* fix test_mul
* fix check_finite_and_scale
* fix lookup_table_v2_grad
* fix cmake
* support print op
* add reduce op
* add more tests
* add padding set
* add reduce sum op and try to pr
* add reduce sum op and try to pr 2
* test all ops: successful in 28 but failed in 24 (broadcast and reducescatter)
* add send/recv op
* change some file types
zhiqiu authored and lw921014 committed Apr 7, 2021
1 parent 78959a3 commit 3701d55
Showing 44 changed files with 1,747 additions and 833 deletions.
9 changes: 6 additions & 3 deletions cmake/external/ascend.cmake
@@ -21,6 +21,11 @@ else()
set(ASCEND_DIR /usr/local/Ascend)
endif()

if(EXISTS ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include/graph/ascend_string.h)
# The presence of ascend_string.h means CANN 20.2 or later
add_definitions(-DPADDLE_WITH_ASCEND_STRING)
endif()

if(WITH_ASCEND)
set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64)
set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common)
@@ -43,9 +48,7 @@ if(WITH_ASCEND)
set(atlas_acl_lib ${ATLAS_RUNTIME_DIR}/libascendcl.so)
INCLUDE_DIRECTORIES(${ATLAS_RUNTIME_INC_DIR})

if(EXISTS ${ATLAS_RUNTIME_INC_DIR}/graph/ascend_string.h)
add_definitions(-DPADDLE_WITH_ASCEND_STRING)
endif()


ADD_LIBRARY(ascend_ge SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascend_ge PROPERTY IMPORTED_LOCATION ${atlas_ge_runner_lib})
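The net effect of the CMake change above is that PADDLE_WITH_ASCEND_STRING is now defined whenever the installed toolkit ships graph/ascend_string.h (CANN 20.2 or later), instead of only inside the WITH_ASCEND branch. As a self-contained illustration of how such a CMake-defined macro is consumed (placeholder code, not taken from Paddle):

    #include <iostream>

    // Minimal sketch: behaviour switches at compile time on the macro that
    // ascend.cmake defines via add_definitions(-DPADDLE_WITH_ASCEND_STRING).
    int main() {
    #ifdef PADDLE_WITH_ASCEND_STRING
      std::cout << "built against a CANN 20.2+ toolkit (ascend_string.h found)\n";
    #else
      std::cout << "built against an older CANN toolkit\n";
    #endif
      return 0;
    }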
1 change: 1 addition & 0 deletions paddle/fluid/framework/operator.cc
@@ -400,6 +400,7 @@ OperatorBase::OperatorBase(const std::string& type,
// framework::OpRegistry::CreateOp(type, {}, {}, {}, false).
// Inputs, outputs and attrs will be set to empty map
// to improve the execution efficiency of dygraph.

if (inputs_.size() > 0 || outputs_.size() > 0) {
GenerateTemporaryNames();
CheckAllInputOutputSet();
12 changes: 12 additions & 0 deletions paddle/fluid/framework/var_type_traits.h
@@ -31,6 +31,11 @@
#endif
#endif

#ifdef PADDLE_WITH_ASCEND_CL
#include <hccl/hccl.h>
#include <hccl/hccl_types.h>
#endif

#if defined(PADDLE_WITH_XPU_BKCL)
#include "xpu/bkcl.h"
#endif
@@ -45,6 +50,10 @@ class Communicator;
class NCCLCommunicator;
#endif
#endif
#ifdef PADDLE_WITH_ASCEND_CL
class Communicator;
class HCCLCommunicator;
#endif

#if defined(PADDLE_WITH_XPU_BKCL)
class BKCLCommunicator;
@@ -157,6 +166,9 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl<
#endif
operators::CudnnRNNCache,
#endif
#if defined(PADDLE_WITH_ASCEND_CL)
HcclRootInfo,
#endif
#if defined(PADDLE_WITH_XPU_BKCL)
BKCLUniqueId, platform::BKCLCommunicator,
#endif
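Registering HcclRootInfo with the variable-type registry is what allows the new HCCL-id ops to pass the root info through framework variables. A minimal sketch of that usage, assuming the Paddle framework headers and an Ascend (PADDLE_WITH_ASCEND_CL) build; the function and variable names here are illustrative only:

    #include "paddle/fluid/framework/scope.h"
    #include "paddle/fluid/framework/var_type_traits.h"

    // Because HcclRootInfo is now a registered variable type, a scope variable
    // can hold one directly; the c_gen_hccl_id op in the tests below writes
    // into exactly such a variable.
    void StoreHcclRootInfo(paddle::framework::Scope* scope) {
      auto* var = scope->Var("HcclId");  // hypothetical variable name
      HcclRootInfo* id = var->GetMutable<HcclRootInfo>();
      (void)id;  // filled in later, e.g. by running the c_gen_hccl_id op
    }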
1 change: 0 additions & 1 deletion paddle/fluid/operators/CMakeLists.txt
@@ -159,7 +159,6 @@ endif()

if (WITH_ASCEND_CL)
cc_test(range_op_npu_test SRCS range_op_npu_test.cc DEPS op_registry range_op scope device_context enforce executor)
cc_test(lookup_table_v2_op_npu_test SRCS lookup_table_v2_op_npu_test.cc DEPS op_registry lookup_table_v2_op scope device_context enforce executor compare_op)
cc_test(expand_op_npu_test SRCS expand_op_npu_test.cc DEPS op_registry expand_op scope device_context enforce executor compare_op)
endif()

@@ -61,7 +61,6 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> {

size_t x_size = xs.size();
for (size_t i = 0; i < x_size; ++i) {
found_inf_data = true;
const auto* x = xs[i];
auto* out = outs[i];
out->mutable_data<T>(ctx.GetPlace());
@@ -77,6 +76,8 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> {
NpuOpRunner("CheckNumerics", {*x}, {check_xout},
{{"message", std::string("check_nan_and_inf")}});
runner_checknumerics.Run(stream);
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.Wait();
} catch (platform::EnforceNotMet& exception) {
LOG(WARNING) << "[check_nan_and_inf] detected contains NaN or INF!!!";
found_inf_data = true;
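The added Wait() call is presumably there so that a failure raised by the asynchronous CheckNumerics kernel surfaces while execution is still inside the try block. Condensed from the hunk above (declarations of x, check_xout, stream, ctx and found_inf_data omitted):

    try {
      const auto& runner_checknumerics =
          NpuOpRunner("CheckNumerics", {*x}, {check_xout},
                      {{"message", std::string("check_nan_and_inf")}});
      runner_checknumerics.Run(stream);
      // Block on the NPU stream: without this, the kernel could fail after
      // the try block has exited and the NaN/Inf case would go undetected.
      ctx.template device_context<paddle::platform::NPUDeviceContext>().Wait();
    } catch (platform::EnforceNotMet& exception) {
      LOG(WARNING) << "[check_nan_and_inf] detected contains NaN or INF!!!";
      found_inf_data = true;
    }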
@@ -110,10 +110,10 @@ void Compare(f::Scope *scope, const p::DeviceContext &ctx) {
// out found_inf
Tensor found_inf_tensor;
found_inf_tensor.Resize({1});
bool *is_finite_data =
bool *found_inf_data =
found_inf_tensor.mutable_data<bool>(paddle::platform::CPUPlace());
f::TensorCopy(*found_inf, place, &found_inf_tensor);
EXPECT_FALSE(*is_finite_data);
EXPECT_TRUE(*found_inf_data);

ctx.Wait();
}
26 changes: 15 additions & 11 deletions paddle/fluid/operators/collective/CMakeLists.txt
@@ -11,7 +11,7 @@ foreach(src ${OPS})
set_source_files_properties(${src} PROPERTIES COMPILE_FLAGS ${COLLECTIVE_COMPILE_FLAGS})
endforeach()

register_operators(EXCLUDES c_gen_bkcl_id_op gen_bkcl_id_op c_gen_nccl_id_op gen_nccl_id_op DEPS ${COLLECTIVE_DEPS})
register_operators(EXCLUDES c_gen_bkcl_id_op gen_bkcl_id_op c_gen_nccl_id_op gen_nccl_id_op c_gen_hccl_id_op gen_hccl_id_op DEPS ${COLLECTIVE_DEPS})

if(WITH_NCCL)
set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} nccl_common collective_helper)
@@ -24,39 +24,43 @@ if(WITH_GLOO)
endif()

if(WITH_XPU_BKCL)
set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper)
set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper nccl_common)
op_library(c_gen_bkcl_id_op DEPS ${COLLECTIVE_DEPS})
op_library(gen_bkcl_id_op DEPS ${COLLECTIVE_DEPS})
endif()

if(WITH_ASCEND_CL)
set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper)
cc_library(gen_hccl_id_op_helper SRCS gen_hccl_id_op_helper.cc DEPS dynload_warpctc dynamic_loader scope)
set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper gen_hccl_id_op_helper)
op_library(c_gen_hccl_id_op DEPS ${COLLECTIVE_DEPS})
op_library(gen_hccl_id_op DEPS ${COLLECTIVE_DEPS})
endif()

set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COLLECTIVE_DEPS} PARENT_SCOPE)
set(GLOB_COLLECTIVE_DEPS ${COLLECTIVE_DEPS} CACHE INTERNAL "collective dependency")

if(WITH_ASCEND_CL)
set(COMMON_TEST_DEPS_FOR_HCOM c_comm_init_hcom_op op_registry ascend_hccl flags
set(COMMON_TEST_DEPS_FOR_HCOM c_comm_init_hccl_op c_gen_hccl_id_op gen_hccl_id_op_helper
gen_hccl_id_op op_registry ascend_hccl flags
dynamic_loader dynload_warpctc scope device_context enforce executor)
cc_test(c_broadcast_op_npu_test SRCS c_broadcast_op_npu_test.cc
DEPS c_broadcast_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
cc_test(c_allreduce_sum_op_npu_test SRCS c_allreduce_sum_op_npu_test.cc
DEPS c_allreduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
cc_test(c_allreduce_max_op_npu_test SRCS c_allreduce_max_op_npu_test.cc
DEPS c_allreduce_max_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
cc_test(c_reduce_sum_op_npu_test SRCS c_reduce_sum_op_npu_test.cc
DEPS c_reduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
cc_test(c_reducescatter_op_npu_test SRCS c_reducescatter_op_npu_test.cc
DEPS c_reducescatter_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
cc_test(c_allgather_op_npu_test SRCS c_allgather_op_npu_test.cc
DEPS c_allgather_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
cc_test(c_reduce_sum_op_npu_test SRCS c_reduce_sum_op_npu_test.cc
DEPS c_reduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
cc_test(c_allreduce_max_op_npu_test SRCS c_allreduce_max_op_npu_test.cc
DEPS c_allreduce_max_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
cc_test(send_v2_op_npu_test SRCS send_v2_op_npu_test.cc
DEPS send_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
cc_test(recv_v2_op_npu_test SRCS recv_v2_op_npu_test.cc
DEPS recv_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
cc_test(c_sync_comm_stream_op_npu_test SRCS c_sync_comm_stream_op_npu_test.cc
DEPS op_registry c_broadcast_op c_comm_init_hcom_op c_sync_comm_stream_op ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor)
cc_test(c_sync_calc_stream_op_npu_test SRCS c_sync_calc_stream_op_npu_test.cc
cc_test(c_sync_comm_stream_op_npu_test SRCS c_sync_comm_stream_op_npu_test.cc
DEPS op_registry c_broadcast_op c_comm_init_hccl_op c_sync_comm_stream_op ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor)
cc_test(c_sync_calc_stream_op_npu_test SRCS c_sync_calc_stream_op_npu_test.cc
DEPS op_registry elementwise_add_op c_sync_calc_stream_op ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor)
endif()
14 changes: 6 additions & 8 deletions paddle/fluid/operators/collective/c_allgather_op_npu.cc
@@ -31,20 +31,19 @@ class CAllGatherOpASCENDKernel : public framework::OpKernel<T> {
#if defined(PADDLE_WITH_ASCEND_CL)
auto in = ctx.Input<framework::Tensor>("X");
auto out = ctx.Output<framework::Tensor>("Out");
hcclDataType_t dtype = platform::ToHCCLDataType(in->type());
HcclDataType dtype = platform::ToHCCLDataType(in->type());

int ring_id = ctx.Attr<int>("ring_id");
std::string group = std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
auto place = ctx.GetPlace();
auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place);
int nranks = comm->nranks();
std::string tag = std::to_string(ring_id) + "_" + std::to_string(comm->NextTagId());

framework::DDim out_dims = in->dims();
out_dims[0] *= nranks;
out->mutable_data<T>(out_dims, place);

int64_t send_numel = in->numel();
uint64_t send_numel = in->numel();
void *send_buff = reinterpret_cast<void*>(const_cast<T*>(in->data<T>()));
void *recv_buff = reinterpret_cast<void*>(out->data<T>());

@@ -59,12 +58,11 @@ class CAllGatherOpASCENDKernel : public framework::OpKernel<T> {
VLOG(3) << "begin hccl allgather, parameter is: "
<< ", group is " << group
<< ", ring_id is " << ring_id
<< ", nranks is " << nranks
<< ", tag is " << tag;
<< ", nranks is " << nranks;

PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_all_gather(
tag.c_str(), send_buff, recv_buff, (u64)send_numel, dtype,
group.c_str(), (void*)stream));
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllGather(
send_buff, recv_buff, send_numel, dtype,
comm->comm(), (void*)stream));

#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
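The substantive change in this kernel is the move from the older hcom_* entry points, which identify a collective by a string tag and group name, to the Hccl* entry points used by newer CANN releases, which operate on the communicator handle directly. Side by side, with both calls taken from this diff and the surrounding declarations omitted:

    // Removed: tag/group-based call.
    // PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_all_gather(
    //     tag.c_str(), send_buff, recv_buff, (u64)send_numel, dtype,
    //     group.c_str(), (void*)stream));

    // Added: communicator-handle-based call.
    PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllGather(
        send_buff, recv_buff, send_numel, dtype,
        comm->comm(), (void*)stream));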
82 changes: 63 additions & 19 deletions paddle/fluid/operators/collective/c_allgather_op_npu_test.cc
@@ -34,6 +34,7 @@ limitations under the License. */
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
#include "paddle/fluid/operators/collective/c_allgather_op.h"
#include "paddle/fluid/operators/collective/c_reducescatter_op.h"
#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"

#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
@@ -45,7 +46,8 @@ namespace p = paddle::platform;
namespace m = paddle::operators::math;

USE_OP(c_allgather);
USE_NO_KERNEL_OP(c_comm_init_hcom);
USE_NO_KERNEL_OP(c_gen_hccl_id);
USE_NO_KERNEL_OP(c_comm_init_hccl);
USE_OP_DEVICE_KERNEL(c_allgather, NPU);

DECLARE_string(selected_npus);
@@ -56,41 +58,81 @@ void PrintDebugInfo(const std::string preStr, const std::vector<T> &data){
for (auto ele : data) {
debugstring += std::to_string(ele) + std::string(",");
}
VLOG(2) << preStr << ":" << std::endl <<debugstring;
VLOG(2) << preStr << ":" << std::endl <<debugstring;
}

void Prepare(f::Scope* scope, const p::DeviceContext& ctx){
void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, HcclRootInfo* hccl_id){

int rank_id = atoi(getenv("RANK_ID"));
int device_id = atoi(getenv("DEVICE_ID"));

VLOG(2) << "rank_id = " << rank_id
<< "; device_id = " << device_id
<< "; rank_id = " << rank_id
<< "; RANK_TABLE_FILE = " << atoi(getenv("RANK_TABLE_FILE"));
<< "; device_id = " << device_id
<< "; rank_id = " << rank_id
<< "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));

std::vector<int> rank_ids{0, 1};
f::AttributeMap gen_hccl_id;


std::vector<std::string > endpointList={"127.0.0.1:6175", "127.0.0.1:6177"};
gen_hccl_id["rank"] = rank_id;
gen_hccl_id["endpoint"] = endpointList[rank_id];
std::vector<std::string> other_endpoints= {endpointList[rank_id == 0 ? 1 : 0]};
gen_hccl_id["other_endpoints"] = other_endpoints;

auto out = scope->Var("Out");
auto id = out->GetMutable<HcclRootInfo>();

VLOG(3) << "break";

auto comm_init_op =
f::OpRegistry::CreateOp("c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id);
VLOG(3) << "break";
auto place = ctx.GetPlace();
comm_init_op->Run(*scope, place);
ctx.Wait();

memcpy(hccl_id, id, 1024);
}

void Prepare(f::Scope* scope, const p::DeviceContext& ctx, HcclRootInfo* hccl_id){

auto x = scope->Var("X");
auto id = x->GetMutable<HcclRootInfo>();

memcpy(id, hccl_id, 1024);

int rank_id = atoi(getenv("RANK_ID"));
int device_id = atoi(getenv("DEVICE_ID"));

VLOG(2) << "rank_id = " << rank_id
<< "; device_id = " << device_id
<< "; rank_id = " << rank_id
<< "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));

// std::vector<int> rank_ids{0, 1};
f::AttributeMap comm_init_attrs;
comm_init_attrs["ring_id"] = 0;
comm_init_attrs["nranks"] = 2;
comm_init_attrs["rank_ids"] = 2;
comm_init_attrs["rank"] = rank_id;
comm_init_attrs["device_id"] = device_id;
comm_init_attrs["rank_ids"] = rank_ids;
// comm_init_attrs["rank_ids"] = rank_ids;
auto comm_init_op =
f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs);
f::OpRegistry::CreateOp("c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
auto place = ctx.GetPlace();
comm_init_op->Run(*scope, place);
ctx.Wait();
}

void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) {
// init
auto x = scope->Var("X");
auto x = scope->Var("Data");
auto tensor_x = x->GetMutable<f::LoDTensor>();

std::vector<float> init;
int rank_id = atoi(getenv("RANK_ID"));

int num1 = 1;
int num2 = 4;

@@ -104,7 +146,7 @@ void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) {
ctx.Wait();

auto place = ctx.GetPlace();
auto out = scope->Var("Out");
auto out = scope->Var("OutData");
auto tensor_out = out->GetMutable<f::LoDTensor>();
tensor_out->Resize({num1, num2});
tensor_out->mutable_data<float>(place); // allocate
@@ -115,15 +157,15 @@ void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) {
attrs["tag"]=std::string("tagx");
attrs["ring_id"]=0;
attrs["nranks"]=2;
auto op = f::OpRegistry::CreateOp("c_allgather", {{"X", {"X"}}},
{{"Out", {"Out"}}}, attrs);

auto op = f::OpRegistry::CreateOp("c_allgather", {{"X", {"Data"}}},
{{"Out", {"OutData"}}}, attrs);

for (int i = 0; i < 10; i ++) {
op->Run(*scope, place);
}
ctx.Wait();

std::vector<float> out_vec;
TensorToVector(*tensor_out, ctx, &out_vec);
ctx.Wait();
@@ -142,10 +184,12 @@ void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) {

TEST(c_allgather, NPU) {
f::Scope scope;
HcclRootInfo hccl_id;

// only support one device, if more than one device, use first default
// only support one device, if more than one device, use first default
p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));

Prepare(&scope, ctx);
PrepareUniqueId(&scope, ctx, &hccl_id);
Prepare(&scope, ctx, &hccl_id);
TestHCCLAllGatherOp(&scope, ctx);
}
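One fragility worth noting in these tests: RANK_ID and DEVICE_ID are read with atoi(getenv(...)), which dereferences a null pointer when the variable is unset. A small, self-contained guard (not part of this diff; names are illustrative) would be:

    #include <cstdlib>
    #include <iostream>

    // Returns the integer value of an environment variable, or a default
    // when the variable is not set, avoiding atoi(nullptr).
    static int EnvToInt(const char* name, int default_value) {
      const char* value = std::getenv(name);
      return value != nullptr ? std::atoi(value) : default_value;
    }

    int main() {
      int rank_id = EnvToInt("RANK_ID", 0);
      int device_id = EnvToInt("DEVICE_ID", 0);
      std::cout << "rank_id=" << rank_id << " device_id=" << device_id << "\n";
      return 0;
    }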