Merge pull request #27 from PaddlePaddle/develop
update
AnnaTrainingG authored Sep 3, 2021
2 parents e1a92d6 + f13dcfb commit 05da032
Showing 129 changed files with 9,609 additions and 1,192 deletions.
2 changes: 1 addition & 1 deletion cmake/external/xpu.cmake
@@ -35,7 +35,7 @@ ELSE ()
ENDIF()

SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210818")
SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210830")
SET(XPU_XRE_URL "${XPU_BASE_URL}/${XPU_XRE_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
SET(XPU_XDNN_URL "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
SET(XPU_XCCL_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210623/${XPU_XCCL_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
1 change: 1 addition & 0 deletions cmake/operators.cmake
@@ -183,6 +183,7 @@ function(op_library TARGET)
list(REMOVE_ITEM miopen_cu_cc_srcs "affine_grid_cudnn_op.cu.cc")
list(REMOVE_ITEM miopen_cu_cc_srcs "grid_sampler_cudnn_op.cu.cc")
list(REMOVE_ITEM hip_srcs "cholesky_op.cu")
list(REMOVE_ITEM hip_srcs "svd_op.cu")
list(REMOVE_ITEM hip_srcs "multinomial_op.cu")
list(REMOVE_ITEM hip_srcs "decode_jpeg_op.cu")
hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cc_srcs} ${miopen_cu_cc_srcs} ${miopen_cu_srcs} ${mkldnn_cc_srcs} ${hip_srcs} DEPS ${op_library_DEPS}
1 change: 1 addition & 0 deletions paddle/fluid/framework/distributed_strategy.proto
@@ -158,6 +158,7 @@ message PipelineConfig {
optional int32 micro_batch_size = 1 [ default = 1 ];
optional int32 accumulate_steps = 2 [ default = 1 ];
optional string schedule_mode = 3 [ default = '1F1B' ];
optional bool p2p_cache_shape = 4 [ default = true ];
}

message TensorParallelConfig {
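Note on the new field above: p2p_cache_shape presumably lets the pipeline runtime cache the tensor shapes exchanged over p2p between stages instead of re-sending them every micro-batch. A hedged C++ sketch of setting it through the protobuf-generated API; the setter names follow standard proto2 C++ codegen, while the header path, namespace, and surrounding setup are assumptions for illustration:

// Hedged sketch: configuring PipelineConfig through the generated setters.
// The include path and namespace are assumed; the setters follow proto2 codegen.
#include "paddle/fluid/framework/distributed_strategy.pb.h"

void ConfigurePipeline(paddle::distributed::PipelineConfig* cfg) {
  cfg->set_micro_batch_size(1);
  cfg->set_accumulate_steps(4);
  cfg->set_schedule_mode("1F1B");
  cfg->set_p2p_cache_shape(true);  // new field; defaults to true when unset
}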
4 changes: 2 additions & 2 deletions paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h
@@ -44,7 +44,7 @@ class Optimizer {
if (w < optimizer_config::min_bound) w = optimizer_config::min_bound;
if (w > optimizer_config::max_bound) w = optimizer_config::max_bound;

add_g2sum = scaled_grad * scaled_grad;
add_g2sum += scaled_grad * scaled_grad;

g2sum += add_g2sum;
}
@@ -64,7 +64,7 @@
w[i] = optimizer_config::mf_min_bound;
if (w[i] > optimizer_config::mf_max_bound)
w[i] = optimizer_config::mf_max_bound;
add_g2sum = scaled_grad * scaled_grad;
add_g2sum += scaled_grad * scaled_grad;
}

g2sum += add_g2sum / n;
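The two hunks above change add_g2sum from assignment to accumulation, so every gradient element contributes to the squared-gradient statistic instead of only the last one written. A standalone sketch of the intended accumulation, with illustrative names rather than the actual HeterPS types:

// Illustrative Adagrad-style accumulation that the '+=' fix restores.
#include <cstddef>

void UpdateG2Sum(const float* scaled_grad, size_t n, float* g2sum) {
  float add_g2sum = 0.0f;
  for (size_t i = 0; i < n; ++i) {
    add_g2sum += scaled_grad[i] * scaled_grad[i];  // accumulate, do not overwrite
  }
  *g2sum += add_g2sum / n;  // mirrors the embedding (mf) branch above
}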
17 changes: 4 additions & 13 deletions paddle/fluid/framework/new_executor/interpretercore.cc
@@ -143,8 +143,7 @@ InterpreterCore::InterpreterCore(const platform::Place& place,
main_program_(main_prog),
global_scope_(global_scope),
d2h_ctx_pool_({place}),
h2d_ctx_pool_({place}),
fetch_context_pool_({place}) {
h2d_ctx_pool_({place}) {
is_build_ = false;

garbages_.reset(new GarbageQueue());
@@ -339,9 +338,6 @@ void InterpreterCore::BuildInstructionCtx(Instruction* instr_node,
new RuntimeInferShapeContext(*op_base, *instr_node->runtime_ctx_.get()));

auto* dev_ctx = instr_node->dev_ctx_;
if (instr_node->kernel_func_.operator_base_->Type() == "fetch_v2") {
dev_ctx = fetch_context_pool_.Get(place);
}
Scope scope;

instr_node->execution_ctx_.reset(new ExecutionContext(
@@ -356,12 +352,6 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
instr_node.kernel_func_.operator_base_)
->InferShape(instr_node.infershape_ctx_.get());

if (instr_node.kernel_func_.operator_base_->Type() == "fetch_v2") {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.Get(place_);
dev_ctx->Wait(); // TODO(wanghuancoder)
}

instr_node.kernel_func_.compute_func_(*instr_node.execution_ctx_.get());
}

@@ -411,8 +401,6 @@ void InterpreterCore::ExecuteInstructionList(
working_var_ref);
}

fetch_context_pool_.Get(place)->Wait();

for (size_t i = 0; i < working_var_ref.size(); ++i) {
if (working_var_ref[i].var_ref_count_ != 0) {
std::cerr << " var ref is not zero " << i << std::endl;
@@ -671,6 +659,9 @@ void InterpreterCore::BuildOpFuncList(const platform::Place& place,
expected_kernel_key);
if (!platform::is_same_place(kernel_type_for_var.place_,
expected_kernel_key.place_)) {
if (op_base->Type() == "fetch_v2") {
op_base->SetAttr("deepcopy", false);
}
// need trans place
// 1. add var in scope
// 2. add copy op
2 changes: 0 additions & 2 deletions paddle/fluid/framework/new_executor/interpretercore.h
@@ -114,8 +114,6 @@ class InterpreterCore {
size_t max_memory_size_;
size_t cur_memory_size_;
std::unique_ptr<WorkQueue> gc_queue_;

platform::DeviceContextPool fetch_context_pool_;
};
} // namespace framework
} // namespace paddle
8 changes: 4 additions & 4 deletions paddle/fluid/framework/operator.cc
@@ -1254,10 +1254,10 @@ void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx,
}
#endif
#ifdef PADDLE_WITH_XPU
if ((kernel_iter == kernels.end() &&
is_xpu_place(expected_kernel_key.place_) &&
!paddle::platform::is_xpu_support_op(type_, expected_kernel_key)) ||
paddle::platform::is_in_xpu_black_list(type_)) {
if (is_xpu_place(expected_kernel_key.place_) &&
(kernel_iter == kernels.end() ||
!paddle::platform::is_xpu_support_op(type_, expected_kernel_key) ||
paddle::platform::is_in_xpu_black_list(type_))) {
VLOG(3) << "missing XPU kernel: " << type_
<< ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!";
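The regrouped condition restricts the CPU fallback to XPU places and lets any one of three checks trigger it: the kernel is missing, the op is not supported by the XPU SDK, or the op is black-listed. Previously a black-listed op could fall back even off XPU, and a missing kernel alone was not enough. A minimal sketch of the new predicate with illustrative names; the same regrouping appears in paddle/fluid/imperative/prepared_operator.cc below:

// Illustrative form of the regrouped XPU-fallback predicate.
bool ShouldFallBackToCPU(bool on_xpu_place, bool kernel_found,
                         bool xpu_supports_op, bool in_xpu_black_list) {
  return on_xpu_place &&
         (!kernel_found || !xpu_supports_op || in_xpu_black_list);
}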
36 changes: 26 additions & 10 deletions paddle/fluid/framework/ps_gpu_trainer.cc
@@ -57,8 +57,6 @@ void PSGPUTrainer::Initialize(const TrainerDesc& trainer_desc,
trainer_desc.downpour_param().stat_var_names(i));
}
VLOG(3) << "going to initialize pull dense worker";
pull_dense_worker_ = PullDenseWorker::GetInstance();
pull_dense_worker_->Initialize(trainer_desc);
SetDebug(trainer_desc.debug());
trainer_desc_ = trainer_desc;
workers_.resize(place_num);
@@ -112,15 +110,21 @@ void PSGPUTrainer::InitTrainerEnv(const ProgramDesc& main_program,
}
}
}
for (auto& var : main_program.Block(0).AllVars()) {
if (var->Persistable()) {
auto it = std::find(need_merge_var_names_.begin(),
need_merge_var_names_.end(), var->Name());
if (it == need_merge_var_names_.end()) {
VLOG(2) << "train param: " << var->Name();
trainable_param_.push_back(var->Name());
}
}
}
place_ = place;
return;
}

void PSGPUTrainer::InitOtherEnv(const ProgramDesc& main_program) {
pull_dense_worker_->SetRootScope(root_scope_);
for (size_t i = 0; i < places_.size(); ++i) {
pull_dense_worker_->AddThreadScope(workers_[i]->GetThreadScope());
}
VLOG(3) << "init other env done.";
}

@@ -141,15 +145,27 @@ Scope* PSGPUTrainer::GetWorkerScope(int thread_id) { return nullptr; }
template <typename T>
void PSGPUTrainer::MergeToRootScope(LoDTensor* root_tensor, LoDTensor* tensor) {
LoDTensor tmp_root;
TensorCopy(*root_tensor, platform::CPUPlace(), &tmp_root);
TensorCopySync(*root_tensor, platform::CPUPlace(), &tmp_root);
T* tmp_root_data = tmp_root.data<T>();
LoDTensor tmp_tensor;
TensorCopy(*tensor, platform::CPUPlace(), &tmp_tensor);
TensorCopySync(*tensor, platform::CPUPlace(), &tmp_tensor);
T* data = tmp_tensor.data<T>();
for (int i = 0; i < tmp_tensor.numel(); i++) {
tmp_root_data[i] += data[i];
}
TensorCopy(tmp_root, platform::CPUPlace(), root_tensor);
TensorCopySync(tmp_root, platform::CPUPlace(), root_tensor);
}

void PSGPUTrainer::MergeDenseParam() {
auto thread_scope = workers_[0]->GetThreadScope();
for (auto& name : trainable_param_) {
VLOG(2) << "merge var " << name << " to root scope";
Variable* root_var = root_scope_->FindVar(name);
LoDTensor* root_tensor = root_var->GetMutable<LoDTensor>();
Variable* var = thread_scope->FindVar(name);
LoDTensor* tensor = var->GetMutable<LoDTensor>();
TensorCopySync((*tensor), root_tensor->place(), root_tensor);
}
}

void PSGPUTrainer::Finalize() {
@@ -187,7 +203,7 @@ void PSGPUTrainer::Finalize() {
_ForEachDataType_(MergeCallback);
}
}
pull_dense_worker_->MergeDenseParam();
MergeDenseParam();
root_scope_->DropKids();
}
} // namespace framework
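Two related changes here: dense-parameter merging moves from PullDenseWorker into the trainer's own MergeDenseParam, and the copies in MergeToRootScope switch to TensorCopySync because the data is read on the host immediately afterwards, so an asynchronous copy could race with that read. A hedged sketch of the host-side reduction once both tensors have been copied to CPU:

// Illustrative host-side accumulation used after synchronous device-to-host copies.
#include <cstdint>

template <typename T>
void AccumulateIntoRoot(T* root_data, const T* thread_data, int64_t numel) {
  for (int64_t i = 0; i < numel; ++i) {
    root_data[i] += thread_data[i];  // sum thread-local values into the root copy
  }
}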
2 changes: 2 additions & 0 deletions paddle/fluid/framework/trainer.h
@@ -265,6 +265,7 @@ class PSGPUTrainer : public TrainerBase {
}
virtual std::string GetDumpPath(int tid) { return ""; }
virtual void InitDumpEnv() {}
virtual void MergeDenseParam();

template <typename T>
void MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor);
@@ -274,6 +275,7 @@
DownpourWorkerParameter param_;
std::map<uint64_t, std::vector<std::string>> dense_grad_names_;
std::vector<std::string> need_merge_var_names_;
std::vector<std::string> trainable_param_;
float scale_datanorm_;
paddle::platform::Place place_;
ProgramDesc program_;
8 changes: 4 additions & 4 deletions paddle/fluid/imperative/prepared_operator.cc
@@ -131,10 +131,10 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
auto& kernels = kernels_iter->second;
auto kernel_iter = kernels.find(expected_kernel_key);
#ifdef PADDLE_WITH_XPU
if ((kernel_iter == kernels.end() &&
is_xpu_place(expected_kernel_key.place_) &&
!paddle::platform::is_xpu_support_op(op.Type(), expected_kernel_key)) ||
paddle::platform::is_in_xpu_black_list(op.Type())) {
if (is_xpu_place(expected_kernel_key.place_) &&
(kernel_iter == kernels.end() ||
!paddle::platform::is_xpu_support_op(op.Type(), expected_kernel_key) ||
paddle::platform::is_in_xpu_black_list(op.Type()))) {
VLOG(3) << "missing XPU kernel: " << op.Type()
<< ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!";
10 changes: 6 additions & 4 deletions paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
@@ -225,6 +225,7 @@ size_t Used<platform::XPUPlace>(const platform::XPUPlace &place) {

// For Ascend NPU
#ifdef PADDLE_WITH_ASCEND_CL
constexpr int EXTRA_PADDING_SIZE = 32;
class NPUBuddyAllocatorList {
private:
NPUBuddyAllocatorList() : devices_(platform::GetSelectedNPUDevices()) {
@@ -257,10 +258,11 @@ class NPUBuddyAllocatorList {

std::call_once(*init_flags_[pos], [this, pos] {
platform::SetNPUDeviceId(devices_[pos]);
allocators_[pos].reset(new BuddyAllocator(
std::unique_ptr<detail::SystemAllocator>(
new detail::NPUAllocator(devices_[pos])),
platform::NPUMinChunkSize(), platform::NPUMaxChunkSize()));
allocators_[pos].reset(
new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
new detail::NPUAllocator(devices_[pos])),
platform::NPUMinChunkSize(),
platform::NPUMaxChunkSize(), EXTRA_PADDING_SIZE));
VLOG(10) << "\n\nNOTE:\n"
<< "You can set GFlags environment variable "
<< "'FLAGS_fraction_of_gpu_memory_to_use' "
12 changes: 9 additions & 3 deletions paddle/fluid/memory/detail/buddy_allocator.cc
@@ -31,9 +31,10 @@ namespace detail {

BuddyAllocator::BuddyAllocator(
std::unique_ptr<SystemAllocator> system_allocator, size_t min_chunk_size,
size_t max_chunk_size)
size_t max_chunk_size, size_t extra_padding_size)
: min_chunk_size_(min_chunk_size),
max_chunk_size_(max_chunk_size),
extra_padding_size_(extra_padding_size),
cache_(system_allocator->UseGpu()),
system_allocator_(std::move(system_allocator)) {}

@@ -59,9 +60,14 @@ inline size_t align(size_t size, size_t alignment) {

void* BuddyAllocator::Alloc(size_t unaligned_size) {
// adjust allocation alignment
size_t size =
align(unaligned_size + sizeof(MemoryBlock::Desc), min_chunk_size_);

size_t size =
align(unaligned_size + sizeof(MemoryBlock::Desc) + extra_padding_size_,
min_chunk_size_);
VLOG(10) << "alloc: " << unaligned_size
<< ", padding for desc: " << sizeof(MemoryBlock::Desc)
<< ", extra padding: " << extra_padding_size_
<< ", alignment: " << min_chunk_size_;
// acquire the allocator lock
std::lock_guard<std::mutex> lock(mutex_);

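BuddyAllocator::Alloc now reserves room for an optional extra_padding_size in addition to the MemoryBlock::Desc header before rounding up to the minimum chunk size; the Ascend NPU allocator above passes 32 bytes for it. A self-contained sketch of the adjusted size computation, with illustrative helper names:

// Illustrative version of the padded size adjustment in BuddyAllocator::Alloc.
#include <cstddef>

inline size_t AlignUp(size_t size, size_t alignment) {
  size_t remainder = size % alignment;
  return remainder == 0 ? size : size + (alignment - remainder);
}

size_t AdjustedAllocSize(size_t unaligned_size, size_t desc_size,
                         size_t extra_padding_size, size_t min_chunk_size) {
  // request + block descriptor + optional padding, rounded up to a chunk multiple
  return AlignUp(unaligned_size + desc_size + extra_padding_size, min_chunk_size);
}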
7 changes: 5 additions & 2 deletions paddle/fluid/memory/detail/buddy_allocator.h
@@ -35,7 +35,8 @@ namespace detail {
class BuddyAllocator {
public:
BuddyAllocator(std::unique_ptr<SystemAllocator> system_allocator,
size_t min_chunk_size, size_t max_chunk_size);
size_t min_chunk_size, size_t max_chunk_size,
size_t extra_padding_size = 0);

~BuddyAllocator();

@@ -86,7 +87,9 @@ class BuddyAllocator {
size_t min_chunk_size_; // the minimum size of each chunk
size_t max_chunk_size_; // the maximum size of each chunk

size_t realloc_size_ = 0; // the size of re-allocated chunk
size_t realloc_size_ = 0; // the size of re-allocated chunk
size_t extra_padding_size_ = 0; // the size of padding to the size of memory
// to alloc, especially used in NPU

private:
/**
23 changes: 15 additions & 8 deletions paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc
@@ -19,6 +19,8 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/npu_op_runner.h"

DECLARE_int32(min_loss_scaling);

namespace paddle {
namespace operators {

@@ -49,7 +51,7 @@ void Update(const platform::NPUDeviceContext& ctx,

std::vector<int> bad_out_data;
TensorToVector(*bad_out_tensor, ctx, &bad_out_data);
if (bad_out_data[0] == decr_every_n_nan_or_inf) {
if (bad_out_data[0] >= decr_every_n_nan_or_inf) {
const auto& runner_p3 = NpuOpRunner("Power", {*pre_loss_scaling_tensor},
{*updated_loss_scaling_tensor},
{{"power", static_cast<float>(1)},
@@ -60,13 +62,18 @@

std::vector<T> new_loss_scaling;
TensorToVector(*updated_loss_scaling_tensor, ctx, &new_loss_scaling);
if (new_loss_scaling[0] < static_cast<T>(1)) {
float min_value = 1.0;
if (FLAGS_min_loss_scaling > 1) {
min_value = static_cast<float>(FLAGS_min_loss_scaling);
}

if (new_loss_scaling[0] < min_value) {
// updated_loss_scaling_data = 1
const auto& runner_p4 = NpuOpRunner("Power", {*pre_loss_scaling_tensor},
{*updated_loss_scaling_tensor},
{{"power", static_cast<float>(1)},
{"scale", static_cast<float>(0)},
{"shift", static_cast<float>(1)}});
const auto& runner_p4 = NpuOpRunner(
"Power", {*pre_loss_scaling_tensor}, {*updated_loss_scaling_tensor},
{{"power", static_cast<float>(1)},
{"scale", static_cast<float>(0)},
{"shift", static_cast<float>(min_value)}});

runner_p4.Run(stream);
}
@@ -93,7 +100,7 @@
std::vector<int> good_out_data;
TensorToVector(*good_out_tensor, ctx, &good_out_data);

if (good_out_data[0] == incr_every_n_steps) {
if (good_out_data[0] >= incr_every_n_steps) {
const auto& runner_p3 = NpuOpRunner("Power", {*pre_loss_scaling_tensor},
{*updated_loss_scaling_tensor},
{{"power", static_cast<float>(1)},
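Three adjustments in this NPU kernel: the bad-step and good-step counters are compared with >= instead of ==, so an overshoot past the threshold can no longer skip the update, and the floor for the loss scaling is taken from the new FLAGS_min_loss_scaling flag rather than being fixed at 1. A hedged host-side sketch of the decrease branch; the real kernel expresses this with NPU Power ops, and the names here are illustrative:

// Illustrative host-side equivalent of the updated decrease branch.
float DecreaseLossScaling(float loss_scaling, int bad_steps,
                          int decr_every_n_nan_or_inf, float decr_ratio,
                          int flags_min_loss_scaling) {
  if (bad_steps >= decr_every_n_nan_or_inf) {  // was '=='
    float min_value = 1.0f;
    if (flags_min_loss_scaling > 1) {          // configurable floor
      min_value = static_cast<float>(flags_min_loss_scaling);
    }
    float updated = loss_scaling * decr_ratio;
    if (updated < min_value) {
      updated = min_value;                     // was reset to a hard-coded 1
    }
    return updated;
  }
  return loss_scaling;
}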
3 changes: 2 additions & 1 deletion paddle/fluid/operators/batch_norm_op.cc
@@ -848,7 +848,8 @@ void BatchNormGradMaker<T>::Apply(GradOpPtr<T> op) const {
}

// used when setting use_global_stats True during training
if (BOOST_GET_CONST(bool, this->GetAttr("use_global_stats"))) {
if (BOOST_GET_CONST(bool, this->GetAttr("use_global_stats")) ||
BOOST_GET_CONST(bool, this->GetAttr("is_test"))) {
op->SetInput("Mean", this->Output("MeanOut"));
op->SetInput("Variance", this->Output("VarianceOut"));
}