
Commit

Merge remote-tracking branch 'upstream/develop' into faster-roi-clean
tsocha committed Dec 15, 2021
2 parents 3f898c0 + 5082249 commit 0ab9a06
Showing 65 changed files with 1,455 additions and 479 deletions.
2 changes: 1 addition & 1 deletion cmake/third_party.cmake
@@ -393,7 +393,7 @@ endif (WIN32)

if (WITH_INFRT)
include(external/llvm)
list(APPEND third_party_deps external_llvm)
list(APPEND third_party_deps ${llvm_libs})
endif()

if (WITH_IPU)
37 changes: 11 additions & 26 deletions paddle/fluid/eager/auto_code_generator/eager_generator.cc
@@ -1174,7 +1174,7 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
FWD_INS_CONTENT_TEMPLATE, input_name, input_name, input_name);
} else {
const char* FWD_INS_CONTENT_TEMPLATE =
" if(%s.initialized()) "
" if(%s.safe_initialized()) "
"ins[\"%s\"] = egr::EagerUtils::SyncToVars(%s)\n;";
generated_function_body += paddle::string::Sprintf(
FWD_INS_CONTENT_TEMPLATE, input_name, input_name, input_name);
@@ -1196,25 +1196,21 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
// in form of shared_ptr<EagerTensor>/vector<shared_ptr<EagerTensor>>
if (output.duplicable()) {
const char* FWD_NUM_ARG_TEMPLATE =
", std::vector<std::shared_ptr<egr::EagerTensor>>& %s";
", std::vector<egr::EagerTensor>& %s";
std::string arg_str =
paddle::string::Sprintf(FWD_NUM_ARG_TEMPLATE, output_var_name);
dygraph_function_args_str += arg_str;

const char* FWD_OUTS_CONTENT_TEMPLATE = "{ \"%s\", %s },";
outs_contents_str += paddle::string::Sprintf(
FWD_OUTS_CONTENT_TEMPLATE, output_name, output_var_name);
} else {
const char* FWD_NUM_ARG_TEMPLATE =
", std::shared_ptr<egr::EagerTensor>& %s";
const char* FWD_NUM_ARG_TEMPLATE = ", egr::EagerTensor& %s";
std::string arg_str =
paddle::string::Sprintf(FWD_NUM_ARG_TEMPLATE, output_var_name);
dygraph_function_args_str += arg_str;

const char* FWD_OUTS_CONTENT_TEMPLATE = "{ \"%s\", {%s} },";
outs_contents_str += paddle::string::Sprintf(
FWD_OUTS_CONTENT_TEMPLATE, output_name, output_var_name);
}
const char* FWD_OUTS_CONTENT_TEMPLATE =
"{ \"%s\", egr::EagerUtils::TrySyncToVars(&%s) },";
outs_contents_str += paddle::string::Sprintf(
FWD_OUTS_CONTENT_TEMPLATE, output_name, output_var_name);

} else {
if (output.duplicable()) {
@@ -1557,22 +1553,11 @@ static std::string GenerateGradNodeCCContents(
"fwd_outputs_name_pos_map"));

size_t grads_position = fwd_outputs_name_pos_map.at(fwd_name);
std::string grad_ptr_name = fwd_name + "_ptrs";
const char* GET_GRADS_PTR_TEMPLATE =
" std::vector<std::shared_ptr<egr::EagerTensor>> %s;\n"
" for(const auto& t : grads[%d]) {\n "
"%s.emplace_back(std::move(std::make_shared<egr::EagerTensor>(t))"
");"
"\n }\n";
std::string grads_ptr_str =
paddle::string::Sprintf(GET_GRADS_PTR_TEMPLATE, grad_ptr_name,
grads_position, grad_ptr_name);
generated_grad_function_body += grads_ptr_str;
generated_grad_function_body += "\n";

const char* GRAD_OUTS_CONTENT_TEMPLATE = "{ \"%s\", %s },";

const char* GRAD_OUTS_CONTENT_TEMPLATE =
"{ \"%s\", egr::EagerUtils::SyncToVars(grads[%d]) },";
outs_contents_str += paddle::string::Sprintf(
GRAD_OUTS_CONTENT_TEMPLATE, grad_output_name, grad_ptr_name);
GRAD_OUTS_CONTENT_TEMPLATE, grad_output_name, grads_position);

} else {
size_t fwd_input_position = fwd_inputs_name_pos_map.at(fwd_name);
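The %s-placeholder templates above are hard to read on their own. Below is a rough sketch of the kind of forward code the generator emits after this change, for a hypothetical op "foo" with one input "X" and one non-duplicable output "Out". The function name, map types, and surrounding boilerplate are illustrative assumptions; only the safe_initialized() guard, the EagerTensor& output parameter, and the SyncToVars/TrySyncToVars calls come from the templates in this diff.

// Illustrative sketch only (not literal generator output): the shape of the
// code implied by the templates above.
static void foo_dygraph_function(const egr::EagerTensor& X,
                                 egr::EagerTensor& Out) {  // was shared_ptr<EagerTensor>&
  std::map<std::string, std::vector<std::shared_ptr<egr::EagerTensor>>> ins;
  // Optional inputs: sync only when either representation already holds data.
  if (X.safe_initialized()) ins["X"] = egr::EagerUtils::SyncToVars(X);

  // Outputs: TrySyncToVars wraps the output tensor and syncs it only if it is
  // already initialized, so passing an empty output no longer aborts here.
  std::map<std::string, std::vector<std::shared_ptr<egr::EagerTensor>>> outs = {
      {"Out", egr::EagerUtils::TrySyncToVars(&Out)},
  };

  // In the generated grad node, gradients are now synced directly by position:
  //   { "<grad_output_name>", egr::EagerUtils::SyncToVars(grads[grads_position]) }
  // instead of first copying each grad into a vector of shared_ptrs.

  // ... attribute handling, op execution, and grad-node wiring elided ...
}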
4 changes: 4 additions & 0 deletions paddle/fluid/eager/eager_tensor.h
@@ -152,6 +152,10 @@ class EagerTensor final {
*/
bool initialized() const { return tensor_->initialized(); }

bool safe_initialized() const {
return initialized() || var_.IsInitialized();
}

/**
* @description: Reset the Tensor implementation
* @param None
24 changes: 24 additions & 0 deletions paddle/fluid/eager/utils.cc
@@ -135,6 +135,30 @@ std::vector<std::shared_ptr<egr::EagerTensor>> EagerUtils::SyncToVars(
return res;
}

static std::shared_ptr<egr::EagerTensor> TrySyncToVar(
egr::EagerTensor* tensor) {
if (tensor->initialized() || tensor->Var().IsInitialized()) {
tensor->SyncToVar(paddle::framework::proto::VarType_Type_LOD_TENSOR);
}
return std::make_shared<EagerTensor>(*tensor);
}

std::vector<std::shared_ptr<egr::EagerTensor>> EagerUtils::TrySyncToVars(
egr::EagerTensor* tensor) {
return {TrySyncToVar(tensor)};
}

std::vector<std::shared_ptr<egr::EagerTensor>> EagerUtils::TrySyncToVars(
std::vector<egr::EagerTensor>* tensors) {
std::vector<std::shared_ptr<EagerTensor>> res;
size_t num = tensors->size();
res.reserve(num);
for (size_t i = 0; i < num; i++) {
res.emplace_back(TrySyncToVar(&(*tensors)[i]));
}
return res;
}

/* ---- VarBase -> Tensor ---- */
std::vector<std::shared_ptr<egr::EagerTensor>> EagerUtils::SyncToTensors(
const egr::EagerTensor& tensor) {
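A minimal usage sketch of the new TrySyncToVars helpers, assuming call sites like the generated forward functions above and assuming EagerTensor is default-constructible at the call site; the variable names are made up for illustration.

// Illustrative usage only; names are hypothetical.
egr::EagerTensor out;                                  // output not written yet
auto wrapped = egr::EagerUtils::TrySyncToVars(&out);   // wrapped.size() == 1
// 'out' is only wrapped in a shared_ptr: neither its pten impl nor its legacy
// Var holds data, so no SyncToVar(LOD_TENSOR) is performed and nothing throws.

std::vector<egr::EagerTensor> outs(2);                 // e.g. a duplicable output
auto wrapped_vec = egr::EagerUtils::TrySyncToVars(&outs);
// One shared_ptr per element; elements that are already initialized are first
// synced to a LoDTensor-backed Var, uninitialized ones are passed through as-is.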
5 changes: 5 additions & 0 deletions paddle/fluid/eager/utils.h
@@ -145,6 +145,11 @@ class EagerUtils {
const std::shared_ptr<GradNodeBase>& grad_node);

// Intermidate needed remove this once we don't need legacy
static std::vector<std::shared_ptr<egr::EagerTensor>> TrySyncToVars(
egr::EagerTensor* tensor);
static std::vector<std::shared_ptr<egr::EagerTensor>> TrySyncToVars(
std::vector<egr::EagerTensor>* tensors);

static std::vector<std::shared_ptr<egr::EagerTensor>> SyncToVars(
const egr::EagerTensor& tensor);
static std::vector<std::shared_ptr<egr::EagerTensor>> SyncToVars(
14 changes: 7 additions & 7 deletions paddle/fluid/framework/custom_operator.cc
@@ -110,8 +110,8 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx,
const std::vector<std::string>& outputs,
const std::vector<std::string>& attrs) {
VLOG(1) << "Custom Operator: Start run KernelFunc.";
std::vector<paddle::Tensor> custom_ins;
std::vector<std::vector<paddle::Tensor>> custom_vec_ins;
std::vector<paddle::experimental::Tensor> custom_ins;
std::vector<std::vector<paddle::experimental::Tensor>> custom_vec_ins;
for (auto& in_name : inputs) {
VLOG(1) << "Custom Operator: input name - " << in_name;
if (detail::IsDuplicableVar(in_name)) {
@@ -120,7 +120,7 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx,
PADDLE_ENFORCE_NE(vec_x.empty(), true,
platform::errors::NotFound(
"Input vector<tensor> (%s) is empty.", in_name));
std::vector<paddle::Tensor> custom_vec_in;
std::vector<paddle::experimental::Tensor> custom_vec_in;
for (size_t i = 0; i < vec_x.size(); ++i) {
auto* x = vec_x[i];
PADDLE_ENFORCE_NOT_NULL(
@@ -132,7 +132,7 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx,
"The %d-th tensor in input vector<tensor> (%s) "
"is not initialized.",
i, in_name));
paddle::Tensor custom_t;
paddle::experimental::Tensor custom_t;
custom_t.set_impl(std::move(experimental::MakePtenDenseTensor(*x)));
custom_vec_in.emplace_back(custom_t);
}
@@ -144,7 +144,7 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx,
PADDLE_ENFORCE_EQ(x->IsInitialized(), true,
platform::errors::InvalidArgument(
"Input tensor (%s) is not initialized.", in_name));
paddle::Tensor custom_in;
paddle::experimental::Tensor custom_in;
custom_in.set_impl(std::move(experimental::MakePtenDenseTensor(*x)));
custom_ins.emplace_back(custom_in);
}
@@ -207,14 +207,14 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx,
"Tensors.",
vec_true_outs.size(), outs.size()));
for (size_t j = 0; j < vec_true_outs.size(); ++j) {
experimental::MovesStorage(
experimental::MovesSharedStorage(
std::dynamic_pointer_cast<pten::DenseTensor>(outs.at(j).impl())
.get(),
vec_true_outs.at(j));
}
} else {
auto* true_out = ctx.Output<Tensor>(out_name);
experimental::MovesStorage(
experimental::MovesSharedStorage(
std::dynamic_pointer_cast<pten::DenseTensor>(outs.at(i).impl())
.get(),
true_out);
17 changes: 17 additions & 0 deletions paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -2412,6 +2412,23 @@ PDNode *patterns::OrphanedBfloat16::operator()() {
return next_op;
}

PDNode *patterns::UnsupportedBfloat16::operator()() {
auto *prev_op = pattern->NewNode(prev_op_repr())->assert_is_op();
prev_op->assert_more([&](Node *node) {
return node->Op()->HasAttr("mkldnn_data_type") == false;
});
auto *prev_out = pattern->NewNode(prev_out_repr())->AsOutput();

auto *op = pattern->NewNode(op_repr())->assert_is_op();
op->assert_more([&](Node *node) {
return node->Op()->GetAttrIfExists<std::string>("mkldnn_data_type") ==
"bfloat16";
});
prev_op->LinksTo({prev_out});
op->LinksFrom({prev_out});
return op;
}

PDNode *patterns::LastBfloat16Ops::operator()() {
auto *op = pattern->NewNode(op_repr())->assert_is_op();
op->assert_more([&](Node *node) {
10 changes: 10 additions & 0 deletions paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -1416,6 +1416,16 @@ struct OrphanedBfloat16 : public PatternBase {
PATTERN_DECL_NODE(next_op);
};

struct UnsupportedBfloat16 : public PatternBase {
UnsupportedBfloat16(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "unsupported_bfloat16") {}
PDNode* operator()();

PATTERN_DECL_NODE(prev_op);
PATTERN_DECL_NODE(prev_out);
PATTERN_DECL_NODE(op);
};

struct LastBfloat16Ops : public PatternBase {
LastBfloat16Ops(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "last_bfloat16_ops") {}
4 changes: 1 addition & 3 deletions paddle/fluid/framework/ir/ipu/infer_shape_pass.cc
@@ -13,14 +13,12 @@
// limitations under the License.

#include "paddle/fluid/framework/ir/ipu/infer_shape_pass.h"

#include "paddle/fluid/platform/device/ipu/ipu_backend.h"

#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/platform/device/ipu/ipu_backend.h"

namespace paddle {
namespace framework {
@@ -59,7 +59,7 @@ ConvConcatReLUFusePass::ConvConcatReLUFusePass() {
.IsType<std::vector<int>>()
.End()
.AddAttr("data_format")
.IsStringIn({"NCHW", "NHWC", "AnyLayout"})
.IsStringIn({"NCHW"})
.End();

AddOpCompat(OpCompat("concat"))
21 changes: 21 additions & 0 deletions paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc
@@ -71,10 +71,31 @@ void CPUBfloat16PlacementPass::RemoveOrphanedOperators(
gpd(graph, handler);
}

void CPUBfloat16PlacementPass::RemoveUnsupportedOperators(
ir::Graph* graph, int* bfloat16_operators) const {
// now quantize is supported FP32 only, so try to find
// bfloat16 operator that input type is not FP32
GraphPatternDetector gpd;
patterns::UnsupportedBfloat16 unsupported_bfloat16_pattern{
gpd.mutable_pattern(), "unsupported_bfloat16"};
unsupported_bfloat16_pattern();
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
GET_IR_NODE_FROM_SUBGRAPH(prev_out, prev_out, unsupported_bfloat16_pattern);
GET_IR_NODE_FROM_SUBGRAPH(op, op, unsupported_bfloat16_pattern);
if ((prev_out->Var()->GetDataType() != proto::VarType::FP32)) {
op->Op()->SetAttr("mkldnn_data_type", std::string("float32"));
bfloat16_operators--;
}
};
gpd(graph, handler);
}

void CPUBfloat16PlacementPass::ApplyImpl(ir::Graph* graph) const {
int bfloat16_operators = 0;
SetMkldnnDataType(graph, &bfloat16_operators);
RemoveOrphanedOperators(graph, &bfloat16_operators);
RemoveUnsupportedOperators(graph, &bfloat16_operators);
PrettyLogDetail("--- marked %d operators to bfloat16 ",
bfloat16_operators);
}
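Combined with the UnsupportedBfloat16 pattern added in graph_pattern_detector.cc above, the check introduced by RemoveUnsupportedOperators can be summarised roughly as follows. This is a simplified sketch, not the actual pass code: the helper name is made up, and the real pass expresses the same logic through the pattern and its GraphPatternDetector handler.

// Simplified sketch (illustrative only) of the new placement rule.
static void MaybeDemoteToFloat32(Node* prev_op, Node* prev_out, Node* op) {
  // The pattern only matches when the producer has no mkldnn_data_type
  // attribute and the consumer is already marked bfloat16.
  bool matches =
      !prev_op->Op()->HasAttr("mkldnn_data_type") &&
      op->Op()->GetAttrIfExists<std::string>("mkldnn_data_type") == "bfloat16";
  // The handler then demotes the consumer when the intermediate tensor is not
  // FP32, since bfloat16 quantization currently expects FP32 inputs.
  if (matches && prev_out->Var()->GetDataType() != proto::VarType::FP32) {
    op->Op()->SetAttr("mkldnn_data_type", std::string("float32"));
  }
}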
@@ -30,6 +30,9 @@ class CPUBfloat16PlacementPass : public Pass {

void RemoveOrphanedOperators(ir::Graph* graph, int* bfloat16_operators) const;

void RemoveUnsupportedOperators(ir::Graph* graph,
int* bfloat16_operators) const;

void ApplyImpl(ir::Graph* graph) const override;
};

54 changes: 53 additions & 1 deletion paddle/fluid/framework/new_executor/CMakeLists.txt
@@ -12,4 +12,56 @@ cc_library(stream_analyzer SRCS stream_analyzer.cc DEPS ${DEVICE_EVENT_LIBS} glo
cc_library(interpretercore SRCS interpretercore.cc DEPS workqueue ${DEVICE_EVENT_LIBS} interpretercore_util interpretercore_garbage_collector stream_analyzer event_manager)
cc_library(standalone_executor SRCS standalone_executor.cc DEPS interpretercore)
cc_test(workqueue_test SRCS workqueue_test.cc DEPS workqueue)
# cc_binary(standalone_executor_test SRCS standalone_executor_test.cc DEPS interpretercore standalone_executor operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler)

# skip win32 since wget is not installed by default on windows machine.
# skip COVERAGE_CI since the test runs slowly because of instrumentation.
if (WITH_TESTING AND NOT WIN32 AND NOT WITH_COVERAGE)
add_custom_target(
download_program
COMMAND wget -nc https://paddle-ci.gz.bcebos.com/new_exec/lm_main_program
COMMAND wget -nc https://paddle-ci.gz.bcebos.com/new_exec/lm_startup_program
)

# all operators used in the program
set(OPS
fill_constant_op
uniform_random_op
lookup_table_op
transpose_op
reshape_op
split_op
slice_op
concat_op
matmul_op
elementwise_add_op
elementwise_mul_op
softmax_with_cross_entropy_op
reduce_mean_op
reduce_sum_op
activation_op
sum_op
elementwise_max_op
elementwise_div_op
sgd_op
squared_l2_norm_op
memcpy_h2d_op
memcpy_d2h_op)

# All deps of the operators above, part of GLOB_OPERATOR_DEPS.
set(OP_DEPS
generator
softmax
selected_rows_functor
jit_kernel_helper
concat_and_split
cross_entropy)

cc_test(standalone_executor_test SRCS standalone_executor_test.cc DEPS interpretercore standalone_executor operator op_registry executor ${OPS} ${OP_DEPS})
set_tests_properties(standalone_executor_test PROPERTIES TIMEOUT 100)

add_dependencies(standalone_executor_test download_program)
if (WITH_PROFILER)
target_link_libraries(standalone_executor_test profiler)
add_dependencies(standalone_executor_test profiler)
endif()
endif()

1 comment on commit 0ab9a06

@paddle-bot-old

Congratulations! Your pull request passed all required CI checks. You can ask the reviewer(s) to approve and merge. 🎉
