From c0324e82ec6f8876f48bb59f47eafcd4fe3d6254 Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Tue, 20 Sep 2022 10:22:33 +0800 Subject: [PATCH] [Cherry-pick] Update layoutautotune for inplace (#45826) (#46226) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cherry-pick from #45826 LayoutAutotune 支持 inplace 类型的OP 根据 Add eager layout autotune #45409 修改意见调整UseAutotune 将LayoutAutotune判断放到controller中,与AMP 判断保持一致 --- .../forwards/conv2d_fwd_function.cc | 23 +- paddle/fluid/eager/api/utils/global_utils.h | 17 ++ .../generator/eager_gen.py | 198 ++++++++++-------- paddle/fluid/eager/eager_layout_auto_tune.h | 198 ++++++++++-------- paddle/fluid/eager/eager_layout_transformer.h | 185 ++++++++-------- paddle/fluid/imperative/layout_autotune.cc | 36 ++-- paddle/fluid/imperative/layout_autotune.h | 28 ++- paddle/fluid/imperative/layout_transformer.h | 20 +- paddle/fluid/imperative/tracer.cc | 2 + paddle/fluid/imperative/tracer.h | 18 +- paddle/fluid/pybind/pybind.cc | 17 +- paddle/phi/api/lib/data_transform.cc | 16 +- .../tests/unittests/test_layout_autotune.py | 7 + python/paddle/nn/functional/conv.py | 4 +- 14 files changed, 443 insertions(+), 326 deletions(-) diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc index 5e221d3f07f6b..d5f15883e0e19 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc +++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc @@ -51,17 +51,17 @@ paddle::experimental::Tensor conv2d_ad_func( auto amp_dst_dtype = egr::GetAmpDestDtype(op_name, amp_tensors_vector); - auto NEW_input = + auto new_input = egr::EagerAmpAutoCast("input", input, amp_dst_dtype, op_name); - auto NEW_filter = + auto new_filter = egr::EagerAmpAutoCast("filter", filter, amp_dst_dtype, op_name); { paddle::imperative::AutoCastGuard guard( egr::Controller::Instance().GetCurrentTracer(), paddle::imperative::AmpLevel::O0); - return conv2d_ad_func(NEW_input, - NEW_filter, + return conv2d_ad_func(new_input, + new_filter, strides, paddings, paddding_algorithm, @@ -76,7 +76,7 @@ paddle::experimental::Tensor conv2d_ad_func( // Layout autotune - if (paddle::imperative::LayoutAutoTune::Instance().UseLayoutAutoTune()) { + if (egr::Controller::Instance().UseLayoutAutoTune()) { VLOG(5) << "Check and Prepare For LAYOUT"; paddle::small_vector, egr::kSlotSmallVectorSize> @@ -85,11 +85,10 @@ paddle::experimental::Tensor conv2d_ad_func( auto op_name = phi::TransToFluidOpName("conv2d"); auto transformer = egr::EagerLayoutAutotune( op_name, tensors_vector, &data_format); - auto NEW_input = transformer->TransInTensor("input", input); - bool is_enable_tune = - paddle::imperative::LayoutAutoTune::Instance().UseLayoutAutoTune(); - paddle::imperative::LayoutAutoTune::Instance().DisableLayoutAutoTune(); - auto out = conv2d_ad_func(NEW_input, + auto new_input = transformer->TransInTensor("input", input); + bool need_tune = egr::Controller::Instance().UseLayoutAutoTune(); + egr::Controller::Instance().DisableLayoutAutoTune(); + auto out = conv2d_ad_func(new_input, filter, strides, paddings, @@ -101,8 +100,8 @@ paddle::experimental::Tensor conv2d_ad_func( workspace_size_MB, exhaustive_search); transformer->SetOutTensorLayout(&out); - if (is_enable_tune) { - paddle::imperative::LayoutAutoTune::Instance().EnableLayoutAutoTune(); + if (need_tune) { + egr::Controller::Instance().EnableLayoutAutoTune(); } // Returns return out; diff --git a/paddle/fluid/eager/api/utils/global_utils.h b/paddle/fluid/eager/api/utils/global_utils.h index 93149feeae311..7defffa18e0f7 100644 --- a/paddle/fluid/eager/api/utils/global_utils.h +++ b/paddle/fluid/eager/api/utils/global_utils.h @@ -55,6 +55,23 @@ class Controller { paddle::imperative::AmpLevel GetAMPLevel() const { return tracer_->GetAmpLevel(); } + + bool UseLayoutAutoTune() { + bool use_autotune = false; +#if defined(PADDLE_WITH_CUDA) + auto place = tracer_->ExpectedPlace(); + bool is_gpu_place = paddle::platform::is_gpu_place(place); + if (is_gpu_place) { + use_autotune = tracer_->UseLayoutAutoTune(); + } +#endif + return use_autotune; + } + + void DisableLayoutAutoTune() { tracer_->DisableLayoutAutoTune(); } + + void EnableLayoutAutoTune() { tracer_->EnableLayoutAutoTune(); } + bool HasGrad() const { return tracer_->HasGrad(); } void SetHasGrad(bool has_grad) { tracer_->SetHasGrad(has_grad); } std::string GenerateUniqueName(std::string key = "eager_in_tmp") { diff --git a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py index d6375b5aff306..0f51495fc6bea 100644 --- a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py @@ -421,15 +421,14 @@ class {} : public egr::GradNodeBase {{ """ LAYOUT_LOGIC_TEMPLATE=\ """ - if (paddle::imperative::LayoutAutoTune::Instance().UseLayoutAutoTune()) {{ - VLOG(5) << "Check and Prepare For LAYOUT"; + if (egr::Controller::Instance().UseLayoutAutoTune()) {{ paddle::small_vector, egr::kSlotSmallVectorSize> tensors_vector = {}; {} {} - paddle::imperative::LayoutAutoTune::Instance().DisableLayoutAutoTune(); + VLOG(5) << "Check and Prepare For LAYOUT "<< op_name; + paddle::imperative::LayoutAutotuneGuard guard(egr::Controller::Instance().GetCurrentTracer(), false); {} {} - paddle::imperative::LayoutAutoTune::Instance().EnableLayoutAutoTune(); // Returns return {}; }} @@ -906,6 +905,7 @@ def GenerateNodeCreationCodes(self, for_backward=False): set_grad_in_meta = f"{indent}grad_node->SetGradInMeta({name}, {pos});" set_retain_grad = f"{indent}egr::EagerUtils::CheckAndRetainGrad({name});" + set_out_rank_list.append(set_out_rank) set_history_list.append(set_history) set_grad_in_meta_list.append(set_grad_in_meta) @@ -998,6 +998,98 @@ def __init__(self, forward_api_contents, grad_api_contents, self.forward_definition_str = "" self.forward_declaration_str = "" + def GenerateForwardLayoutAutotune(self, forward_api_name, + amp_tensors_vector_list, + layout_tensors_vector_optional_list, + layout_autotune_list_str, + returns_type_str, returns_str, + amp_inputs_call_args_str): + intermediate_outputs = self.intermediate_outputs + forward_attrs_list = self.forward_attrs_list + forward_outputs_position_map = self.forward_outputs_position_map + num_outputs = len( + forward_outputs_position_map.keys()) - len(intermediate_outputs) + # for layout autotune attr + lightly_sensitive_attr = [ + 'axis', 'axes', 'dim', 'dims', 'start', 'end', 'stop' + ] + heavily_sensitive_attr = ['data_format', 'data_layout'] + layout_autotune_attr = [] + layout_autotune_attr_code_list = [] + layout_autotune_attr_type_list = [] + layout_autotune_attr_code_list.append( + f"auto op_name = phi::TransToFluidOpName(\"{forward_api_name}\");\n" + ) + + lightly_flag = False + heavily_flag = False + for name, atype, default_val, pos in forward_attrs_list: + for attr_name in lightly_sensitive_attr: + if name.find(attr_name) != -1 and (name + not in layout_autotune_attr): + lightly_flag = True + layout_autotune_attr.append(name) + layout_autotune_attr_type_list.append(atype) + if lightly_flag is False: + for attr_name in heavily_sensitive_attr: + if name.find(attr_name) != -1 and ( + name not in layout_autotune_attr): + layout_autotune_attr.append(name) + layout_autotune_attr_type_list.append(atype) + heavily_flag = True + if len(layout_autotune_attr) == 0: + layout_autotune_attr_code_list.append( + f"auto transformer = egr::EagerLayoutAutotune(op_name, tensors_vector);\n" + ) + elif len(layout_autotune_attr) == 1: + layout_autotune_attr_code_list.append( + f"auto transformer = egr::EagerLayoutAutotune<{layout_autotune_attr_type_list[0]}>(op_name, tensors_vector, &{layout_autotune_attr[0]});\n" + ) + elif len(layout_autotune_attr) == 2: + layout_autotune_attr_code_list.append( + f"auto transformer = egr::EagerLayoutAutotune<{layout_autotune_attr_type_list[0]}, {layout_autotune_attr_type_list[1]}>(op_name, tensors_vector, &{layout_autotune_attr[0]}, &{layout_autotune_attr[1]});\n" + ) + else: + layout_autotune_attr_code_list.append( + f"auto transformer = egr::EagerLayoutAutotune<{layout_autotune_attr_type_list[0]}>(op_name, tensors_vector,&{layout_autotune_attr[0]});\n" + ) + # Out tensor + layout_inputs_call_args_str = amp_inputs_call_args_str + forward_function_name = GetDygraphForwardFunctionName(forward_api_name) + layout_tmp_result_list = [] + layout_autotune_outs_list = [] + result_name = "api_result" + if num_outputs == 1: + result_name = returns_str + layout_autotune_outs_list.append( + f"transformer -> SetOutTensorLayout(&{returns_str});\n") + else: + for name, (rtype, pos) in forward_outputs_position_map.items(): + if name in intermediate_outputs: + continue + layout_autotune_outs_list.append( + f" auto& {name} = std::get<{len(layout_tmp_result_list)}>(api_result);\n" + ) + layout_autotune_outs_list.append( + f" transformer -> SetOutTensorLayout(&{name});\n") + layout_tmp_result_list.append(f"{name}") + + tensors_vector_list_str = "{ " + ",".join( + amp_tensors_vector_list) + " }" + + if len(amp_tensors_vector_list) == 0: + layout_logic_str = "" + else: + after_call_str = f"{returns_type_str} {result_name} = {forward_function_name}({layout_inputs_call_args_str});\n" + layout_logic_str = LAYOUT_LOGIC_TEMPLATE.format( + tensors_vector_list_str, + " ".join(layout_tensors_vector_optional_list), + " ".join(layout_autotune_attr_code_list) + " " + + layout_autotune_list_str, after_call_str, + " ".join(layout_autotune_outs_list), returns_str) + + return layout_logic_str + def GenerateForwardDefinitionAndDeclaration(self, is_inplaced): namespace = self.namespace if self.forward_api_name[-1] == '_' and not is_inplaced: @@ -1033,7 +1125,7 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced): layout_tensors_vector_optional_list = [] for name, (ttype, pos) in forward_inputs_position_map.items(): inputs_call_list[pos] = f"{name}" - amp_inputs_call_list[pos] = f"NEW_{name}" + amp_inputs_call_list[pos] = f"new_{name}" is_optional = (name in optional_inputs) if IsPlainTensorType(ttype): if is_optional: @@ -1046,13 +1138,13 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced): f"if ({name}) amp_tensors_vector.push_back({{ *{name} }});\n" ) amp_autocast_optional_list.append( - f"auto NEW_{name} = egr::EagerAmpAutoCast(\"{name}\", {name}, amp_dst_dtype, op_name);\n" + f"auto new_{name} = egr::EagerAmpAutoCast(\"{name}\", {name}, amp_dst_dtype, op_name);\n" ) layout_tensors_vector_optional_list.append( f"if ({name}) tensors_vector.push_back({{ *{name} }});\n" ) layout_autotune_optional_list.append( - f"auto NEW_{name} = transformer->TransInTensor(\"{name}\", {name});\n" + f"auto new_{name} = transformer->TransInTensor(\"{name}\", {name});\n" ) else: if is_inplaced and forward_inplace_map and name in forward_inplace_map.keys( @@ -1060,16 +1152,16 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced): arg_str = f"paddle::experimental::Tensor& {name}" amp_tensors_vector_list.append(f"{{{name}}}") amp_autocast_list.append( - f"auto NEW_{name} = egr::EagerAmpAutoCast(\"{name}\", {name}, amp_dst_dtype, op_name);\n" + f"auto new_{name} = egr::EagerAmpAutoCast(\"{name}\", {name}, amp_dst_dtype, op_name);\n" ) else: arg_str = f"const paddle::experimental::Tensor& {name}" amp_tensors_vector_list.append(f"{{{name}}}") amp_autocast_list.append( - f"auto NEW_{name} = egr::EagerAmpAutoCast(\"{name}\", {name}, amp_dst_dtype, op_name);\n" + f"auto new_{name} = egr::EagerAmpAutoCast(\"{name}\", {name}, amp_dst_dtype, op_name);\n" ) layout_autotune_list.append( - f"auto NEW_{name} = transformer->TransInTensor(\"{name}\", {name});\n" + f"auto new_{name} = transformer->TransInTensor(\"{name}\", {name});\n" ) else: assert IsVectorTensorType(ttype) @@ -1083,10 +1175,10 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced): f"if ({name}) amp_tensors_vector.push_back( *{name} );\n" ) amp_autocast_optional_list.append( - f"auto NEW_{name} = egr::EagerAmpAutoCasts(\"{name}\", {name}, amp_dst_dtype, op_name);\n" + f"auto new_{name} = egr::EagerAmpAutoCasts(\"{name}\", {name}, amp_dst_dtype, op_name);\n" ) layout_autotune_optional_list.append( - f"auto NEW_{name} = transformer->TransInTensor(\"{name}\", {name});\n" + f"auto new_{name} = transformer->TransInTensors(\"{name}\", {name});\n" ) else: if is_inplaced and forward_inplace_map and name in forward_inplace_map.keys( @@ -1096,60 +1188,15 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced): arg_str = f"const std::vector& {name}" amp_tensors_vector_list.append(f"{name}") amp_autocast_list.append( - f"auto NEW_{name} = egr::EagerAmpAutoCasts(\"{name}\", {name}, amp_dst_dtype, op_name);\n" + f"auto new_{name} = egr::EagerAmpAutoCasts(\"{name}\", {name}, amp_dst_dtype, op_name);\n" ) layout_autotune_list.append( - f"auto NEW_{name} = transformer->TransInTensor(\"{name}\", {name});\n" + f"auto new_{name} = transformer->TransInTensors(\"{name}\", {name});\n" ) inputs_args_definition_list[pos] = arg_str inputs_args_declaration_list[pos] = arg_str - # for layout autotune attr - lightly_sensitive_attr = [ - 'axis', 'axes', 'dim', 'dims', 'start', 'end', 'stop' - ] - heavily_sensitive_attr = ['data_format', 'data_layout'] - layout_autotune_attr = [] - layout_autotune_attr_code_list = [] - layout_autotune_attr_type_list = [] - layout_autotune_attr_code_list.append( - f"auto op_name = phi::TransToFluidOpName(\"{forward_api_name}\");\n" - ) - - lightly_flag = False - heavily_flag = False - for name, atype, default_val, pos in forward_attrs_list: - for attr_name in lightly_sensitive_attr: - if name.find( - attr_name) != -1 and name not in layout_autotune_attr: - lightly_flag = True - layout_autotune_attr.append(name) - layout_autotune_attr_type_list.append(atype) - if lightly_flag is False: - for attr_name in heavily_sensitive_attr: - if name.find(attr_name - ) != -1 and name not in layout_autotune_attr: - layout_autotune_attr.append(name) - layout_autotune_attr_type_list.append(atype) - heavily_flag = True - if len(layout_autotune_attr) == 0: - layout_autotune_attr_code_list.append( - f"auto transformer = egr::EagerLayoutAutotune(op_name, tensors_vector);\n" - ) - elif len(layout_autotune_attr) == 1: - layout_autotune_attr_code_list.append( - f"auto transformer = egr::EagerLayoutAutotune<{layout_autotune_attr_type_list[0]}>(op_name, tensors_vector, &{layout_autotune_attr[0]});\n" - ) - elif len(layout_autotune_attr) == 2: - layout_autotune_attr_code_list.append( - f"auto transformer = egr::EagerLayoutAutotune<{layout_autotune_attr_type_list[0]}, {layout_autotune_attr_type_list[1]}>(op_name, tensors_vector, &{layout_autotune_attr[0]}, &{layout_autotune_attr[1]});\n" - ) - else: - layout_autotune_attr_code_list.append( - f"auto transformer = egr::EagerLayoutAutotune(op_name, tensors_vector, {len(layout_autotune_attr)});\n" - ) - # forward attrs for name, atype, default_val, pos in forward_attrs_list: inputs_call_list[pos] = name @@ -1339,33 +1386,12 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced): amp_autocast_list_str, amp_call_str) # Forward layout autotune - layout_inputs_call_args_str = amp_inputs_call_args_str - layout_tmp_result_list = [] - layout_autotune_outs_list = "" - if num_outputs == 1: - layout_autotune_outs_list += f"{indent}auto {returns_str} = api_result;\n" - layout_autotune_outs_list += f"{indent}transformer -> SetOutTensorLayout(&{returns_str});\n" - else: - for name, (rtype, pos) in forward_outputs_position_map.items(): - if name in intermediate_outputs: - continue - layout_autotune_outs_list += f"{indent}auto& {name} = std::get<{len(layout_tmp_result_list)}>(api_result);\n" - layout_autotune_outs_list += f"{indent}transformer -> SetOutTensorLayout(&{name});\n" - layout_tmp_result_list.append(f"{name}") - - if returns_type_str == "paddle::experimental::Tensor&" or forward_api_name == "slice" or forward_api_name == "strided_slice" or len( - layout_autotune_attr) == 0: - layout_logic_str = "" - else: - # after_call_str = f"return {forward_ad_function_name}({layout_inputs_call_args_str});\n" - after_call_str = f"auto api_result = {forward_ad_function_name}({layout_inputs_call_args_str});\n" - layout_logic_str = LAYOUT_LOGIC_TEMPLATE.format( - amp_tensors_vector_list_str, - " ".join(layout_tensors_vector_optional_list), - " ".join(layout_autotune_attr_code_list) + " " + - " ".join(layout_autotune_list) + - " ".join(layout_autotune_optional_list), after_call_str, - layout_autotune_outs_list, returns_str) + layout_autotune_list_str = " ".join( + layout_autotune_list) + " ".join(layout_autotune_optional_list) + layout_logic_str = self.GenerateForwardLayoutAutotune( + forward_api_name, amp_tensors_vector_list, + layout_tensors_vector_optional_list, layout_autotune_list_str, + returns_type_str, returns_str, amp_inputs_call_args_str) # For inputs outputs prepare for logging var_str = f"\n{indent} std::string input_str = \"\";" diff --git a/paddle/fluid/eager/eager_layout_auto_tune.h b/paddle/fluid/eager/eager_layout_auto_tune.h index eebdd9caa6d5c..5670275e2b7dd 100644 --- a/paddle/fluid/eager/eager_layout_auto_tune.h +++ b/paddle/fluid/eager/eager_layout_auto_tune.h @@ -19,20 +19,65 @@ #include "paddle/fluid/imperative/layout_autotune.h" #include "paddle/phi/backends/gpu/gpu_info.h" namespace egr { - -// layout_agnostic_ops_ -// For agnostic op like add / relu -inline std::shared_ptr EagerLayoutAutotune( +inline bool NeedTransLayout( + const paddle::small_vector, + kSlotSmallVectorSize>& tensors_vector, + const paddle::experimental::DataLayout& layout) { + for (size_t i = 0; i < tensors_vector.size(); i++) { + for (size_t idx = 0; idx < tensors_vector[0].size(); idx++) { + if (layout != tensors_vector[i][idx].layout()) { + return true; + } + } + } + return false; +} +inline std::shared_ptr BaseTransformer( const std::string& op_name, const paddle::small_vector, kSlotSmallVectorSize>& tensors_vector) { - VLOG(3) << " Optimze Layout agnostic op: " << op_name; std::shared_ptr transposer = nullptr; - transposer = - std::make_shared(op_name, tensors_vector); + bool unstart = + (paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout() == + paddle::experimental::DataLayout::UNDEFINED); + auto first_layout = tensors_vector[0][0].layout(); + VLOG(3) << "Layout autotune was is start ? " << (!unstart) << op_name + << "'s layout is " << first_layout; + + transposer = std::make_shared( + op_name, tensors_vector, first_layout); return transposer; } +// For agnostic op like add, relu, exp +inline std::shared_ptr EagerLayoutAutotune( + const std::string& op_name, + const paddle::small_vector, + kSlotSmallVectorSize>& tensors_vector) { + auto desired_layout = + paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout(); + auto default_layout = + paddle::imperative::LayoutAutoTune::Instance().GetDefaultLayout(); + auto first_layout = tensors_vector[0][0].layout(); + if (NeedTransLayout(tensors_vector, first_layout)) { + bool need_trans_back = false; + for (size_t i = 0; i < tensors_vector.size(); i++) { + for (size_t idx = 0; idx < tensors_vector[0].size(); idx++) { + if (4 != tensors_vector[i][idx].shape().size()) { + need_trans_back = true; + VLOG(3) << "Agnostic op " << op_name << " shape is " + << tensors_vector[i][idx].shape().size() << " and layout is " + << tensors_vector[i][idx].layout(); + } + } + } + auto final_layout = need_trans_back ? default_layout : desired_layout; + return std::make_shared( + op_name, tensors_vector, final_layout); + } + return BaseTransformer(op_name, tensors_vector); +} + // For lightly op like reduce template inline std::shared_ptr EagerLayoutAutotune( @@ -40,16 +85,11 @@ inline std::shared_ptr EagerLayoutAutotune( const paddle::small_vector, kSlotSmallVectorSize>& tensors_vector, T* attr) { + VLOG(3) << "Lightly op " << op_name << "'s shape is " + << tensors_vector[0][0].shape().size() << " and layout is " + << tensors_vector[0][0].layout(); + std::shared_ptr transposer = nullptr; - bool unstart = - (paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout() == - paddle::experimental::DataLayout::UNDEFINED); - if (unstart) { - VLOG(3) << "Optimze Layout was not started" << op_name; - transposer = - std::make_shared(op_name, tensors_vector); - return transposer; - } transposer = std::make_shared(op_name); return transposer; @@ -63,33 +103,30 @@ inline std::shared_ptr EagerLayoutAutotune( kSlotSmallVectorSize>& tensors_vector, T1* axis, T2* keep_dim) { + VLOG(3) << "Lightly op " << op_name << "'s shape is " + << tensors_vector[0][0].shape().size() << " and layout is " + << tensors_vector[0][0].layout(); + return EagerLayoutAutotune(op_name, tensors_vector, axis); } -// heavily string data_format data_layout +// heavily string data_format, data_layout template <> inline std::shared_ptr EagerLayoutAutotune( const std::string& op_name, const paddle::small_vector, kSlotSmallVectorSize>& tensors_vector, std::string* attr) { - VLOG(3) << " Optimze Layout heavily op: " << op_name; - auto transposer = - std::make_shared(op_name, tensors_vector); + auto first_layout = tensors_vector[0][0].layout(); + auto transposer = std::make_shared( + op_name, tensors_vector, first_layout); if (paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout() == paddle::experimental::DataLayout::UNDEFINED) { // Layout autotune only supports model with convolutional layers - VLOG(3) << "Optimze Layout was not started" << op_name; + VLOG(3) << "Optimze Layout was not started " << op_name; if (op_name != "conv2d") { return transposer; } else { -#if defined(PADDLE_WITH_CUDA) - if (paddle::platform::is_gpu_place(tensors_vector[0][0].place()) && - !phi::backends::gpu::TensorCoreAvailable()) { - paddle::imperative::LayoutAutoTune::Instance().DisableLayoutAutoTune(); - return transposer; - } -#endif auto data_type = tensors_vector[0][0].dtype(); bool is_tune_fp32 = (data_type == paddle::experimental::DataType::FLOAT32) && @@ -97,6 +134,7 @@ inline std::shared_ptr EagerLayoutAutotune( bool is_tune_fp16 = (data_type == paddle::experimental::DataType::FLOAT16) && (*attr == "NCHW"); + VLOG(3) << "Conv2d_dy's dtype " << data_type << " format" << (*attr); if (is_tune_fp32) { paddle::imperative::LayoutAutoTune::Instance().SetDesiredLayout( paddle::experimental::DataLayout::NCHW); @@ -109,26 +147,27 @@ inline std::shared_ptr EagerLayoutAutotune( paddle::imperative::LayoutAutoTune::Instance().SetDefaultLayout( paddle::experimental::DataLayout::NCHW); } else { - paddle::imperative::LayoutAutoTune::Instance().DisableLayoutAutoTune(); + egr::Controller::Instance().DisableLayoutAutoTune(); return transposer; } - VLOG(3) << "Tune the layout from " << attr << " to " - << paddle::framework::DataLayoutToString( - paddle::imperative::LayoutAutoTune::Instance() - .GetDesiredLayout()); + VLOG(3) + << "Tune the layout from " << *attr << " to " + << paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout(); } } if (paddle::imperative::LayoutAutoTune::Instance().IsHeavilyLayoutSensitive( op_name)) { + VLOG(3) + << op_name + << "'s LayoutTransformer is EagerHeavilyLayoutSensitiveOpTransformer"; auto heavily_transposer = std::make_shared(op_name, attr); return heavily_transposer; } - VLOG(3) << op_name - << "'s LayoutTransformer is unimplemented. Use default " - "LayoutTransformer instead."; + + VLOG(3) << op_name << "'s LayoutTransformer is unimplemented. Use default."; return transposer; } @@ -139,24 +178,23 @@ inline std::shared_ptr EagerLayoutAutotune( const paddle::small_vector, kSlotSmallVectorSize>& tensors_vector, std::vector* attr) { + auto first_layout = tensors_vector[0][0].layout(); std::shared_ptr transposer = nullptr; if (paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout() == paddle::experimental::DataLayout::UNDEFINED) { - VLOG(3) << " Optimze Layout Unstarted : " << op_name; - transposer = - std::make_shared(op_name, tensors_vector); + VLOG(3) << "Optimze Layout was not started" << op_name; + transposer = std::make_shared( + op_name, tensors_vector, first_layout); return transposer; } - VLOG(3) << " Optimze Layout lightly op: " << op_name; - if (op_name == "transpose2") { + if (op_name == "transpose2" && + (tensors_vector[0][0].layout() == + paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout())) { auto trans = std::make_shared(op_name); - if (tensors_vector[0][0].layout() == - paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout()) { - trans->SetAttr(attr, - tensors_vector[0][0].layout() == - paddle::experimental::DataLayout::NHWC); - return trans; - } + trans->SetAttr(attr, + tensors_vector[0][0].layout() == + paddle::experimental::DataLayout::NHWC); + return trans; } transposer = std::make_shared(op_name); @@ -172,33 +210,32 @@ EagerLayoutAutotune( kSlotSmallVectorSize>& tensors_vector, paddle::experimental::Scalar* axis, bool* keep_dim) { + auto first_layout = tensors_vector[0][0].layout(); std::shared_ptr transposer = nullptr; if (paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout() == paddle::experimental::DataLayout::UNDEFINED) { - VLOG(3) << " Optimze Layout Unstarted : " << op_name; - transposer = - std::make_shared(op_name, tensors_vector); + VLOG(3) << "Optimze Layout was not started" << op_name; + transposer = std::make_shared( + op_name, tensors_vector, first_layout); return transposer; } auto desired_layout = paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout(); - if (op_name == "argmax") { + if (op_name == "argmax" && + (tensors_vector[0][0].layout() == desired_layout) && (*keep_dim)) { std::shared_ptr argmax_transform = nullptr; argmax_transform = std::make_shared(op_name); - if ((tensors_vector[0][0].layout() == desired_layout) && (*keep_dim)) { - argmax_transform->SetAttr(axis, - tensors_vector[0][0].layout() == - paddle::experimental::DataLayout::NHWC); - return argmax_transform; - } + argmax_transform->SetAttr(axis, + tensors_vector[0][0].layout() == + paddle::experimental::DataLayout::NHWC); + return argmax_transform; } - VLOG(3) << " Optimze Layout lightly op: " << op_name; transposer = std::make_shared(op_name); return transposer; } -// lightly int flatten +// lightly for flatten template <> inline std::shared_ptr EagerLayoutAutotune( const std::string& op_name, @@ -206,17 +243,17 @@ inline std::shared_ptr EagerLayoutAutotune( kSlotSmallVectorSize>& tensors_vector, int* start_axis, int* stop_axis) { + auto first_layout = tensors_vector[0][0].layout(); std::shared_ptr transposer = nullptr; - if (paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout() == - paddle::experimental::DataLayout::UNDEFINED) { - VLOG(3) << " Optimze Layout Unstarted : " << op_name; - transposer = - std::make_shared(op_name, tensors_vector); + auto desired_layout = + paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout(); + if (desired_layout == paddle::experimental::DataLayout::UNDEFINED) { + VLOG(3) << "Optimze Layout was not started" << op_name; + transposer = std::make_shared( + op_name, tensors_vector, first_layout); return transposer; } - bool no_tranpose = - tensors_vector[0][0].layout() == - paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout(); + bool no_tranpose = tensors_vector[0][0].layout() == desired_layout; bool is_valid = ((*start_axis) == 1 && (*stop_axis) == 3); if (op_name == "flatten" || op_name == "flatten_contiguous_range") { if (no_tranpose && is_valid) { @@ -226,15 +263,13 @@ inline std::shared_ptr EagerLayoutAutotune( } } - VLOG(3) << " Optimze Layout lightly op: " << op_name; transposer = std::make_shared(op_name); return transposer; } // lightly int Concat -// lightly T can be int vector vector IntArray -template <> // default int +template <> inline std::shared_ptr EagerLayoutAutotune( const std::string& op_name, @@ -243,30 +278,21 @@ EagerLayoutAutotune( paddle::experimental::Scalar* axis) { auto desired_layout = paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout(); + auto first_layout = tensors_vector[0][0].layout(); std::shared_ptr transposer = nullptr; if (desired_layout == paddle::experimental::DataLayout::UNDEFINED) { - VLOG(3) << " Optimze Layout Unstarted : " << op_name; - transposer = - std::make_shared(op_name, tensors_vector); + VLOG(3) << "Optimze Layout was not started" << op_name; + transposer = std::make_shared( + op_name, tensors_vector, first_layout); return transposer; } - bool need_transpose = false; - for (size_t i = 0; i < tensors_vector.size(); i++) { - for (size_t idx = 0; idx < tensors_vector[0].size(); idx++) { - if (desired_layout != tensors_vector[i][idx].layout()) { - need_transpose = true; - } - } - } - - if (need_transpose) { - VLOG(3) << "Concat need transpose to NCHW " << op_name; + if (NeedTransLayout(tensors_vector, desired_layout)) { + VLOG(3) << op_name << " need transpose to default layout"; transposer = std::make_shared(op_name); return transposer; } else { - VLOG(3) << " Optimze Layout lightly op: " << op_name; auto trans = std::make_shared(op_name); trans->SetAttr(axis, desired_layout); return trans; diff --git a/paddle/fluid/eager/eager_layout_transformer.h b/paddle/fluid/eager/eager_layout_transformer.h index d0cb9c481243b..80398973c4fab 100644 --- a/paddle/fluid/eager/eager_layout_transformer.h +++ b/paddle/fluid/eager/eager_layout_transformer.h @@ -22,9 +22,9 @@ namespace egr { inline paddle::experimental::Tensor EagerTraceTransposeOp( const paddle::experimental::DataLayout layout, const paddle::experimental::Tensor& in) { + VLOG(4) << "AutoTune Transpose from " << in.layout() << " to " << layout + << ", tensor's shape is " << in.shape().size(); if (in.shape().size() != 4) { - VLOG(4) << "Shape is " << in.shape().size() << " can't transpose to" - << paddle::framework::DataLayoutToString(layout); return in; } std::vector axis; @@ -44,77 +44,75 @@ inline paddle::experimental::Tensor EagerTraceTransposeOp( // agnostic op class EagerLayoutTransformer { + using Layout = paddle::experimental::DataLayout; + public: - EagerLayoutTransformer() : op_name_("") {} - explicit EagerLayoutTransformer( - const std::string& op_name, - const paddle::small_vector, - kSlotSmallVectorSize>& tensors_vector) - : op_name_(op_name) { - final_layout_ = "UNDEFINED"; - auto desired_layout = - paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout(); - for (size_t i = 0; i < tensors_vector.size(); i++) { - for (size_t idx = 0; idx < tensors_vector[0].size(); idx++) { - if (final_layout_ == "UNDEFINED") { - final_layout_ = paddle::framework::DataLayoutToString( - tensors_vector[0][0].layout()); - } else if (tensors_vector[i][idx].layout() == desired_layout) { - final_layout_ = paddle::framework::DataLayoutToString(desired_layout); - break; - } - } - } - VLOG(4) << op_name_ << "final_layout_ is " << final_layout_; - } + EagerLayoutTransformer() : op_name_(""), final_layout_(Layout::UNDEFINED) {} EagerLayoutTransformer(const EagerLayoutTransformer&) = delete; EagerLayoutTransformer& operator=(const EagerLayoutTransformer&) = delete; + explicit EagerLayoutTransformer( + const std::string& op_name, + const paddle::small_vector, + kSlotSmallVectorSize>& tensors_vector, + const Layout final_layout = Layout::UNDEFINED) + : op_name_(op_name), final_layout_(final_layout) { + VLOG(4) << "Agnostic op : " << op_name_ << " final_layout_ is " + << final_layout_; + } + virtual ~EagerLayoutTransformer() {} - virtual paddle::optional TransInTensor( - const std::string& in_name, - const paddle::optional& in) { - VLOG(4) << op_name_ << "is is agnostic, final_layout_ is " << final_layout_; - return in; + virtual paddle::experimental::Tensor TransInTensor( + const std::string& in_name, const paddle::experimental::Tensor& in) { + if (final_layout_ == Layout::UNDEFINED || final_layout_ == in.layout()) { + VLOG(4) << "EagerLayoutTransformer with no trans"; + return in; + } else { // from NCHW to NHWC + VLOG(4) << "EagerLayoutTransformer with trans from " << in.layout() + << " to " << final_layout_; + auto out_tensor = EagerTraceTransposeOp(final_layout_, in); + phi::DenseTensorUtils::GetMutableMeta( + static_cast(out_tensor.impl().get())) + ->layout = final_layout_; + return out_tensor; + } } - virtual paddle::optional> - TransInTensor( + virtual paddle::optional TransInTensor( const std::string& in_name, - const paddle::optional>& in) { - return in; + const paddle::optional& in) { + return in ? TransInTensor(in_name, *in) : in; } - virtual std::vector TransInTensor( + virtual std::vector TransInTensors( const std::string& in_name, const std::vector& in) { + VLOG(4) << " TransInTensor"; return in; } - virtual paddle::experimental::Tensor TransInTensor( - const std::string& in_name, const paddle::experimental::Tensor& in) { + virtual paddle::optional> + TransInTensors( + const std::string& in_name, + const paddle::optional>& in) { + VLOG(4) << " TransInTensor"; + if (in) { + return TransInTensors(in_name, *in); + } return in; } - virtual void SetOutTensorLayout(paddle::experimental::Tensor* out_tensor) { - bool use_default = (final_layout_ == "Undefined(AnyLayout)" || - final_layout_ == ("UNDEFINED")); - auto layout = paddle::framework::StringToDataLayout(final_layout_); - if (!use_default) { - phi::DenseTensorUtils::GetMutableMeta( - static_cast(out_tensor->impl().get())) - ->layout = layout; - } - VLOG(4) << op_name_ << "is is agnostic, use_default " << use_default; + virtual void SetOutTensorLayout( + paddle::optional* out_tensor) { + VLOG(4) << "optional out_tensor"; } virtual void SetOutTensorLayout( std::vector* out_tensor) { - bool use_default = (final_layout_ == "Undefined(AnyLayout)" || - final_layout_ == ("UNDEFINED")); + bool use_default = (final_layout_ == Layout::UNDEFINED); if (!use_default) { for (size_t i = 0; i < out_tensor->size(); i++) { phi::DenseTensorUtils::GetMutableMeta( @@ -126,9 +124,24 @@ class EagerLayoutTransformer { VLOG(4) << op_name_ << "is is agnostic, use_default " << use_default; } + virtual void SetOutTensorLayout( + paddle::optional>* out_tensor) { + VLOG(4) << "optional out_tensor"; + } + + virtual void SetOutTensorLayout(paddle::experimental::Tensor* out_tensor) { + bool use_default = final_layout_ == Layout::UNDEFINED; + if (!use_default) { + phi::DenseTensorUtils::GetMutableMeta( + static_cast(out_tensor->impl().get())) + ->layout = final_layout_; + } + VLOG(4) << op_name_ << "is is agnostic, use_default " << use_default; + } + protected: std::string op_name_; - std::string final_layout_; + const Layout final_layout_; }; class EagerHeavilyLayoutSensitiveOpTransformer : public EagerLayoutTransformer { @@ -145,21 +158,6 @@ class EagerHeavilyLayoutSensitiveOpTransformer : public EagerLayoutTransformer { } } - virtual paddle::optional> - TransInTensor( - const std::string& in_name, - const paddle::optional>& in) { - VLOG(4) << op_name_ << "is is heavily"; - return in; - } - - virtual paddle::optional TransInTensor( - const std::string& in_name, - const paddle::optional& in) { - VLOG(4) << op_name_ << "is is heavily"; - return in; - } - paddle::experimental::Tensor TransInTensor( const std::string& in_name, const paddle::experimental::Tensor& in) { if (heavily_input_.count(in_name) != 0 && in.layout() != desired_layout_) { @@ -230,7 +228,6 @@ class EagerLightlyLayoutSensitiveOpTransformer : public EagerLayoutTransformer { paddle::framework::DataLayoutToString(in.layout()); auto default_layout = paddle::imperative::LayoutAutoTune::Instance().GetDefaultLayout(); - if (final_layout_ == input_layout && in.shape().size() == 4) { VLOG(4) << op_name_ << "'s " << in_name << " need transpose from " << input_layout << " to default_layout"; @@ -245,7 +242,7 @@ class EagerLightlyLayoutSensitiveOpTransformer : public EagerLayoutTransformer { return in; } - virtual std::vector TransInTensor( + virtual std::vector TransInTensors( const std::string& in_name, const std::vector& in) { std::vector result; @@ -340,22 +337,19 @@ class EagerTransposeOpTransformer paddle::experimental::Tensor TransInTensor( const std::string& in_name, const paddle::experimental::Tensor& in) { - VLOG(4) << "with no transpose: EagerTransposeOpTransformer " << in_name - << "'s layout is " - << paddle::framework::DataLayoutToString(in.layout()); return in; } void SetOutTensorLayout(paddle::experimental::Tensor* out_tensor) { - auto desired_layout = - paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout(); - if (out_tensor->layout() != desired_layout) { + auto default_layout = + paddle::imperative::LayoutAutoTune::Instance().GetDefaultLayout(); + if (out_tensor->layout() != default_layout) { VLOG(4) << " Set Out_tensor's layout from " << paddle::framework::DataLayoutToString(out_tensor->layout()) - << " to " << final_layout_; + << " to " << default_layout; phi::DenseTensorUtils::GetMutableMeta( static_cast(out_tensor->impl().get())) - ->layout = desired_layout; + ->layout = default_layout; } } @@ -385,15 +379,15 @@ class EagerArgmaxOpTransformer void SetOutTensorLayout(paddle::experimental::Tensor* out_tensor) { VLOG(4) << "EagerArgmaxOpTransformer's out layout is" << paddle::framework::DataLayoutToString(out_tensor->layout()); - auto desired_layout = - paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout(); - if (desired_layout != out_tensor->layout()) { + auto default_layout = + paddle::imperative::LayoutAutoTune::Instance().GetDefaultLayout(); + if (default_layout != out_tensor->layout()) { VLOG(4) << "Change layout from " << paddle::framework::DataLayoutToString(out_tensor->layout()) - << " to " << final_layout_; + << " to " << default_layout; phi::DenseTensorUtils::GetMutableMeta( static_cast(out_tensor->impl().get())) - ->layout = desired_layout; + ->layout = default_layout; } } @@ -410,11 +404,11 @@ class EagerFlattenOpTransformer explicit EagerFlattenOpTransformer(const std::string& op_name) : op_name_(op_name) { VLOG(3) << "Optimze Layout lightly " << op_name; - auto desired_layout = - paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout(); - std::string desired_layout_str = - paddle::framework::DataLayoutToString(desired_layout); - final_layout_ = desired_layout_str; + auto default_layout = + paddle::imperative::LayoutAutoTune::Instance().GetDefaultLayout(); + std::string default_layout_str = + paddle::framework::DataLayoutToString(default_layout); + final_layout_ = default_layout_str; } // transpose from NHWC to NCHW @@ -424,16 +418,17 @@ class EagerFlattenOpTransformer } void SetOutTensorLayout(paddle::experimental::Tensor* out_tensor) { - VLOG(4) << "EagerArgmaxOpTransformer's out layout is" + VLOG(4) << "EagerFlattenOpTransformer's out layout is" << paddle::framework::DataLayoutToString(out_tensor->layout()); - auto layout = paddle::framework::StringToDataLayout(final_layout_); - if (layout != out_tensor->layout()) { + auto desired_layout = + paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout(); + if (desired_layout != out_tensor->layout()) { VLOG(4) << "Change layout from " << paddle::framework::DataLayoutToString(out_tensor->layout()) - << " to " << final_layout_; + << " to " << desired_layout; phi::DenseTensorUtils::GetMutableMeta( static_cast(out_tensor->impl().get())) - ->layout = layout; + ->layout = desired_layout; } } @@ -450,11 +445,11 @@ class EagerConcatOpTransformer explicit EagerConcatOpTransformer(const std::string& op_name) : op_name_(op_name) { VLOG(3) << "Optimze Layout lightly " << op_name; - auto desired_layout = - paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout(); - std::string desired_layout_str = - paddle::framework::DataLayoutToString(desired_layout); - final_layout_ = desired_layout_str; + auto default_layout = + paddle::imperative::LayoutAutoTune::Instance().GetDefaultLayout(); + std::string default_layout_str = + paddle::framework::DataLayoutToString(default_layout); + final_layout_ = default_layout_str; } void SetAttr(paddle::experimental::Scalar* axis, @@ -467,7 +462,7 @@ class EagerConcatOpTransformer (*axis) = static_cast(perm[axes]); } - virtual std::vector TransInTensor( + virtual std::vector TransInTensors( const std::string& in_name, const std::vector& in) { return in; diff --git a/paddle/fluid/imperative/layout_autotune.cc b/paddle/fluid/imperative/layout_autotune.cc index d5a9ba6901087..24d5e1ee896ac 100644 --- a/paddle/fluid/imperative/layout_autotune.cc +++ b/paddle/fluid/imperative/layout_autotune.cc @@ -14,23 +14,15 @@ #include "paddle/fluid/imperative/layout_autotune.h" +#include "paddle/fluid/eager/api/utils/global_utils.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/imperative/layout_transformer.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/errors.h" - namespace paddle { namespace imperative { -bool LayoutAutoTune::UseLayoutAutoTune() const { -#if defined(PADDLE_WITH_CUDA) - return use_layout_autotune_; -#else - return false; -#endif -} - LayoutAutoTune::LayoutAutoTune() { const auto& op_info = paddle::framework::OpInfoMap::Instance().map(); for (auto it = op_info.begin(); it != op_info.end(); it++) { @@ -140,6 +132,26 @@ paddle::imperative::NameVarMap DealLightlyLayoutSensitive( return transposer->Apply(ins, outs, attrs, tracer); } +LayoutAutotuneGuard::LayoutAutotuneGuard(std::shared_ptr tracer, + bool use_autotune) + : tracer_(tracer) { + pre_layout_autotune_ = tracer_->UseLayoutAutoTune(); + if (pre_layout_autotune_ != use_autotune) { + tracer_->EnableLayoutAutoTune(); + if (!use_autotune) { + tracer_->DisableLayoutAutoTune(); + } + } +} + +LayoutAutotuneGuard::~LayoutAutotuneGuard() { + if (pre_layout_autotune_) { + tracer_->EnableLayoutAutoTune(); + } else { + tracer_->DisableLayoutAutoTune(); + } +} + template paddle::imperative::NameVarMap AutoTuneLayout( const std::string& op_type, @@ -147,7 +159,7 @@ paddle::imperative::NameVarMap AutoTuneLayout( const paddle::imperative::NameVarMap& outs, paddle::framework::AttributeMap* attrs, const std::shared_ptr& tracer) { - if (!LayoutAutoTune::Instance().UseLayoutAutoTune()) { + if (!tracer->UseLayoutAutoTune()) { return ins; } // When layout autotuning is enabled, the tuner will check the desired layout. @@ -165,7 +177,7 @@ paddle::imperative::NameVarMap AutoTuneLayout( } else { #if defined(PADDLE_WITH_CUDA) if (!phi::backends::gpu::TensorCoreAvailable()) { - LayoutAutoTune::Instance().DisableLayoutAutoTune(); + tracer->DisableLayoutAutoTune(); return ins; } #endif @@ -185,7 +197,7 @@ paddle::imperative::NameVarMap AutoTuneLayout( } else if (is_tune_fp16) { LayoutAutoTune::Instance().SetDesiredLayout(DataLayout::NHWC); } else { - LayoutAutoTune::Instance().DisableLayoutAutoTune(); + tracer->DisableLayoutAutoTune(); return ins; } VLOG(3) << "Tune the layout from " diff --git a/paddle/fluid/imperative/layout_autotune.h b/paddle/fluid/imperative/layout_autotune.h index af7a89123efe8..0e4c5f1d4e19c 100644 --- a/paddle/fluid/imperative/layout_autotune.h +++ b/paddle/fluid/imperative/layout_autotune.h @@ -19,8 +19,8 @@ #include #include "paddle/fluid/framework/type_defs.h" +#include "paddle/fluid/imperative/tracer.h" #include "paddle/phi/common/layout.h" - namespace paddle { namespace imperative { @@ -35,12 +35,6 @@ class LayoutAutoTune { return layout_autoTune; } - bool UseLayoutAutoTune() const; - - void EnableLayoutAutoTune() { use_layout_autotune_ = true; } - - void DisableLayoutAutoTune() { use_layout_autotune_ = false; } - bool IsHeavilyLayoutSensitive(const std::string& op_type) const { return heavily_layout_sensitive_ops_.count(op_type) != 0; } @@ -64,8 +58,6 @@ class LayoutAutoTune { private: LayoutAutoTune(); - bool use_layout_autotune_{false}; - std::unordered_set layout_agnostic_ops_{}; std::unordered_set heavily_layout_sensitive_ops_{"batch_norm"}; @@ -73,11 +65,29 @@ class LayoutAutoTune { std::unordered_set lightly_layout_sensitive_ops_{ "instance_norm", "softmax", "transpose", "transpose2", "reshape2"}; + // Best Layout in this platform DataLayout desired_layout_{DataLayout::UNDEFINED}; + // Default Layout in this model DataLayout default_layout_{DataLayout::UNDEFINED}; }; +// LayoutAutotuneGuard is used for RAII. +class LayoutAutotuneGuard { + public: + LayoutAutotuneGuard(std::shared_ptr tracer, bool use_autotune); + + ~LayoutAutotuneGuard(); + + // forbid copy and operator= + LayoutAutotuneGuard(const LayoutAutotuneGuard& guard) = delete; + LayoutAutotuneGuard& operator=(const LayoutAutotuneGuard& guard) = delete; + + private: + std::shared_ptr tracer_; + bool pre_layout_autotune_; +}; + template paddle::imperative::NameVarMap AutoTuneLayout( const std::string& op_type, diff --git a/paddle/fluid/imperative/layout_transformer.h b/paddle/fluid/imperative/layout_transformer.h index 3e857c4ec26f2..9689fffb67342 100644 --- a/paddle/fluid/imperative/layout_transformer.h +++ b/paddle/fluid/imperative/layout_transformer.h @@ -19,8 +19,24 @@ #include "paddle/fluid/imperative/var_helper.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/errors.h" +#include "paddle/phi/core/tensor_utils.h" namespace paddle { namespace imperative { +template +void SetOutDataLayout(std::shared_ptr var, + const paddle::experimental::DataLayout layout) { + if (var != nullptr) { + paddle::imperative::SetDataLayout(var, layout); + // set out_tensor's layout + if (var->MutableVar()->IsInitialized()) { + paddle::framework::Variable* tmp_var = var->MutableVar(); + auto* out = tmp_var->GetMutable(); + phi::DenseTensorUtils::GetMutableMeta( + static_cast(out)) + ->layout = layout; + } + } +} template std::shared_ptr TraceTransposeOp( @@ -118,7 +134,7 @@ class LayoutTransformer { auto out_vars = outs.at(name); for (auto& var : out_vars) { if (var != nullptr) { - paddle::imperative::SetDataLayout(var, layout); + paddle::imperative::SetOutDataLayout(var, layout); } } not_in_out = false; @@ -130,7 +146,7 @@ class LayoutTransformer { for (auto& pair : outs) { for (auto& var : pair.second) { if (var != nullptr) { - paddle::imperative::SetDataLayout(var, layout); + paddle::imperative::SetOutDataLayout(var, layout); } } } diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 35eb3e9384200..400c0021d6d7e 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -42,6 +42,8 @@ thread_local bool Tracer::enable_program_desc_tracing_ = false; thread_local bool Tracer::has_grad_ = true; +thread_local bool Tracer::use_layout_autotune_ = false; + thread_local AmpLevel Tracer::amp_level_ = AmpLevel::O0; thread_local phi::DataType Tracer::amp_dtype_ = phi::DataType::FLOAT32; diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index a9ede4bb25199..9a93d299c002a 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -28,9 +28,9 @@ #include "paddle/fluid/imperative/basic_engine.h" #include "paddle/fluid/imperative/jit/program_desc_tracer.h" #include "paddle/fluid/imperative/layer.h" +#include "paddle/fluid/imperative/layout_autotune.h" #include "paddle/fluid/platform/macros.h" #include "paddle/phi/core/compat/arg_map_context.h" - namespace paddle { namespace imperative { @@ -184,6 +184,20 @@ class Tracer { } } + void DisableLayoutAutoTune() { use_layout_autotune_ = false; } + + void EnableLayoutAutoTune() { use_layout_autotune_ = true; } + + bool UseLayoutAutoTune() { +#if defined(PADDLE_WITH_CUDA) + if (phi::backends::gpu::TensorCoreAvailable()) { + return use_layout_autotune_; + } +#endif + use_layout_autotune_ = false; + return false; + } + phi::KernelSignature GetExpectedKernelSignature( const std::string& type, const NameTensorMap& ins, @@ -199,8 +213,8 @@ class Tracer { std::unique_ptr generator_; platform::Place expected_place_; GarbageCollectorMap gcs_; - static thread_local bool enable_program_desc_tracing_; + static thread_local bool use_layout_autotune_; static thread_local bool has_grad_; static thread_local AmpLevel amp_level_; static thread_local phi::DataType amp_dtype_; diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 1421b823cda55..4883183b383dd 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -2502,19 +2502,14 @@ All parameter, weight, gradient are variables in Paddle. return res; }); - m.def("enable_layout_autotune", [] { - return paddle::imperative::LayoutAutoTune::Instance() - .EnableLayoutAutoTune(); - }); + m.def("enable_layout_autotune", + [] { return egr::Controller::Instance().EnableLayoutAutoTune(); }); - m.def("disable_layout_autotune", [] { - return paddle::imperative::LayoutAutoTune::Instance() - .DisableLayoutAutoTune(); - }); + m.def("disable_layout_autotune", + [] { return egr::Controller::Instance().DisableLayoutAutoTune(); }); - m.def("use_layout_autotune", [] { - return paddle::imperative::LayoutAutoTune::Instance().UseLayoutAutoTune(); - }); + m.def("use_layout_autotune", + [] { return egr::Controller::Instance().UseLayoutAutoTune(); }); BindFleetWrapper(&m); BindIO(&m); diff --git a/paddle/phi/api/lib/data_transform.cc b/paddle/phi/api/lib/data_transform.cc index 363e1f65d0a8b..23ff797d77c01 100644 --- a/paddle/phi/api/lib/data_transform.cc +++ b/paddle/phi/api/lib/data_transform.cc @@ -52,9 +52,9 @@ inline bool NeedTransformPlace(const paddle::platform::Place& input, return ret; } -inline bool NeedTransformLayout(const paddle::platform::Place& place, - const DataLayout& input, +inline bool NeedTransformLayout(const DataLayout& input, const DataLayout& target, + const paddle::platform::Place& place, const TransformFlag& transform_flag) { bool ret = transform_flag.need_trans_layout() && (input != DataLayout::ALL_LAYOUT && @@ -202,9 +202,9 @@ phi::DenseTensor TransformData(phi::DenseTensor* tensor, bool trans_layout = false; bool trans_dtype = false; - if (NeedTransformLayout(tensor->place(), - tensor->layout(), + if (NeedTransformLayout(tensor->layout(), target_args_def.layout, + tensor->place(), transform_flag)) { out = TransDataLayout(out, target_args_def.layout); trans_layout = true; @@ -240,9 +240,9 @@ std::shared_ptr PrepareData( dense_tensor.place(), target_args_def.backend, transform_flag) && !NeedTransformDataType( dense_tensor.dtype(), target_args_def.dtype, transform_flag) && - !NeedTransformLayout(dense_tensor.place(), - dense_tensor.layout(), + !NeedTransformLayout(dense_tensor.layout(), target_args_def.layout, + dense_tensor.place(), transform_flag))) { return std::static_pointer_cast(tensor_in); } @@ -277,9 +277,9 @@ std::unique_ptr> PrepareData( tensor_in->place(), target_args_def.backend, transform_flag) && !NeedTransformDataType( tensor_in->dtype(), target_args_def.dtype, transform_flag) && - !NeedTransformLayout(tensor_in->place(), - tensor_in->layout(), + !NeedTransformLayout(tensor_in->layout(), target_args_def.layout, + tensor_in->place(), transform_flag))) { pt_tensors->emplace_back( *std::dynamic_pointer_cast(tensor_in)); diff --git a/python/paddle/fluid/tests/unittests/test_layout_autotune.py b/python/paddle/fluid/tests/unittests/test_layout_autotune.py index b502f405bd77a..21d6d97617d05 100644 --- a/python/paddle/fluid/tests/unittests/test_layout_autotune.py +++ b/python/paddle/fluid/tests/unittests/test_layout_autotune.py @@ -46,6 +46,13 @@ def forward(self, image): class LayoutAutoTune(unittest.TestCase): + def test_config(self): + paddle.fluid.core.enable_layout_autotune() + if self.use_autoune(): + self.assertEqual(paddle.fluid.core.use_layout_autotune(), True) + paddle.fluid.core.disable_layout_autotune() + self.assertEqual(paddle.fluid.core.use_layout_autotune(), False) + def setUp(self): self.use_autoune() diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index 06784f5d13c52..90becda690bfe 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -130,15 +130,13 @@ def _conv_nd(x, if bias is not None: channel_dim = channel_dim + len( x.shape) if channel_dim < 0 else channel_dim - if pre_bias.layout == "NHWC": - channel_dim = 3 # last dim if isinstance(x, tuple): x = x[0] if isinstance(bias, tuple): bias = bias[0] if len(bias.shape) < len(x.shape): tmp_bias = _C_ops.reshape( - bias, bias.shape + + bias, [1 for i in range(channel_dim)] + bias.shape + [1 for i in range(len(x.shape) - channel_dim - 1)]) return _C_ops.add(pre_bias, tmp_bias) else: