
Commit

merge upstream/develop to solve conflict
sneaxiy committed Sep 7, 2022
2 parents c86679a + a9cc027 commit ed3cc0a
Showing 308 changed files with 11,513 additions and 4,069 deletions.
2 changes: 1 addition & 1 deletion cmake/phi.cmake
@@ -83,7 +83,7 @@ function(kernel_declare TARGET_LIST)
"${kernel_impl}")
if(NOT first_registry STREQUAL "")
# some gpu kernel only can run on cuda, not support rocm, so we add this branch
if(WITH_ROCM)
if(WITH_ROCM OR WITH_NV_JETSON)
string(FIND "${first_registry}" "cuda_only" pos)
if(pos GREATER 1)
continue()
5 changes: 5 additions & 0 deletions paddle/fluid/distributed/collective/ProcessGroup.h
@@ -89,6 +89,11 @@ class ProcessGroup {
int GetSize() const { return size_; }

virtual const std::string GetBackendName() const = 0;
virtual phi::DeviceContext* GetDeviceContext(const Place& place) const {
PADDLE_THROW(platform::errors::InvalidArgument(
"Does not support to get device_context from ProcessGroup%s.",
GetBackendName()));
}

// TODO(liyurui): This API will be moved later
virtual std::shared_ptr<ProcessGroup::Task> AllReduce(
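The hunk above gives ProcessGroup a virtual GetDeviceContext whose default implementation throws, so only backends that actually own per-place device contexts (such as the NCCL group later in this diff) override it. A minimal standalone sketch of that pattern, using simplified stand-in types rather than the real Paddle declarations:

#include <map>
#include <memory>
#include <stdexcept>
#include <string>

// Simplified stand-ins for phi::DeviceContext and the Place key.
struct DeviceContext {};
using Place = std::string;

class ProcessGroupBase {
 public:
  virtual ~ProcessGroupBase() = default;
  virtual std::string GetBackendName() const = 0;

  // Default behaviour: most backends have no device context to hand out,
  // so fail loudly instead of returning a dangling pointer.
  virtual DeviceContext* GetDeviceContext(const Place& /*place*/) const {
    throw std::runtime_error("GetDeviceContext is not supported by backend " +
                             GetBackendName());
  }
};

class NcclLikeGroup : public ProcessGroupBase {
 public:
  std::string GetBackendName() const override { return "NCCL"; }

  // Backends that do own per-place contexts override the accessor.
  DeviceContext* GetDeviceContext(const Place& place) const override {
    auto it = places_to_ctx_.find(place);
    if (it == places_to_ctx_.end()) {
      throw std::invalid_argument("no device context recorded for " + place);
    }
    return it->second.get();
  }

 private:
  std::map<Place, std::unique_ptr<DeviceContext>> places_to_ctx_;
};

The throwing default keeps the base interface usable for backends without device contexts while turning an unsupported call into an immediate, descriptive error.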
25 changes: 23 additions & 2 deletions paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
@@ -20,6 +20,7 @@
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/api/lib/utils/allocator.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/device_context.h"

DECLARE_bool(nccl_blocking_wait);
DECLARE_bool(use_stream_safe_cuda_allocator);
@@ -738,14 +739,23 @@ void* GetPointerByOffset(void* raw_pointer,
} else if (type == experimental::DataType::FLOAT64) {
return reinterpret_cast<void*>(reinterpret_cast<double*>(raw_pointer) +
offset);
} else if (type == experimental::DataType::FLOAT16) {
return reinterpret_cast<void*>(reinterpret_cast<int16_t*>(raw_pointer) +
offset);
} else if (type == experimental::DataType::INT32) {
return reinterpret_cast<void*>(reinterpret_cast<int32_t*>(raw_pointer) +
offset);
} else if (type == experimental::DataType::INT64) {
return reinterpret_cast<void*>(reinterpret_cast<int64_t*>(raw_pointer) +
offset);
} else if (type == experimental::DataType::FLOAT16) {
return reinterpret_cast<void*>(reinterpret_cast<int16_t*>(raw_pointer) +
} else if (type == experimental::DataType::INT8) {
return reinterpret_cast<void*>(reinterpret_cast<int8_t*>(raw_pointer) +
offset);
} else if (type == experimental::DataType::UINT8) {
return reinterpret_cast<void*>(reinterpret_cast<uint8_t*>(raw_pointer) +
offset);
} else if (type == experimental::DataType::BOOL) {
return reinterpret_cast<void*>(reinterpret_cast<bool*>(raw_pointer) +
offset);
} else {
PADDLE_THROW(platform::errors::Unimplemented(
@@ -1032,5 +1042,16 @@ ncclComm_t ProcessGroupNCCL::NCCLComm(const Place& place) const {
return iter->second[0]->GetNcclComm();
}

phi::DeviceContext* ProcessGroupNCCL::GetDeviceContext(
const Place& place) const {
std::vector<Place> places = {place};
const auto& iter = places_to_ctx_.find(GetKeyFromPlaces(places));
PADDLE_ENFORCE_NE(iter,
places_to_ctx_.end(),
platform::errors::InvalidArgument(
"Cannot find device context in process group."));
return iter->second[0].get();
}

} // namespace distributed
} // namespace paddle
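The new branches in GetPointerByOffset above extend the helper to FLOAT16, INT8, UINT8 and BOOL; in every case the function advances a raw buffer pointer by offset elements of the given dtype. A compact standalone sketch of the same idea, using an element-size table instead of one cast per type (the enum and helper names here are hypothetical, not Paddle's):

#include <cstddef>
#include <cstdint>
#include <stdexcept>

// Standalone sketch: dtype tag and its element size in bytes.
enum class DataType { FLOAT64, FLOAT32, FLOAT16, INT64, INT32, INT8, UINT8, BOOL };

inline std::size_t SizeOf(DataType type) {
  switch (type) {
    case DataType::FLOAT64:
    case DataType::INT64:
      return 8;
    case DataType::FLOAT32:
    case DataType::INT32:
      return 4;
    case DataType::FLOAT16:
      return 2;  // carried as a 16-bit payload, as in the casts above
    case DataType::INT8:
    case DataType::UINT8:
    case DataType::BOOL:
      return 1;
  }
  throw std::invalid_argument("unsupported data type");
}

// Advance raw_pointer by `offset` elements of `type`.
inline void* PointerByOffset(void* raw_pointer, std::size_t offset, DataType type) {
  return static_cast<std::uint8_t*>(raw_pointer) + offset * SizeOf(type);
}

Computing offset * SizeOf(type) in bytes is equivalent to the typed-pointer arithmetic in the diff, including the FLOAT16 case, which is addressed through a 16-bit integer payload.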
2 changes: 2 additions & 0 deletions paddle/fluid/distributed/collective/ProcessGroupNCCL.h
@@ -96,6 +96,8 @@ class ProcessGroupNCCL : public ProcessGroupStream {
return std::string(NCCL_BACKEND_NAME);
}

phi::DeviceContext* GetDeviceContext(const Place& place) const override;

std::shared_ptr<ProcessGroup::Task> AllReduce(
std::vector<phi::DenseTensor>& in_tensors, // NOLINT
std::vector<phi::DenseTensor>& out_tensors, // NOLINT
@@ -17,6 +17,7 @@
#include "paddle/fluid/eager/api/manual/eager_manual/nodes/nodes.h"
#include "paddle/fluid/eager/api/utils/global_utils.h"
#include "paddle/fluid/eager/eager_amp_auto_cast.h"
#include "paddle/fluid/eager/eager_layout_auto_tune.h"
#include "paddle/fluid/eager/nan_inf_utils.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"

@@ -73,6 +74,40 @@ paddle::experimental::Tensor conv2d_dygraph_function(
}
}

// Layout autotune

if (paddle::imperative::LayoutAutoTune::Instance().UseLayoutAutoTune()) {
VLOG(5) << "Check and Prepare For LAYOUT";
paddle::small_vector<std::vector<paddle::experimental::Tensor>,
egr::kSlotSmallVectorSize>
tensors_vector = {{input}, {filter}};

auto op_name = phi::TransToFluidOpName("conv2d");
auto transformer = egr::EagerLayoutAutotune<std::string>(
op_name, tensors_vector, &data_format);
auto NEW_input = transformer->TransInTensor("input", input);
bool is_enable_tune =
paddle::imperative::LayoutAutoTune::Instance().UseLayoutAutoTune();
paddle::imperative::LayoutAutoTune::Instance().DisableLayoutAutoTune();
auto out = conv2d_dygraph_function(NEW_input,
filter,
strides,
paddings,
paddding_algorithm,
groups,
dilations,
data_format,
use_addto,
workspace_size_MB,
exhaustive_search);
transformer->SetOutTensorLayout(&out);
if (is_enable_tune) {
paddle::imperative::LayoutAutoTune::Instance().EnableLayoutAutoTune();
}
// Returns
return out;
}

// Get Input AutoGradMeta
egr::AutogradMeta* input_autograd_meta =
egr::EagerUtils::nullable_autograd_meta(input);
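The block added to conv2d_dygraph_function above follows a fixed shape: check the global layout-autotune switch, transform the inputs to the tuned layout, disable autotune so the nested call does not take this branch again, run the op, tag the output layout, then restore the switch. A schematic of that control flow with hypothetical, simplified stand-ins (Tensor, Transformer, LayoutTuner and RunConv2d below are not the real Paddle types):

#include <string>

// Hypothetical stand-ins for the tensor, the layout transformer and the
// LayoutAutoTune singleton used in the function above.
struct Tensor { std::string layout = "NCHW"; };

struct Transformer {
  Tensor TransInTensor(const std::string& /*name*/, const Tensor& t) const { return t; }
  void SetOutTensorLayout(Tensor* out) const { out->layout = "NHWC"; }
};

class LayoutTuner {
 public:
  static LayoutTuner& Instance() { static LayoutTuner inst; return inst; }
  bool UseLayoutAutoTune() const { return enabled_; }
  void DisableLayoutAutoTune() { enabled_ = false; }
  void EnableLayoutAutoTune() { enabled_ = true; }
 private:
  bool enabled_ = true;
};

// Plain forward path; placeholder for the real kernel call.
Tensor RunConv2d(const Tensor& input, const std::string& /*data_format*/) { return input; }

Tensor Conv2dWithLayoutAutotune(const Tensor& input, const std::string& data_format) {
  if (LayoutTuner::Instance().UseLayoutAutoTune()) {
    Transformer transformer;  // picked from the op name and data_format
    Tensor new_input = transformer.TransInTensor("input", input);
    bool was_enabled = LayoutTuner::Instance().UseLayoutAutoTune();
    LayoutTuner::Instance().DisableLayoutAutoTune();  // keep the nested call off this branch
    Tensor out = RunConv2d(new_input, data_format);
    transformer.SetOutTensorLayout(&out);             // record the tuned output layout
    if (was_enabled) LayoutTuner::Instance().EnableLayoutAutoTune();
    return out;
  }
  return RunConv2d(input, data_format);
}

The disable/re-enable pair is what keeps the tuned call from re-entering the autotune branch, since in the real code the wrapped call is conv2d_dygraph_function itself.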
131 changes: 120 additions & 11 deletions paddle/fluid/eager/auto_code_generator/generator/eager_gen.py
@@ -185,6 +185,8 @@ class {} : public egr::GradNodeBase {{
// Dygraph Record Event
{}
// AMP Logic
{}
// Layout autotune
{}
// Get Input AutoGradMeta
{}
@@ -217,7 +219,8 @@ class {} : public egr::GradNodeBase {{
{}
// AMP Logic
{}
// Layout autotune
{}
// Forward API Call
VLOG(3) << \"Final State Running: \" << \"{}\";
{}
@@ -295,7 +298,6 @@ class {} : public egr::GradNodeBase {{
#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h"
#include "paddle/fluid/eager/to_static/run_program_op_node.h"
#include "paddle/fluid/eager/nan_inf_utils.h"
#include "paddle/phi/api/include/sparse_api.h"
#include "paddle/fluid/eager/api/manual/eager_manual/nodes/nodes.h"
DECLARE_bool(check_nan_inf);
@@ -317,7 +319,7 @@ class {} : public egr::GradNodeBase {{
#include "paddle/phi/api/lib/dygraph_api.h"
#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h"
#include "paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h"
#include "paddle/fluid/eager/eager_layout_auto_tune.h"
#include "paddle/phi/api/include/strings_api.h"
#include "paddle/phi/api/include/sparse_api.h"
#include "paddle/fluid/eager/api/utils/global_utils.h"
@@ -396,7 +398,21 @@ class {} : public egr::GradNodeBase {{
}}
}}
"""

LAYOUT_LOGIC_TEMPLATE=\
"""
if (paddle::imperative::LayoutAutoTune::Instance().UseLayoutAutoTune()) {{
VLOG(5) << "Check and Prepare For LAYOUT";
paddle::small_vector<std::vector<paddle::experimental::Tensor>, egr::kSlotSmallVectorSize> tensors_vector = {};
{}
{}
paddle::imperative::LayoutAutoTune::Instance().DisableLayoutAutoTune();
{}
{}
paddle::imperative::LayoutAutoTune::Instance().EnableLayoutAutoTune();
// Returns
return {};
}}
"""
CREATE_PLAIN_OPTIONAL_TENSOR_TEMPLATE = \
"""
paddle::optional<paddle::experimental::Tensor> {}_optional;
@@ -992,6 +1008,9 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced):
amp_tensors_vector_optional_list = []
amp_autocast_list = []
amp_autocast_optional_list = []
layout_autotune_list = []
layout_autotune_optional_list = []
layout_tensors_vector_optional_list = []
for name, (ttype, pos) in forward_inputs_position_map.items():
inputs_call_list[pos] = f"{name}"
amp_inputs_call_list[pos] = f"NEW_{name}"
@@ -1009,6 +1028,12 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced):
amp_autocast_optional_list.append(
f"auto NEW_{name} = egr::EagerAmpAutoCast(\"{name}\", {name}, amp_dst_dtype, op_name);\n"
)
layout_tensors_vector_optional_list.append(
f"if ({name}) tensors_vector.push_back({{ *{name} }});\n"
)
layout_autotune_optional_list.append(
f"auto NEW_{name} = transformer->TransInTensor(\"{name}\", {name});\n"
)
else:
if is_inplaced and forward_inplace_map and name in forward_inplace_map.keys(
):
@@ -1023,6 +1048,9 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced):
amp_autocast_list.append(
f"auto NEW_{name} = egr::EagerAmpAutoCast(\"{name}\", {name}, amp_dst_dtype, op_name);\n"
)
layout_autotune_list.append(
f"auto NEW_{name} = transformer->TransInTensor(\"{name}\", {name});\n"
)
else:
assert IsVectorTensorType(ttype)
if is_optional:
@@ -1037,6 +1065,9 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced):
amp_autocast_optional_list.append(
f"auto NEW_{name} = egr::EagerAmpAutoCasts(\"{name}\", {name}, amp_dst_dtype, op_name);\n"
)
layout_autotune_optional_list.append(
f"auto NEW_{name} = transformer->TransInTensor(\"{name}\", {name});\n"
)
else:
if is_inplaced and forward_inplace_map and name in forward_inplace_map.keys(
):
@@ -1047,10 +1078,59 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced):
amp_autocast_list.append(
f"auto NEW_{name} = egr::EagerAmpAutoCasts(\"{name}\", {name}, amp_dst_dtype, op_name);\n"
)
layout_autotune_list.append(
f"auto NEW_{name} = transformer->TransInTensor(\"{name}\", {name});\n"
)

inputs_args_definition_list[pos] = arg_str
inputs_args_declaration_list[pos] = arg_str

# for layout autotune attr
lightly_sensitive_attr = [
'axis', 'axes', 'dim', 'dims', 'start', 'end', 'stop'
]
heavily_sensitive_attr = ['data_format', 'data_layout']
layout_autotune_attr = []
layout_autotune_attr_code_list = []
layout_autotune_attr_type_list = []
layout_autotune_attr_code_list.append(
f"auto op_name = phi::TransToFluidOpName(\"{forward_api_name}\");\n"
)

lightly_flag = False
heavily_flag = False
for name, atype, default_val, pos in forward_attrs_list:
for attr_name in lightly_sensitive_attr:
if name.find(
attr_name) != -1 and name not in layout_autotune_attr:
lightly_flag = True
layout_autotune_attr.append(name)
layout_autotune_attr_type_list.append(atype)
if lightly_flag is False:
for attr_name in heavily_sensitive_attr:
if name.find(attr_name
) != -1 and name not in layout_autotune_attr:
layout_autotune_attr.append(name)
layout_autotune_attr_type_list.append(atype)
heavily_flag = True
if len(layout_autotune_attr) == 0:
layout_autotune_attr_code_list.append(
f"auto transformer = egr::EagerLayoutAutotune(op_name, tensors_vector);\n"
)
elif len(layout_autotune_attr) == 1:
layout_autotune_attr_code_list.append(
f"auto transformer = egr::EagerLayoutAutotune<{layout_autotune_attr_type_list[0]}>(op_name, tensors_vector, &{layout_autotune_attr[0]});\n"
)
elif len(layout_autotune_attr) == 2:
layout_autotune_attr_code_list.append(
f"auto transformer = egr::EagerLayoutAutotune<{layout_autotune_attr_type_list[0]}, {layout_autotune_attr_type_list[1]}>(op_name, tensors_vector, &{layout_autotune_attr[0]}, &{layout_autotune_attr[1]});\n"
)
else:
layout_autotune_attr_code_list.append(
f"auto transformer = egr::EagerLayoutAutotune(op_name, tensors_vector, {len(layout_autotune_attr)});\n"
)

# forward attrs
for name, atype, default_val, pos in forward_attrs_list:
inputs_call_list[pos] = name
amp_inputs_call_list[pos] = name
@@ -1236,24 +1316,53 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced):
amp_tensors_vector_optional_list_str, amp_get_dst_dtype_str,
amp_autocast_list_str, amp_call_str)

# Forward layout autotune
layout_inputs_call_args_str = amp_inputs_call_args_str
layout_tmp_result_list = []
layout_autotune_outs_list = ""
if num_outputs == 1:
layout_autotune_outs_list += f"{indent}auto {returns_str} = api_result;\n"
layout_autotune_outs_list += f"{indent}transformer -> SetOutTensorLayout(&{returns_str});\n"
else:
for name, (rtype, pos) in forward_outputs_position_map.items():
if name in intermediate_outputs:
continue
layout_autotune_outs_list += f"{indent}auto& {name} = std::get<{len(layout_tmp_result_list)}>(api_result);\n"
layout_autotune_outs_list += f"{indent}transformer -> SetOutTensorLayout(&{name});\n"
layout_tmp_result_list.append(f"{name}")

if returns_type_str == "paddle::experimental::Tensor&" or forward_api_name == "slice" or forward_api_name == "strided_slice" or len(
layout_autotune_attr) == 0:
layout_logic_str = ""
else:
# after_call_str = f"return {forward_function_name}({layout_inputs_call_args_str});\n"
after_call_str = f"auto api_result = {forward_function_name}({layout_inputs_call_args_str});\n"
layout_logic_str = LAYOUT_LOGIC_TEMPLATE.format(
amp_tensors_vector_list_str,
" ".join(layout_tensors_vector_optional_list),
" ".join(layout_autotune_attr_code_list) + " " +
" ".join(layout_autotune_list) +
" ".join(layout_autotune_optional_list), after_call_str,
layout_autotune_outs_list, returns_str)

# Generate forward_definition_str and forward_declaration_str
if self.is_forward_only:
if len(amp_tensors_vector_list) == 0:
amp_logic_str = ""
self.forward_definition_str += FORWARD_ONLY_FUNCTION_TEMPLATE.format(
returns_type_str, forward_function_name,
inputs_args_definition_str, dygraph_event_str, amp_logic_str,
forward_function_name, forward_call_str, get_outputs_str,
returns_str)
layout_logic_str, forward_function_name, forward_call_str,
get_outputs_str, returns_str)
else:
self.forward_definition_str += FORWARD_FUNCTION_TEMPLATE.format(
returns_type_str, forward_function_name,
inputs_args_definition_str, dygraph_event_str, amp_logic_str,
inputs_autograd_meta_str, forward_function_name,
forward_call_str, check_nan_inf_str, get_outputs_str,
outputs_autograd_meta_str, compute_require_grad_args_str,
check_inplace_str, bump_inplace_version_str, node_creation_str,
returns_str)
layout_logic_str, inputs_autograd_meta_str,
forward_function_name, forward_call_str, check_nan_inf_str,
get_outputs_str, outputs_autograd_meta_str,
compute_require_grad_args_str, check_inplace_str,
bump_inplace_version_str, node_creation_str, returns_str)

self.forward_declaration_str += f"{returns_type_str} {forward_function_name}({inputs_args_declaration_str});\n"

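For reference, when an op has a single heavily layout-sensitive attribute such as data_format, the LAYOUT_LOGIC_TEMPLATE above expands to essentially the hand-written block in conv2d_dygraph_function earlier in this diff. An illustrative, non-standalone expansion, with some_op, x and out as placeholder names:

  if (paddle::imperative::LayoutAutoTune::Instance().UseLayoutAutoTune()) {
    VLOG(5) << "Check and Prepare For LAYOUT";
    paddle::small_vector<std::vector<paddle::experimental::Tensor>, egr::kSlotSmallVectorSize> tensors_vector = {{x}};
    auto op_name = phi::TransToFluidOpName("some_op");
    auto transformer = egr::EagerLayoutAutotune<std::string>(op_name, tensors_vector, &data_format);
    auto NEW_x = transformer->TransInTensor("x", x);
    paddle::imperative::LayoutAutoTune::Instance().DisableLayoutAutoTune();
    auto api_result = some_op_dygraph_function(NEW_x, data_format);
    auto out = api_result;
    transformer->SetOutTensorLayout(&out);
    paddle::imperative::LayoutAutoTune::Instance().EnableLayoutAutoTune();
    // Returns
    return out;
  }

Ops returning a Tensor reference, slice, strided_slice, and ops with no layout-sensitive attribute skip this block entirely, matching the guard on layout_logic_str above.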