diff --git a/.github/workflows/metax_work.yaml b/.github/workflows/metax_work.yaml
index f14023848c6..f73442b6fd5 100644
--- a/.github/workflows/metax_work.yaml
+++ b/.github/workflows/metax_work.yaml
@@ -7,6 +7,7 @@ on:
     branches: [develop, release/**]
     paths:
       - "**"
+      - "Paddle/**"
      - "!backends/**"
      - "backends/metax_gpu/**"
 
diff --git a/Paddle b/Paddle
index 2588f489910..2171de22496 160000
--- a/Paddle
+++ b/Paddle
@@ -1 +1 @@
-Subproject commit 2588f4899106cd27bdfcc84ba4c2f5f7aac570ab
+Subproject commit 2171de2249622cbe2c8d7f315257657ad6f30184
diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt
index 475074ced89..e357a5e5912 100755
--- a/backends/metax_gpu/CMakeLists.txt
+++ b/backends/metax_gpu/CMakeLists.txt
@@ -648,6 +648,7 @@ file(
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/rms_norm_kernel.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/lars_momentum_kernel.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/partial_sum_kernel.cu
+  ${PADDLE_SOURCE_DIR}/paddle/phi/backends/gpu/gpu_info.cc
   # ############################################################################
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu
   # kernels/kps
@@ -687,6 +688,7 @@ file(
   RELATIVE ${CMAKE_SOURCE_DIR}
   runtime/runtime.cc
   passes/*.cc
+  common/*.cc
   kernels/*.cc
   kernels/*.cu
   kernels/fusion/*.cc
diff --git a/backends/metax_gpu/kernels/metax_kernel/flags_declare.cu b/backends/metax_gpu/common/flags_declare.cc
similarity index 89%
rename from backends/metax_gpu/kernels/metax_kernel/flags_declare.cu
rename to backends/metax_gpu/common/flags_declare.cc
index d7aefe54e9f..6b497cf9fdf 100644
--- a/backends/metax_gpu/kernels/metax_kernel/flags_declare.cu
+++ b/backends/metax_gpu/common/flags_declare.cc
@@ -80,6 +80,17 @@ PHI_DEFINE_EXPORTED_bool(
     "faster but it may loss precision in most case. If true, the compute "
     "type will be set to fp16. Default is false.");
 
+PHI_DEFINE_EXPORTED_string(
+    selected_gpus,
+    "",
+    "A list of device ids separated by comma, like: 0,1,2,3. "
+    "This option is useful when doing multi process training and "
+    "each process has only one device (GPU). If you want to use "
+    "all visible devices, set this to empty string. NOTE: the "
+    "reason for doing this is that we want to use P2P communication "
+    "between GPU devices; CUDA_VISIBLE_DEVICES can only use "
+    "shared memory.");
+
 PHI_DEFINE_EXPORTED_bool(use_fast_math,
                          false,
                          "Whether to use fast math GPU functions.");
diff --git a/backends/metax_gpu/common/utils.cc b/backends/metax_gpu/common/utils.cc
new file mode 100644
index 00000000000..58e835687d9
--- /dev/null
+++ b/backends/metax_gpu/common/utils.cc
@@ -0,0 +1,297 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "common/utils.h"
+
+#include "glog/logging.h"
+#include "paddle/phi/backends/context_pool.h"
+#include "paddle/phi/backends/custom/custom_context.h"
+
+namespace phi {
+namespace {
+C_Status AsyncMemCpyH2D(const C_Device device,
+                        C_Stream stream,
+                        void* dst,
+                        const void* src,
+                        size_t size) {
+  if (size == 0) {
+    return C_SUCCESS;
+  }
+
+  if (dst == NULL || src == NULL) {
+    return C_ERROR;
+  }
+  cudaError_t cudaErr = cudaSetDevice(device->id);
+  if (cudaErr != cudaSuccess) {
+    return C_ERROR;
+  }
+
+  cudaErr = cudaMemcpyAsync(dst, src, size, cudaMemcpyHostToDevice);
+  if (cudaErr != cudaSuccess) {
+    return C_ERROR;
+  }
+
+  return C_SUCCESS;
+}
+
+C_Status AsyncMemCpyD2H(const C_Device device,
+                        C_Stream stream,
+                        void* dst,
+                        const void* src,
+                        size_t size) {
+  if (size == 0) {
+    return C_SUCCESS;
+  }
+
+  if (dst == NULL || src == NULL) {
+    return C_ERROR;
+  }
+
+  cudaError_t cudaErr = cudaSetDevice(device->id);
+  if (cudaErr != cudaSuccess) {
+    return C_ERROR;
+  }
+
+  cudaErr = cudaMemcpyAsync(dst, src, size, cudaMemcpyDeviceToHost);
+  if (cudaErr != cudaSuccess) {
+    return C_ERROR;
+  }
+
+  return C_SUCCESS;
+}
+
+C_Status AsyncMemCpyD2D(const C_Device device,
+                        C_Stream stream,
+                        void* dst,
+                        const void* src,
+                        size_t size) {
+  if (size == 0) {
+    VLOG(2) << "cudamemcpy successful: " << dst << " " << src << " "
+            << size;  // NOLINT
+    return C_SUCCESS;
+  }
+
+  if (dst == NULL || src == NULL) {
+    return C_ERROR;
+  }
+
+  cudaError_t cudaErr = cudaSetDevice(device->id);
+  if (cudaErr != cudaSuccess) {
+    return C_ERROR;
+  }
+
+  cudaErr = cudaMemcpyAsync(dst, src, size, cudaMemcpyDeviceToDevice);
+  if (cudaErr != cudaSuccess) {
+    return C_ERROR;
+  }
+  VLOG(2) << "cudamemcpy successful: " << dst << " " << src << " "
+          << size;  // NOLINT
+  return C_SUCCESS;
+}
+
+template <typename Context>
+inline void TensorCopy(const Context& dev_ctx,
+                       const phi::DenseTensor& src,
+                       bool blocking,
+                       phi::DenseTensor* dst,
+                       const phi::Place& dst_place = phi::CustomPlace()) {
+  auto* src_ptr = src.data();
+  const auto& src_place = src.place();
+  if (src_ptr == nullptr) {
+    return;
+  }
+  auto dst_place_ = dst_place;
+  if (dst_place_.GetType() != phi::AllocationType::CPU) {
+    dst_place_ = dev_ctx.GetPlace();
+  }
+
+  if (&src == dst) {
+    if (src_place == dst_place_) {
+      VLOG(6) << "Skip copy the same data(" << src_ptr << ") from " << src_place
+              << " to " << dst_place_;
+    } else {
+      VLOG(6) << "Src and dst are the same Tensor, in-place copy data("
+              << src_ptr << ") from " << src_place << " to " << dst_place_;
+      const phi::DenseTensor src_copy = src;
+      TensorCopy(dev_ctx, src_copy, blocking, dst, dst_place_);
+    }
+    return;
+  }
+
+  auto dst_dims = dst->dims();
+  dst->Resize(src.dims());
+  void* dst_ptr = nullptr;
+  if (dst_place_.GetType() != phi::AllocationType::CPU) {
+    dst_ptr = dev_ctx.Alloc(dst, src.dtype());
+  } else {
+    dst_ptr = dev_ctx.HostAlloc(dst, src.dtype());
+  }
+
+  PADDLE_ENFORCE_EQ(
+      dst->place(),
+      dst_place_,
+      phi::errors::Unavailable(
+          "The Dst Tensor's place and dst_place do not match, Tensor's "
+          "place is %s, dst_place is %s.",
+          dst->place(),
+          dst_place_));
+
+  if (src_ptr == dst_ptr && src_place == dst_place_) {
+    if ((dst_dims == src.dims()) || (src_place == phi::CPUPlace())) {
+      VLOG(3) << "Skip copy the same data async from " << src_ptr << " in "
+              << src_place << " to " << dst_ptr << " in " << dst_place_;
+      return;
+    } else {
+      // scatter memory
+      phi::DenseTensor tmp_dst;
+      tmp_dst.set_meta(dst->meta());
+      tmp_dst.Resize(dst_dims);
+      dst_ptr = dev_ctx.Alloc(&tmp_dst, tmp_dst.dtype());
+      *dst = tmp_dst;
+    }
+  }
+  VLOG(4) << "src:" << src_ptr << " place: " << src_place
+          << " type:" << static_cast<int>(src_place.GetType())
+          << ", dst:" << dst_ptr << " place: " << dst_place_
+          << " type:" << static_cast<int>(dst_place_.GetType());
+
+  C_Stream stream = reinterpret_cast<C_Stream>(dev_ctx.stream());
+
+  auto size =
+      (src.dims().size() != 0 ? src.numel() : 1) * phi::SizeOf(src.dtype());
+  if (UNLIKELY(size) == 0) {
+    return;
+  }
+
+  if (src_place.GetType() == phi::AllocationType::CPU &&
+      dst_place_.GetType() == phi::AllocationType::CUSTOM) {
+    VLOG(6) << "TensorCopy from cpu to cus";
+    C_Device_st device;
+    device.id = dst_place_.GetDeviceId();
+    AsyncMemCpyH2D(&device, stream, dst_ptr, src_ptr, size);
+    if (blocking) {
+      dev_ctx.Wait();
+    }
+  } else if (src_place.GetType() == phi::AllocationType::CUSTOM &&
+             dst_place_.GetType() == phi::AllocationType::CPU) {
+    VLOG(6) << "TensorCopy from cus to cpu";
+    C_Device_st device;
+    device.id = src_place.GetDeviceId();
+    AsyncMemCpyD2H(&device, stream, dst_ptr, src_ptr, size);
+    if (blocking) {
+      dev_ctx.Wait();
+    }
+  } else if (src_place.GetType() == phi::AllocationType::CUSTOM &&
+             dst_place_.GetType() == phi::AllocationType::CUSTOM) {
+    VLOG(6) << "TensorCopy from cus to cus";
+    if (src_place.GetDeviceType() == dst_place_.GetDeviceType()) {
+      if (src_place.GetDeviceId() == dst_place_.GetDeviceId()) {
+        C_Device_st device;
+        device.id = src_place.GetDeviceId();
+        AsyncMemCpyD2D(&device, stream, dst_ptr, src_ptr, size);
+        if (blocking) {
+          dev_ctx.Wait();
+        }
+      } else {
+        PADDLE_THROW(
+            phi::errors::Unimplemented("TensorCopy is not supported."));
+      }
+    } else {
+      PADDLE_THROW(phi::errors::Unimplemented("TensorCopy is not supported."));
+    }
+  } else if (src_place.GetType() == phi::AllocationType::CPU &&
+             dst_place_.GetType() == phi::AllocationType::CPU) {
+    VLOG(6) << "TensorCopy from cpu to cpu";
+    std::memcpy(dst_ptr, src_ptr, size);
+  }
+}
+
+template <typename T>
+std::ostream& PrintTensor(std::ostream& os, const phi::DenseTensor& tensor) {
+  phi::DenseTensor cpu_tensor;
+  if (tensor.place().GetType() != phi::AllocationType::CPU) {
+    auto dev_ctx = static_cast<const phi::CustomContext*>(
+        phi::DeviceContextPool::Instance().Get(tensor.place()));
+    TensorCopy(*dev_ctx, tensor, true, &cpu_tensor, phi::CPUPlace());
+  } else {
+    cpu_tensor = tensor;
+  }
+  os << "DenseTensor<";
+  if (tensor.initialized()) {
+    os << phi::DataTypeToString(tensor.dtype()) << ", ";
+    os << tensor.place() << ", ";
+    os << "Shape(" << tensor.dims() << "), ";
+    os << "Strides(" << tensor.strides() << "), ";
+    os << "layout:" << tensor.layout() << ", ";
+    os << "data: [";
+
+    auto ptr = cpu_tensor.data<T>();
+    auto element_num = cpu_tensor.numel();
+    // Note: int8_t and uint8_t are typedefs of char; ostream cannot print
+    // them properly.
+    if (typeid(int8_t) == typeid(T) || typeid(uint8_t) == typeid(T)) {
+      if (element_num > 0) {
+        os << signed(ptr[0]);
+        for (int j = 1; j < element_num; ++j) {
+          os << " " << signed(ptr[j]);
+        }
+      }
+    } else {
+      if (element_num > 0) {
+        os << ptr[0];
+        for (int j = 1; j < element_num; ++j) {
+          os << " " << ptr[j];
+        }
+      }
+    }
+    os << "]";
+  } else {
+    os << "NOT_INITED";
+  }
+  os << ">";
+  return os;
+}
+}  // namespace
+
+#define FOR_EACH_DATA_TYPE_TO_PRINT(_)      \
+  _(bool, phi::DataType::BOOL)              \
+  _(int8_t, phi::DataType::INT8)            \
+  _(uint8_t, phi::DataType::UINT8)          \
+  _(int16_t, phi::DataType::INT16)          \
+  _(uint16_t, phi::DataType::UINT16)        \
+  _(int32_t, phi::DataType::INT32)          \
+  _(uint32_t, phi::DataType::UINT32)        \
+  _(int64_t, phi::DataType::INT64)          \
+  _(uint64_t, phi::DataType::UINT64)        \
+  _(phi::bfloat16, phi::DataType::BFLOAT16) \
+  _(phi::float16, phi::DataType::FLOAT16)   \
+  _(float, phi::DataType::FLOAT32)          \
+  _(double, phi::DataType::FLOAT64)
+
+#define CALL_PRINT_TENSOR(cpp_type, data_type) \
+  case data_type:                              \
+    PrintTensor<cpp_type>(os, t);              \
+    break;
+
+std::ostream& operator<<(std::ostream& os, const phi::DenseTensor& t) {
+  switch (t.dtype()) {
+    FOR_EACH_DATA_TYPE_TO_PRINT(CALL_PRINT_TENSOR)
+    default:
+      VLOG(1) << "PrintTensor unrecognized data type:" << t.dtype();
+  }
+  return os;
+}
+#undef FOR_EACH_DATA_TYPE_TO_PRINT
+#undef CALL_PRINT_TENSOR
+}  // namespace phi
diff --git a/backends/metax_gpu/common/utils.h b/backends/metax_gpu/common/utils.h
new file mode 100644
index 00000000000..74e8aa9d788
--- /dev/null
+++ b/backends/metax_gpu/common/utils.h
@@ -0,0 +1,28 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <iostream>
+#include <ostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/device_context.h"
+
+namespace phi {
+std::ostream& operator<<(std::ostream& os, const phi::DenseTensor& t);
+}
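
Usage note (not part of the diff): the new common/utils.h header exposes a phi::operator<< overload for phi::DenseTensor, so plugin kernels can stream tensors into glog when debugging. The sketch below is illustrative only; the kernel name DebugPrintKernel and its signature are hypothetical and not introduced by this patch, while the operator itself and the headers come from the change above.

// Minimal sketch, assuming the plugin's usual custom_kernel layout.
#include "common/utils.h"
#include "glog/logging.h"

namespace custom_kernel {

template <typename T, typename Context>
void DebugPrintKernel(const Context& dev_ctx,
                      const phi::DenseTensor& x,
                      phi::DenseTensor* out) {
  // The operator<< overload copies device memory back to the host before
  // formatting, so it is safe to call on a tensor resident on the Metax GPU.
  VLOG(3) << "input tensor: " << x;
  dev_ctx.template Alloc<T>(out);
}

}  // namespace custom_kernel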