
[Pten] blas and lapack migration #39587

Merged: 4 commits, Feb 18, 2022
39 changes: 39 additions & 0 deletions cmake/generic.cmake
@@ -1036,3 +1036,42 @@ function(generate_dummy_static_lib)
   add_library(${dummy_LIB_NAME} STATIC ${dummy_FILE_PATH})
 endfunction()

+function(math_library TARGET)
+  # math_library is a function to create a math library.
+  # The interface is the same as cc_library, but it also handles
+  # split GPU/CPU sources and links the common math dependencies.
+  set(cc_srcs)
+  set(cu_srcs)
+  set(hip_srcs)
+  set(math_common_deps device_context framework_proto enforce)
+  if (WITH_GPU)
+    if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
+      list(APPEND math_common_deps cub)
+    else()
+      list(APPEND math_common_deps)
+    endif()
+  endif()
+  set(multiValueArgs DEPS)
+  cmake_parse_arguments(math_library "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN})
+
+  if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
+    list(APPEND cc_srcs ${TARGET}.cc)
+  endif()
+  if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
+    list(APPEND cu_srcs ${TARGET}.cu)
+  endif()
+  if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc)
+    list(APPEND cu_srcs ${TARGET}.cu.cc)
+  endif()
+
+  list(LENGTH cc_srcs cc_srcs_len)
+  if (WITH_GPU)
+    nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
+  elseif (WITH_ROCM)
+    hip_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
+  elseif(${cc_srcs_len} GREATER 0)
+    cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
+  endif()
+endfunction()

8 changes: 3 additions & 5 deletions paddle/fluid/distributed/common/utils.h
@@ -24,18 +24,16 @@
 #include <utility>
 #include <vector>

-#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"

 namespace paddle {
 namespace distributed {

 template <typename T>
-inline paddle::operators::math::BlasT<paddle::platform::CPUDeviceContext, T>
-GetBlas() {
+inline pten::funcs::BlasT<paddle::platform::CPUDeviceContext, T> GetBlas() {
   paddle::platform::CPUDeviceContext cpu_ctx;
-  return paddle::operators::math::GetBlas<paddle::platform::CPUDeviceContext,
-                                          T>(cpu_ctx);
+  return pten::funcs::GetBlas<paddle::platform::CPUDeviceContext, T>(cpu_ctx);
 }

 template <typename T>
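For readers following the call-site changes below, here is a minimal sketch of how the relocated helper is used. The names pten::funcs::GetBlas and the VADD signature come from the hunks in this PR; the surrounding driver code is illustrative only:

// Illustrative usage of the migrated BLAS helper (mirrors the call sites
// in this PR; the data setup is hypothetical).
#include <vector>

#include "paddle/fluid/platform/device_context.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"

void VaddDemo() {
  paddle::platform::CPUDeviceContext cpu_ctx;
  auto blas =
      pten::funcs::GetBlas<paddle::platform::CPUDeviceContext, float>(cpu_ctx);

  std::vector<float> x = {1.f, 2.f, 3.f};
  std::vector<float> y = {4.f, 5.f, 6.f};
  std::vector<float> z(x.size());

  // Element-wise z = x + y, the same primitive GeoCommunicator uses below.
  blas.VADD(static_cast<int>(x.size()), x.data(), y.data(), z.data());
}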
14 changes: 4 additions & 10 deletions paddle/fluid/distributed/ps/service/communicator/communicator.cc
@@ -1161,8 +1161,7 @@ void GeoCommunicator::SendDense(const CommContext &send_ctx) {
   t_delta->mutable_data<float>(t_latest.dims(), cpu_ctx.GetPlace());

   auto blas =
-      paddle::operators::math::GetBlas<platform::CPUDeviceContext, float>(
-          cpu_ctx);
+      pten::funcs::GetBlas<platform::CPUDeviceContext, float>(cpu_ctx);
   blas.VSUB(t_latest.numel(), t_latest.data<float>(),
             t_timestamp->data<float>(), t_delta->data<float>());

@@ -1201,8 +1200,7 @@ void GeoCommunicator::RecvDense(const CommContext &send_ctx) {
   t_delta->mutable_data<float>(t_latest->dims(), cpu_ctx.GetPlace());

   auto blas =
-      paddle::operators::math::GetBlas<platform::CPUDeviceContext, float>(
-          cpu_ctx);
+      pten::funcs::GetBlas<platform::CPUDeviceContext, float>(cpu_ctx);
   blas.VSUB(t_latest->numel(), t_pserver.data<float>(), t_old->data<float>(),
             t_delta->data<float>());
   blas.VADD(t_latest->numel(), t_latest->data<float>(),
@@ -1303,9 +1301,7 @@ void GeoCommunicator::SendSparse(const std::string &varname,
   t_delta->set_rows(sparse_ids);
   t_delta->set_height(t_latest.dims()[0]);

-  auto blas =
-      paddle::operators::math::GetBlas<platform::CPUDeviceContext, float>(
-          cpu_ctx);
+  auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, float>(cpu_ctx);
   float coefficient = 1.0 / static_cast<float>(trainers_);

   std::vector<float *> push_g_vec;
@@ -1371,9 +1367,7 @@ void GeoCommunicator::RecvSparse(const std::string &varname, int table_id,
   v_delta.resize(numel);

   paddle::platform::CPUDeviceContext cpu_ctx;
-  auto blas =
-      paddle::operators::math::GetBlas<platform::CPUDeviceContext, float>(
-          cpu_ctx);
+  auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, float>(cpu_ctx);

   for (auto j = 0; j < static_cast<int>(keys.size()); ++j) {
     VLOG(5) << "DEBUG GeoCommunicator::RecvSparse recv sparse key" << keys[j]
@@ -34,12 +34,12 @@ limitations under the License. */
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/framework/variable_helper.h"
-#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/string/split.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 #include "paddle/pten/kernels/funcs/math_function.h"

 #include "paddle/fluid/distributed/ps/service/ps_client.h"
10 changes: 5 additions & 5 deletions paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc
@@ -17,7 +17,7 @@
 #include <string>
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_version_registry.h"
-#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"

 namespace paddle {
 namespace framework {
@@ -121,14 +121,14 @@ static int BuildFusion(Graph* graph, const std::string& name_scope,

   // broadcast biases
   std::vector<float> ones(m, 1.0f);
-  paddle::operators::math::CBlas<float>::GEMM(
+  pten::funcs::CBlas<float>::GEMM(
       CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, 1, alpha, &ones[0], 1,
       &combined_biases[0], n, 0.0f, embeddings_data, n);

   // Wx*embeddings + biases
-  paddle::operators::math::CBlas<float>::GEMM(
-      CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, alpha,
-      embedding_data, k, weightx_data, n, beta, embeddings_data, n);
+  pten::funcs::CBlas<float>::GEMM(CblasRowMajor, CblasNoTrans, CblasNoTrans,
+                                  m, n, k, alpha, embedding_data, k,
+                                  weightx_data, n, beta, embeddings_data, n);
   op_desc.SetInput("Embeddings", {embeddings});

   op_desc.SetInput("H0", {});
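A note on the first GEMM above: multiplying an m-by-1 column of ones with the 1-by-n row combined_biases simply replicates the bias row into every row of embeddings_data; the second GEMM then accumulates embedding_data * weightx_data on top (its beta preserves the broadcast biases). A plain-loop equivalent of the broadcast step, for reference only:

// Reference-only loop for GEMM(ones(m x 1), biases(1 x n)) with
// alpha = 1 and beta = 0: every row of out receives the bias row.
void BroadcastBiasReference(int m, int n, const float* biases, float* out) {
  for (int i = 0; i < m; ++i) {
    for (int j = 0; j < n; ++j) {
      out[i * n + j] = biases[j];  // out is m x n, row-major
    }
  }
}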
6 changes: 3 additions & 3 deletions paddle/fluid/imperative/gradient_accumulator.cc
@@ -22,12 +22,12 @@
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/selected_rows_utils.h"
 #include "paddle/fluid/imperative/layer.h"
-#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
 #include "paddle/fluid/platform/complex.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/float16.h"
 #include "paddle/fluid/platform/profiler.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 #include "paddle/pten/kernels/funcs/math_function.h"
 #ifdef PADDLE_WITH_XPU
 #include "xpu/refactor/math.h"
@@ -85,7 +85,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
   void operator()(const platform::CPUPlace& place) const {
     platform::CPUDeviceContext* ctx = dynamic_cast<platform::CPUDeviceContext*>(
         platform::DeviceContextPool::Instance().Get(place));
-    auto blas = operators::math::GetBlas<platform::CPUDeviceContext, T>(*ctx);
+    auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, T>(*ctx);
     blas.AXPY(numel_, 1., x_, y_);
   }

@@ -117,7 +117,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
     platform::CUDADeviceContext* ctx =
         dynamic_cast<platform::CUDADeviceContext*>(
             platform::DeviceContextPool::Instance().Get(place));
-    auto blas = operators::math::GetBlas<platform::CUDADeviceContext, T>(*ctx);
+    auto blas = pten::funcs::GetBlas<platform::CUDADeviceContext, T>(*ctx);
     blas.AXPY(numel_, 1., x_, y_);
   }
 #else
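Both the CPU and CUDA branches reduce gradient accumulation to BLAS AXPY, i.e. y := alpha * x + y with alpha = 1, so the incoming gradient x_ is added into y_ in place. A reference loop for the semantics (illustrative; the real call dispatches to the device BLAS):

// Reference-only semantics of blas.AXPY(numel_, 1., x_, y_).
template <typename T>
void AxpyReference(int64_t n, T alpha, const T* x, T* y) {
  for (int64_t i = 0; i < n; ++i) {
    y[i] += alpha * x[i];  // alpha == 1: plain in-place accumulation
  }
}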
@@ -22,8 +22,8 @@
 #include "paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h"
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h"
 #include "paddle/fluid/operators/math/bert_encoder_functor.h"
-#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"

 namespace paddle {
 namespace inference {
2 changes: 1 addition & 1 deletion paddle/fluid/operators/activation_op.h
@@ -28,9 +28,9 @@ limitations under the License. */
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/float16.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #endif
6 changes: 3 additions & 3 deletions paddle/fluid/operators/addmm_op.h
@@ -19,7 +19,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/operators/eigen/eigen_function.h"
-#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 #include "paddle/pten/kernels/funcs/math_function.h"

 namespace ops = paddle::operators;
@@ -94,7 +94,7 @@ class AddMMKernel : public framework::OpKernel<T> {
     float alpha = context.template Attr<float>("Alpha");
     float beta = context.template Attr<float>("Beta");

-    auto blas = math::GetBlas<DeviceContext, T>(context);
+    auto blas = pten::funcs::GetBlas<DeviceContext, T>(context);

     // calc broadcast dim
     Array2 bcast_dims;
@@ -146,7 +146,7 @@ class AddMMGradKernel : public framework::OpKernel<T> {
     }

     auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
+    auto blas = pten::funcs::GetBlas<DeviceContext, T>(dev_ctx);
     if (dinput) {
       dinput->mutable_data<T>(ctx.GetPlace());
       total_elems = in_dims[0] * in_dims[1];
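For context on the Alpha/Beta attributes read above: addmm computes Out = Beta * Input + Alpha * (X matmul Y), with Input broadcast to the output shape. A reference loop for the unbroadcast 2-D case (illustrative only; not the kernel's actual code path):

// Reference-only AddMM for a 2-D input without broadcasting:
// out = beta * input + alpha * (x matmul y), all row-major.
void AddMMReference(int m, int n, int k, float alpha, float beta,
                    const float* input, const float* x, const float* y,
                    float* out) {
  for (int i = 0; i < m; ++i) {
    for (int j = 0; j < n; ++j) {
      float acc = 0.f;
      for (int p = 0; p < k; ++p) acc += x[i * k + p] * y[p * n + j];
      out[i * n + j] = beta * input[i * n + j] + alpha * acc;
    }
  }
}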
6 changes: 3 additions & 3 deletions paddle/fluid/operators/affine_grid_op.h
@@ -16,7 +16,7 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 #include "paddle/pten/kernels/funcs/math_function.h"

 namespace paddle {
@@ -122,7 +122,7 @@ class AffineGridOpKernel : public framework::OpKernel<T> {
     GetIdxMap<DeviceContext, T>(n, h, w, align_corners, &grid, ctx);
     // output = grid * theta.T
     // TODO(wanghaoshuang): Refine batched matrix multiply
-    auto blas = math::GetBlas<DeviceContext, T>(ctx);
+    auto blas = pten::funcs::GetBlas<DeviceContext, T>(ctx);
     for (int i = 0; i < n; ++i) {
       Tensor sliced_grid = grid.Slice(i, i + 1).Resize(
           {static_cast<int64_t>(h) * static_cast<int64_t>(w), 3});
@@ -165,7 +165,7 @@ class AffineGridGradOpKernel : public framework::OpKernel<T> {
     GetIdxMap<DeviceContext, T>(n, h, w, align_corners, &grid, ctx);
     // output = grid * theta.T
     // TODO(wanghaoshuang): Refine batched matrix multiply
-    auto blas = math::GetBlas<DeviceContext, T>(ctx);
+    auto blas = pten::funcs::GetBlas<DeviceContext, T>(ctx);
     for (int i = 0; i < n; ++i) {
       Tensor sliced_grid = grid.Slice(i, i + 1).Resize(
           {static_cast<int64_t>(h) * static_cast<int64_t>(w), 3});
2 changes: 1 addition & 1 deletion paddle/fluid/operators/atan2_op.h
@@ -17,10 +17,10 @@
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/float16.h"
 #include "paddle/fluid/platform/for_range.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"

 namespace paddle {
 namespace operators {
4 changes: 2 additions & 2 deletions paddle/fluid/operators/attention_lstm_op.cc
@@ -14,10 +14,10 @@ limitations under the License. */

 #include "paddle/fluid/operators/attention_lstm_op.h"
 #include <string>
-#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/cpu_vec.h"
 #include "paddle/fluid/operators/math/fc.h"
 #include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"

 namespace paddle {
 namespace operators {
@@ -373,7 +373,7 @@ class AttentionLSTMKernel : public framework::OpKernel<T> {
     T* lstm_x_data = lstm_x->mutable_data<T>(ctx.GetPlace());
     T* lstm_out_data = lstm_out->mutable_data<T>(ctx.GetPlace());

-    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(ctx);
+    auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, T>(ctx);

     // x(TxM) * fc (Mx1) part of atten_wgt(M+D)x1
     auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
6 changes: 3 additions & 3 deletions paddle/fluid/operators/batch_fc_op.cu
@@ -15,9 +15,9 @@ limitations under the License. */
 #include <string>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/operators/batch_fc_op.h"
-#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"

 namespace paddle {
 namespace operators {
@@ -112,7 +112,7 @@ class BatchFCCUDAKernel : public framework::OpKernel<T> {
     int64_t strideA = ins_num * in_dim;
     int64_t strideB = in_dim * out_dim;

-    auto blas = math::GetBlas<platform::CUDADeviceContext, T>(dev_ctx);
+    auto blas = pten::funcs::GetBlas<platform::CUDADeviceContext, T>(dev_ctx);
     blas.BatchedGEMM(transA, transB, ins_num, out_dim, in_dim, alpha, in_data,
                      w_data, beta, out_data, slot_pairs_num, strideA, strideB);
     add_bias<T>(ctx.cuda_device_context().stream(), out_data, slot_pairs_num,
@@ -165,7 +165,7 @@ class BatchFCGradOpCUDAKernel : public framework::OpKernel<T> {
     add_bias_grad<T>(ctx.cuda_device_context().stream(), dout_data,
                      slot_pairs_num, ins_num, out_dim, db_data);

-    auto blas = math::GetBlas<platform::CUDADeviceContext, T>(dev_ctx);
+    auto blas = pten::funcs::GetBlas<platform::CUDADeviceContext, T>(dev_ctx);
     T alpha = 1;
     T beta = 0;

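BatchedGEMM here launches slot_pairs_num independent GEMMs over operands laid out back-to-back, advancing the A and B pointers by strideA and strideB per batch (the output is assumed to advance by ins_num * out_dim). A reference loop for the strided-batched semantics, assuming row-major non-transposed operands (illustrative; the real call dispatches to cuBLAS):

// Reference-only strided-batched GEMM: C_b = alpha * A_b * B_b + beta * C_b,
// with A_b = A + b * strideA and B_b = B + b * strideB.
void StridedBatchedGemmReference(int batch, int m, int n, int k, float alpha,
                                 float beta, const float* A, const float* B,
                                 float* C, int64_t strideA, int64_t strideB) {
  for (int b = 0; b < batch; ++b) {
    const float* Ab = A + b * strideA;
    const float* Bb = B + b * strideB;
    float* Cb = C + static_cast<int64_t>(b) * m * n;  // assumed output stride
    for (int i = 0; i < m; ++i) {
      for (int j = 0; j < n; ++j) {
        float acc = 0.f;
        for (int p = 0; p < k; ++p) acc += Ab[i * k + p] * Bb[p * n + j];
        Cb[i * n + j] = alpha * acc + beta * Cb[i * n + j];
      }
    }
  }
}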
6 changes: 3 additions & 3 deletions paddle/fluid/operators/bilinear_tensor_product_op.h
@@ -16,7 +16,7 @@ limitations under the License. */

 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"

 namespace paddle {
 namespace operators {
@@ -61,7 +61,7 @@ class BilinearTensorProductKernel : public framework::OpKernel<T> {
       auto output_col_vec = output_mat.chip(i, 1);
       Tensor weight_mat =
           weight->Slice(i, i + 1).Resize(framework::make_ddim({x_dim, y_dim}));
-      math::GetBlas<DeviceContext, T>(dev_ctx).GEMM(
+      pten::funcs::GetBlas<DeviceContext, T>(dev_ctx).GEMM(
           CblasNoTrans, CblasNoTrans, batch_size, y_dim, x_dim, 1, x->data<T>(),
           weight_mat.data<T>(), 0, left_mul.data<T>());
       output_col_vec.device(place) =
@@ -127,7 +127,7 @@ class BilinearTensorProductGradKernel : public framework::OpKernel<T> {
       d_weight->mutable_data<T>(ctx.GetPlace());
     }

-    auto blas = math::GetBlas<DeviceContext, T>(ctx);
+    auto blas = pten::funcs::GetBlas<DeviceContext, T>(ctx);

     // Calculate the Output(X@Grad) and Output(Y@Grad).
     if (d_x || d_y || d_weight) {