diff --git a/log b/log deleted file mode 100644 index c02e10686b5fb..0000000000000 Binary files a/log and /dev/null differ diff --git a/paddle/fluid/operators/concat_op_npu.cc b/paddle/fluid/operators/concat_op_npu.cc index d242c9f8c3fbd..109007d737c15 100644 --- a/paddle/fluid/operators/concat_op_npu.cc +++ b/paddle/fluid/operators/concat_op_npu.cc @@ -122,8 +122,14 @@ namespace ops = paddle::operators; REGISTER_OP_NPU_KERNEL(concat, ops::ConcatNPUKernel, ops::ConcatNPUKernel, +#ifdef PADDLE_WITH_ASCEND_INT64 + ops::ConcatNPUKernel, +#endif ops::ConcatNPUKernel); REGISTER_OP_NPU_KERNEL(concat_grad, ops::ConcatGradNPUKernel, ops::ConcatGradNPUKernel, +#ifdef PADDLE_WITH_ASCEND_INT64 + ops::ConcatGradNPUKernel, +#endif ops::ConcatGradNPUKernel); diff --git a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc index 837bca6c2cf4e..709d69214c603 100644 --- a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc +++ b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc @@ -536,32 +536,20 @@ class CudnnBNAddReluTester { bn_bias->Resize({1, 1, 1, channels_}); // input - float *sum_ptr = sum->data(); - float *sum_of_square_ptr = sum_of_square->data(); - float *bn_scale_ptr = bn_scale->data(); - float *bn_bias_ptr = bn_bias->data(); - mean->Resize({1, 1, 1, channels_}); var->Resize({1, 1, 1, channels_}); // output - float *mean_ptr = mean->data(); - float *var_ptr = var->data(); - float *saved_mean_ptr = - saved_mean->mutable_data({1, 1, 1, channels_}, place); - float *saved_var_ptr = - saved_var->mutable_data({1, 1, 1, channels_}, place); - T *equiv_scale_ptr = - equiv_scale->mutable_data({1, 1, 1, channels_}, place); - T *equiv_bias_ptr = - equiv_bias->mutable_data({1, 1, 1, channels_}, place); + equiv_scale->Resize({1, 1, 1, channels_}); + equiv_bias->Resize({1, 1, 1, channels_}); + saved_mean->Resize({1, 1, 1, channels_}); + saved_var->Resize({1, 1, 1, channels_}); auto param_shape = framework::vectorize(bn_scale->dims()); op::CudnnBNStatsFinalize bn_op(ctx, param_shape); - bn_op.Forward(ctx, sum_ptr, sum_of_square_ptr, bn_scale_ptr, bn_bias_ptr, - saved_mean_ptr, saved_var_ptr, mean_ptr, var_ptr, - equiv_scale_ptr, equiv_bias_ptr, eps_, momentum_, ele_count_, - true); + bn_op.Forward(ctx, *sum, *sum_of_square, *bn_scale, *bn_bias, saved_mean, + saved_var, mean, var, equiv_scale, equiv_bias, eps_, + momentum_, ele_count_, true); } // Get forward results of CudnnBNStatsFinalize + CudnnScaleBiasAddRelu @@ -627,21 +615,13 @@ class CudnnBNAddReluTester { &saved_var_z, &equiv_scale_z, &equiv_bias_z); } - T *x_ptr = x.data(); - T *z_ptr = (fuse_add_ || has_shortcut_) ? z.data() : nullptr; - T *equiv_scale_x_ptr = equiv_scale_x.data(); - T *equiv_bias_x_ptr = equiv_bias_x.data(); - T *equiv_scale_z_ptr = has_shortcut_ ? equiv_scale_z.data() : nullptr; - T *equiv_bias_z_ptr = has_shortcut_ ? 
equiv_bias_z.data() : nullptr; - T *y_ptr = - y.mutable_data({batch_size_, height_, width_, channels_}, place); + y.Resize(framework::make_ddim({batch_size_, height_, width_, channels_})); int c = channels_; int64_t nhw = ele_count_; int32_t c_int32_elems = ((c + 63) & ~63) / 32; int32_t nhw_int32_elems = (nhw + 31) & ~31; - int32_t *bitmask_ptr = bitmask.mutable_data( - {nhw_int32_elems, c_int32_elems, 1}, place); + bitmask.Resize(framework::make_ddim({nhw_int32_elems, c_int32_elems, 1})); auto data_shape = framework::vectorize(x.dims()); auto param_shape = framework::vectorize(bn_scale_x.dims()); @@ -651,8 +631,8 @@ class CudnnBNAddReluTester { op::CudnnScaleBiasAddRelu sbar_op(ctx, act_type_, fuse_add_, has_shortcut_, data_shape, param_shape, bitmask_shape); - sbar_op.Forward(ctx, x_ptr, equiv_scale_x_ptr, equiv_bias_x_ptr, y_ptr, - bitmask_ptr, z_ptr, equiv_scale_z_ptr, equiv_bias_z_ptr); + sbar_op.Forward(ctx, x, equiv_scale_x, equiv_bias_x, z, equiv_scale_z, + equiv_bias_z, &y, &bitmask); TensorCopySync(mean_x, platform::CPUPlace(), cpu_mean_x); TensorCopySync(var_x, platform::CPUPlace(), cpu_var_x); @@ -697,19 +677,10 @@ class CudnnBNAddReluTester { saved_mean.Resize({1, 1, 1, channels_}); saved_var.Resize({1, 1, 1, channels_}); - T *dy_ptr = dy.data(); - T *x_ptr = x.data(); - float *bn_scale_ptr = bn_scale.data(); - float *bn_bias_ptr = bn_bias.data(); - float *saved_mean_ptr = saved_mean.data(); - float *saved_var_ptr = saved_var.data(); - int32_t *bitmask_ptr = bitmask.data(); - T *dx_ptr = - dx.mutable_data({batch_size_, height_, width_, channels_}, place); - T *dz_ptr = - dz.mutable_data({batch_size_, height_, width_, channels_}, place); - float *dscale_ptr = dscale.mutable_data({1, 1, 1, channels_}, place); - float *dbias_ptr = dbias.mutable_data({1, 1, 1, channels_}, place); + dx.Resize(framework::make_ddim({batch_size_, height_, width_, channels_})); + dz.Resize(framework::make_ddim({batch_size_, height_, width_, channels_})); + dscale.Resize(framework::make_ddim({1, 1, 1, channels_})); + dbias.Resize(framework::make_ddim({1, 1, 1, channels_})); auto data_shape = framework::vectorize(x.dims()); auto param_shape = framework::vectorize(bn_scale.dims()); @@ -718,9 +689,8 @@ class CudnnBNAddReluTester { std::string act_type = "relu"; op::CudnnScaleBiasAddRelu sbar_op(ctx, act_type, true, false, data_shape, param_shape, bitmask_shape); - sbar_op.Backward(ctx, dy_ptr, x_ptr, bn_scale_ptr, bn_bias_ptr, - saved_mean_ptr, saved_var_ptr, bitmask_ptr, dx_ptr, dz_ptr, - dscale_ptr, dbias_ptr, eps_); + sbar_op.Backward(ctx, dy, x, bn_scale, bn_bias, saved_mean, saved_var, + bitmask, &dx, &dz, &dscale, &dbias, eps_); TensorCopySync(dx, platform::CPUPlace(), cpu_dx); TensorCopySync(dz, platform::CPUPlace(), cpu_dz); diff --git a/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h b/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h index 7d4b24cd4fc3d..dc703f9a822b5 100644 --- a/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h +++ b/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h @@ -68,12 +68,13 @@ class CudnnBNStatsFinalize { } ~CudnnBNStatsFinalize() {} - void Forward(const platform::CUDADeviceContext &ctx, float *sum_ptr, - float *sum_of_squares_ptr, float *scale_ptr, float *bias_ptr, - float *saved_mean_ptr, float *saved_invstd_ptr, - float *running_mean_ptr, float *running_var_ptr, - T *equiv_scale_ptr, T *equiv_bias_ptr, double eps, - float momentum, int64_t ele_count, bool is_train) { + void Forward(const platform::CUDADeviceContext &ctx, const 
Tensor &sum, + const Tensor &sum_of_squares, const Tensor &scale, + const Tensor &bias, Tensor *saved_mean, Tensor *saved_invstd, + Tensor *running_mean, Tensor *running_var, Tensor *equiv_scale, + Tensor *equiv_bias, double eps, float momentum, + int64_t ele_count, bool is_train) { + auto place = ctx.GetPlace(); if (is_train) { TrainInit(ctx); } else { @@ -82,6 +83,17 @@ class CudnnBNStatsFinalize { auto &op = is_train ? train_op_ : inference_op_; // Set variant_param for both inference_op_ and train_op_ + float *sum_ptr = const_cast(sum.data()); + float *sum_of_squares_ptr = + const_cast(sum_of_squares.data()); + float *scale_ptr = const_cast(scale.data()); + float *bias_ptr = const_cast(bias.data()); + float *saved_mean_ptr = saved_mean->mutable_data(place); + float *saved_invstd_ptr = saved_invstd->mutable_data(place); + float *running_mean_ptr = running_mean->mutable_data(place); + float *running_var_ptr = running_var->mutable_data(place); + T *equiv_scale_ptr = equiv_scale->mutable_data(place); + T *equiv_bias_ptr = equiv_bias->mutable_data(place); op.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_SCALE, scale_ptr); op.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_BIAS, bias_ptr); op.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_RUNNING_MEAN, running_mean_ptr); diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h b/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h index 1a73281cb8dc6..9b9328a5ca620 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h +++ b/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h @@ -38,7 +38,8 @@ struct NormConvolutionArgs { compute_type = platform::CudnnDataType::type; } - void Set(const std::vector &input_shape, + void Set(const platform::CUDADeviceContext &ctx, + const std::vector &input_shape, const std::vector &filter_shape, const std::vector &output_shape, int padding, int stride, int dilation, int group) { @@ -61,12 +62,33 @@ struct NormConvolutionArgs { "The filter_shape is expected to store as nhwc, and " "h = w = 1 or 3. But recieved filter_shape is [%s].", framework::make_ddim(filter_shape))); + PADDLE_ENFORCE_EQ((filter_shape[0] % 32 == 0 && filter_shape[3] % 8 == 0), + true, + platform::errors::InvalidArgument( + "The input channel is expected to be a multiple of 8, " + "and the output channel is expected to be a multiple " + "of 32. But received input channel is %d, output " + "channel is %d.", + filter_shape[3], filter_shape[0])); PADDLE_ENFORCE_EQ( output_shape.size(), 4U, platform::errors::InvalidArgument( "The size of output_shape is expected to 4. But recieved " "filter_shape's size is %d, filter_shape is [%s].", output_shape.size(), framework::make_ddim(output_shape))); + is_support = IsSupport(ctx, filter_shape, stride, dilation, group); + PADDLE_ENFORCE_EQ( + is_support, true, + platform::errors::InvalidArgument( + "Current test is only supported in the platforms with " + "compatiblity greater than or equal to 70 and the kernel size " + "must be equal to 1 or 3. When the kernel size is 1, " + "the stride must be 1 if the compatiblity is equal to 70. " + "Besides, the dilation and group must be equal to 1. 
But recieved " + "compatiblity is %d, kernel size is %d, stride is %d, " + "dilation is %d, group is %d", + ctx.GetComputeCapability(), filter_shape[1], stride, dilation, + group)); for (size_t i = 0; i < input_shape.size(); ++i) { in_dims.push_back(input_shape[i]); @@ -89,6 +111,25 @@ struct NormConvolutionArgs { conv_desc.set(dtype, paddings, strides, dilations, false, group); } + bool IsSupport(const platform::CUDADeviceContext &ctx, + const std::vector &filter_shape, int stride, int dilation, + int group) { + int kernel_size = filter_shape[1]; + if (dilation != 1 || group != 1) { + return false; + } + if (ctx.GetComputeCapability() == 70) { + if ((kernel_size == 3) || ((kernel_size == 1) && (stride == 1))) { + return true; + } + } else if (ctx.GetComputeCapability() > 70) { + if ((kernel_size == 3) || (kernel_size == 1)) { + return true; + } + } + return false; + } + cudnnDataType_t dtype; cudnnTensorFormat_t format; cudnnDataType_t compute_type; @@ -104,6 +145,8 @@ struct NormConvolutionArgs { platform::TensorDescriptor out_desc; platform::TensorDescriptor out_stats_desc; platform::ConvolutionDescriptor conv_desc; + + bool is_support; }; template @@ -115,15 +158,16 @@ class CudnnNormConvolution { const std::vector &output_shape, const int &padding, const int &stride, const int &dilation, const int &group) { - args_.Set(input_shape, filter_shape, output_shape, padding, stride, + args_.Set(ctx, input_shape, filter_shape, output_shape, padding, stride, dilation, group); } ~CudnnNormConvolution() {} - void Forward(const platform::CUDADeviceContext &ctx, T *input_ptr, - T *filter_ptr, T *output_ptr, float *sum_ptr, - float *sum_of_squares_ptr) { + void Forward(const platform::CUDADeviceContext &ctx, const Tensor &input, + const Tensor &filter, Tensor *output, Tensor *sum, + Tensor *sum_of_squares) { auto cudnn_handle = ctx.cudnn_handle(); + auto place = ctx.GetPlace(); CudnnFusionOp *fwd_op = GetForwardOp(ctx); size_t workspace_size = RoundUp( @@ -132,12 +176,17 @@ class CudnnNormConvolution { // Set variant_param // input ptr + T *input_ptr = const_cast(input.data()); + T *filter_ptr = const_cast(filter.data()); fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_XDATA, input_ptr); fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_WDATA, filter_ptr); fwd_op->SetOpVariantParamAttrPtr( CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES, &workspace_size); // output ptr + T *output_ptr = output->mutable_data(place); + float *sum_ptr = sum->mutable_data(place); + float *sum_of_squares_ptr = sum_of_squares->mutable_data(place); fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_YDATA, output_ptr); fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_YSUM, sum_ptr); fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_YSQSUM, sum_of_squares_ptr); @@ -209,28 +258,34 @@ class CudnnNormConvolutionGrad { const std::vector &output_shape, const int &padding, const int &stride, const int &dilation, const int &group) { - args_.Set(input_shape, filter_shape, output_shape, padding, stride, + args_.Set(ctx, input_shape, filter_shape, output_shape, padding, stride, dilation, group); dgrad_algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; } ~CudnnNormConvolutionGrad() {} - void Backward(const platform::CUDADeviceContext &ctx, T *input_ptr, - T *output_grad_ptr, T *filter_ptr, T *input_grad_ptr, - T *filter_grad_ptr, bool use_addto = false) { - if (filter_grad_ptr) { - BackwardFilter(ctx, input_ptr, output_grad_ptr, filter_ptr, - filter_grad_ptr); + void Backward(const platform::CUDADeviceContext &ctx, const Tensor &input, + const Tensor &filter, const Tensor 
&output_grad, + Tensor *input_grad, Tensor *filter_grad, + bool use_addto = false) { + auto place = ctx.GetPlace(); + T *input_ptr = const_cast(input.data()); + T *filter_ptr = const_cast(filter.data()); + T *output_grad_ptr = const_cast(output_grad.data()); + + if (filter_grad) { + T *filter_grad_ptr = filter_grad->mutable_data(place); + BackwardFilter(ctx, output_grad_ptr, input_ptr, filter_grad_ptr); } - if (input_grad_ptr) { - BackwardData(ctx, input_ptr, output_grad_ptr, filter_ptr, input_grad_ptr, - use_addto); + if (input_grad) { + T *input_grad_ptr = input_grad->mutable_data(place); + BackwardData(ctx, output_grad_ptr, filter_ptr, input_grad_ptr, use_addto); } } private: - void BackwardFilter(const platform::CUDADeviceContext &ctx, T *input_ptr, - T *output_grad_ptr, T *filter_ptr, T *filter_grad_ptr) { + void BackwardFilter(const platform::CUDADeviceContext &ctx, + T *output_grad_ptr, T *input_ptr, T *filter_grad_ptr) { auto cudnn_handle = ctx.cudnn_handle(); CudnnFusionOp *wgrad_op = GetBackwardFilterOp(ctx); @@ -255,9 +310,8 @@ class CudnnNormConvolutionGrad { workspace_size); } - void BackwardData(const platform::CUDADeviceContext &ctx, T *input_ptr, - T *output_grad_ptr, T *filter_ptr, T *input_grad_ptr, - bool use_addto = false) { + void BackwardData(const platform::CUDADeviceContext &ctx, T *output_grad_ptr, + T *filter_ptr, T *input_grad_ptr, bool use_addto = false) { auto cudnn_handle = ctx.cudnn_handle(); size_t workspace_size = GetWorkspaceSizeBwdData(ctx); diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc index 4c14029b99c69..23983d447e478 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc +++ b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc @@ -229,15 +229,6 @@ class CudnnNormConvolutionTester { platform::DeviceContextPool::Instance().Get( platform::CUDAPlace(0))); - if (!Support(*ctx)) { - LOG(INFO) - << "Current test is only supported in the platforms with " - << "compatiblity greater than or equal to 70 and the kernel size " - << "must be equal to 1 or 3. 
Besides, when the kernel size is 1, " - << "the stride must be 1 if the compatiblity is equal to 70."; - return; - } - framework::Tensor cpu_output_base; framework::Tensor cpu_sum_base; framework::Tensor cpu_sum_of_square_base; @@ -325,14 +316,10 @@ class CudnnNormConvolutionTester { TensorCopySync(cpu_input_, place, &input); TensorCopySync(cpu_filter_nhwc_, place, &filter_nhwc); - T *input_ptr = input.data(); - T *filter_ptr = filter_nhwc.data(); - T *output_ptr = output.mutable_data( - {batch_size_, out_height_, out_width_, output_channels_}, place); - float *sum_ptr = - sum.mutable_data({1, 1, 1, output_channels_}, place); - float *sum_of_square_ptr = - sum_of_square.mutable_data({1, 1, 1, output_channels_}, place); + output.Resize(framework::make_ddim( + {batch_size_, out_height_, out_width_, output_channels_})); + sum.Resize(framework::make_ddim({1, 1, 1, output_channels_})); + sum_of_square.Resize(framework::make_ddim({1, 1, 1, output_channels_})); auto input_shape = framework::vectorize(input.dims()); auto filter_shape = framework::vectorize(filter_nhwc.dims()); @@ -340,8 +327,7 @@ class CudnnNormConvolutionTester { op::CudnnNormConvolution conv_op(ctx, input_shape, filter_shape, output_shape, padding_, stride_, dilation_, group_); - conv_op.Forward(ctx, input_ptr, filter_ptr, output_ptr, sum_ptr, - sum_of_square_ptr); + conv_op.Forward(ctx, input, filter_nhwc, &output, &sum, &sum_of_square); TensorCopySync(output, platform::CPUPlace(), cpu_output); TensorCopySync(sum, platform::CPUPlace(), cpu_sum); @@ -362,11 +348,8 @@ class CudnnNormConvolutionTester { TensorCopySync(cpu_filter_nhwc_, place, &filter_nhwc); TensorCopySync(cpu_output_grad_, place, &output_grad); - T *input_ptr = input.data(); - T *filter_ptr = filter_nhwc.data(); - T *output_grad_ptr = output_grad.data(); - T *input_grad_ptr = input_grad.mutable_data(input.dims(), place); - T *filter_grad_ptr = filter_grad.mutable_data(filter_nhwc.dims(), place); + input_grad.Resize(input.dims()); + filter_grad.Resize(filter_nhwc.dims()); auto input_shape = framework::vectorize(input.dims()); auto filter_shape = framework::vectorize(filter_nhwc.dims()); @@ -374,26 +357,13 @@ class CudnnNormConvolutionTester { op::CudnnNormConvolutionGrad conv_grad_op(ctx, input_shape, filter_shape, output_shape, padding_, stride_, dilation_, group_); - conv_grad_op.Backward(ctx, input_ptr, output_grad_ptr, filter_ptr, - input_grad_ptr, filter_grad_ptr); + conv_grad_op.Backward(ctx, input, filter_nhwc, output_grad, &input_grad, + &filter_grad); TensorCopySync(input_grad, platform::CPUPlace(), cpu_input_grad); TensorCopySync(filter_grad, platform::CPUPlace(), cpu_filter_grad); } - bool Support(const platform::CUDADeviceContext &ctx) { - if (ctx.GetComputeCapability() == 70) { - if ((kernel_size_ == 3) || ((kernel_size_ == 1) && (stride_ == 1))) { - return true; - } - } else if (ctx.GetComputeCapability() > 70) { - if ((kernel_size_ == 3) || (kernel_size_ == 1)) { - return true; - } - } - return false; - } - private: int batch_size_; int height_; @@ -477,6 +447,15 @@ TEST(CudnnNormConvFp16, K1S2O4) { CudnnNormConvolutionTester test( batch_size, height, width, input_channels, output_channels, kernel_size, stride); - test.CheckForward(1e-3, true); - test.CheckBackward(1e-3); + platform::CUDADeviceContext *ctx = static_cast( + platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); + + if (ctx->GetComputeCapability() <= 70) { + ASSERT_THROW(test.CheckForward(1e-3, true), + paddle::platform::EnforceNotMet); + 
ASSERT_THROW(test.CheckBackward(1e-3), paddle::platform::EnforceNotMet); + } else { + ASSERT_NO_THROW(test.CheckForward(1e-3, true)); + ASSERT_NO_THROW(test.CheckBackward(1e-3)); + } } diff --git a/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h b/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h index 2fdb3635e2e14..b48c964d264ad 100644 --- a/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h +++ b/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h @@ -107,25 +107,33 @@ class CudnnScaleBiasAddRelu { ~CudnnScaleBiasAddRelu() {} - void Forward(const platform::CUDADeviceContext &ctx, T *x_ptr, T *x_scale_ptr, - T *x_bias_ptr, T *out_ptr, int32_t *bitmask_ptr, - T *z_ptr = nullptr, T *z_scale_ptr = nullptr, - T *z_bias_ptr = nullptr) { + void Forward(const platform::CUDADeviceContext &ctx, const Tensor &x, + const Tensor &x_scale, const Tensor &x_bias, const Tensor &z, + const Tensor &z_scale, const Tensor &z_bias, Tensor *out, + Tensor *bitmask) { ForwardInit(ctx); auto handle = ctx.cudnn_handle(); + auto place = ctx.GetPlace(); auto workspace_handle = ctx.cudnn_workspace_handle(); fwd_workspace_byte_ = fwd_op_.GetWorkspaceSizeInBytes(handle); // Set variant_param // input ptr + T *x_ptr = const_cast(x.data()); + T *x_scale_ptr = const_cast(x_scale.data()); + T *x_bias_ptr = const_cast(x_bias.data()); fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_XDATA, x_ptr); fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_EQSCALE, x_scale_ptr); fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_EQBIAS, x_bias_ptr); if (has_shortcut_) { + T *z_ptr = const_cast(z.data()); + T *z_scale_ptr = const_cast(z_scale.data()); + T *z_bias_ptr = const_cast(z_bias.data()); fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_ZDATA, z_ptr); fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_Z_EQSCALE, z_scale_ptr); fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_Z_EQBIAS, z_bias_ptr); } else { if (fused_add_) { + T *z_ptr = const_cast(z.data()); fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_ZDATA, z_ptr); } } @@ -134,6 +142,8 @@ class CudnnScaleBiasAddRelu { CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES, &fwd_workspace_byte_); // output ptr + T *out_ptr = out->mutable_data(place); + int32_t *bitmask_ptr = bitmask->mutable_data(place); fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_YDATA, out_ptr); fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_ACTIVATION_BITMASK, bitmask_ptr); @@ -147,16 +157,30 @@ class CudnnScaleBiasAddRelu { fwd_workspace_byte_); } - void Backward(const platform::CUDADeviceContext &ctx, T *dy_ptr, T *x_ptr, - float *scale_ptr, float *bias_ptr, float *saved_mean_ptr, - float *saved_invstd_ptr, int32_t *bitmask_ptr, T *dx_ptr, - T *dz_ptr, float *dscale_ptr, float *dbias_ptr, double eps) { + void Backward(const platform::CUDADeviceContext &ctx, const Tensor &dy, + const Tensor &x, const Tensor &scale, const Tensor &bias, + const Tensor &saved_mean, const Tensor &saved_invstd, + const Tensor &bitmask, Tensor *dx, Tensor *dz, Tensor *dscale, + Tensor *dbias, double eps) { BackwardInit(ctx); auto handle = ctx.cudnn_handle(); + auto place = ctx.GetPlace(); auto workspace_handle = ctx.cudnn_workspace_handle(); bwd_workspace_byte_ = bwd_op_.GetWorkspaceSizeInBytes(handle); // Set variant_param // input ptr + T *dy_ptr = const_cast(dy.data()); + T *x_ptr = const_cast(x.data()); + float *scale_ptr = const_cast(scale.data()); + float *bias_ptr = const_cast(bias.data()); + float *saved_mean_ptr = const_cast(saved_mean.data()); + float *saved_invstd_ptr = const_cast(saved_invstd.data()); + int32_t *bitmask_ptr = 
const_cast(bitmask.data()); + T *dx_ptr = dx->mutable_data(place); + T *dz_ptr = dz ? dz->mutable_data(place) : nullptr; + float *dscale_ptr = dscale ? dscale->mutable_data(place) : nullptr; + float *dbias_ptr = dbias ? dbias->mutable_data(place) : nullptr; + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_XDATA, x_ptr); bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_DYDATA, dy_ptr); bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_SCALE, scale_ptr); diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cc b/paddle/fluid/operators/optimizers/lars_momentum_op.cc index 6842c07c1d9dc..65be35843bdf9 100644 --- a/paddle/fluid/operators/optimizers/lars_momentum_op.cc +++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cc @@ -33,7 +33,6 @@ class LarsMomentumOp : public framework::OperatorWithKernel { "LarsMomentum"); OP_INOUT_CHECK(ctx->HasOutputs("VelocityOut"), "Output", "VelocityOut", "LarsMomentum"); - PADDLE_ENFORCE_EQ( ctx->GetInputsVarType("Param").front(), framework::proto::VarType::LOD_TENSOR, @@ -76,7 +75,6 @@ class LarsMomentumOp : public framework::OperatorWithKernel { OP_INOUT_CHECK(ctx->HasOutputs("MasterParamOut"), "Output", "MasterParamOut", "LarsMomentumMultiPrecision"); } - for (size_t i = 0; i < lr_dims.size(); ++i) { PADDLE_ENFORCE_EQ(framework::product(lr_dims[i]), 1, platform::errors::InvalidArgument( @@ -104,8 +102,8 @@ class LarsMomentumOp : public framework::OperatorWithKernel { param_dim[i], velocity_dim[i], platform::errors::InvalidArgument( "Input(Param) and Input(Velocity) of LarsMomentumOp shall have " - "same dimension. But Param's dim is [%s] and Velocity`s dim " - "is [%s].", + "same dimension. But Param dim [%s] differs with Velocity dim " + "[%s].", param_dim[i], velocity_dim[i])); } ctx->SetOutputsDim("ParamOut", param_dim); @@ -165,7 +163,7 @@ class LarsMomentumOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(0.001); AddAttr>( "lars_weight_decay", - "(float, default 0.0005) Merged LARS weight decay params") + "(std::vector, default 0.0005) LARS weight decay params") .SetDefault({0.0005}); AddAttr("epsilon", "(float, default 0.0) epsilon to avoid Division by Zero.") diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cu b/paddle/fluid/operators/optimizers/lars_momentum_op.cu index c6de3bdd15dde..4fb501536168a 100644 --- a/paddle/fluid/operators/optimizers/lars_momentum_op.cu +++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cu @@ -33,6 +33,18 @@ limitations under the License. */ namespace paddle { namespace operators { +template +using MultiPrecisionType = typename details::MPTypeTrait::Type; + +__device__ __forceinline__ float Sqrt(float x) { return sqrtf(x); } +__device__ __forceinline__ double Sqrt(double x) { return sqrt(x); } +__device__ __forceinline__ float Fma(float x, float y, float z) { + return fmaf(x, y, z); +} +__device__ __forceinline__ double Fma(double x, double y, double z) { + return fma(x, y, z); +} + template class LarsThreadConfig { public: @@ -61,7 +73,6 @@ class LarsThreadConfig { grid_for_norm = std::min(grid, LARS_BLOCK_SIZE); const int grid_stride = grid_for_norm * LARS_BLOCK_SIZE; repeat_times = (numel + grid_stride - 1) / grid_stride - 1; - // Determine to read 4 fp16 or float data once, but 2 double data once. 
grid_for_lars = std::is_same::value @@ -71,18 +82,6 @@ class LarsThreadConfig { #endif }; -template -using MultiPrecisionType = typename details::MPTypeTrait::Type; - -__device__ __forceinline__ float Sqrt(float x) { return sqrtf(x); } -__device__ __forceinline__ double Sqrt(double x) { return sqrt(x); } -__device__ __forceinline__ float Fma(float x, float y, float z) { - return fmaf(x, y, z); -} -__device__ __forceinline__ double Fma(double x, double y, double z) { - return fma(x, y, z); -} - template __device__ inline void VectorizeLarsUpdate( const T* __restrict__ grad, const MT* __restrict__ param, @@ -115,7 +114,6 @@ __device__ inline void VectorizeLarsUpdate( VecType grad_data = grad_vec[i]; VecMType param_data = param_vec[i]; VecMType velocity_data = velocity_vec[i]; - #pragma unroll for (int j = 0; j < VecSize; ++j) { MT grad_val = static_cast(grad_data[j]) * rescale_grad; @@ -147,16 +145,11 @@ __device__ inline void VectorizeLarsUpdate( } #if CUDA_VERSION >= 11000 -/* Once CUDA_VERSION is beyond 11.0, cooperative_groups can be involved in - without adding --rdc=true compile flag, then L2_norm cuda kernel can be - set as __device__ kernel and argument type cooperative_groups::grid_group - also can be involved. - On the contrary, the compile flag shall be set in old version, which may - affect the cuda kernel performance in paddle, consequently, L2_norm kernel - shall be set as __global__ kernel. -*/ -// TODO(limingshu): declaration of cooperative_groups wapper is invalid in host -// codes. +/* Once CUDA_VERSION is beyond 11, cooperative_groups can be involved in without + --rdc=true compile flag, then L2_norm kernel can be set with __device__ and + cooperative_groups::grid_group also can be involved. Otherwise, adding this + flag may affect much, L2_norm kernel shall be set with __global__.*/ +// TODO(limingshu): declaration of cooperative_groups wapper is invalid in host. template __forceinline__ __device__ void L2NormKernel( const cooperative_groups::grid_group* cg, @@ -168,36 +161,32 @@ __global__ void L2NormKernel( MT* __restrict__ p_buffer, MT* __restrict__ g_buffer, const int64_t numel, const int repeat_times, const MT rescale_grad, const int thresh = 0, MT* __restrict__ p_n = nullptr, MT* __restrict__ g_n = nullptr) { + __shared__ MT s_buffer[2]; int tid = threadIdx.x + blockDim.x * blockIdx.x; int grid_stride = LARS_BLOCK_SIZE * gridDim.x; const MT rescale_pow = rescale_grad * rescale_grad; - - __shared__ MT s_buffer[2]; s_buffer[0] = static_cast(0); s_buffer[1] = static_cast(0); - MT p_arr_val = static_cast(0); - MT g_arr_val = static_cast(0); + MT p_tmp = static_cast(0); + MT g_tmp = static_cast(0); if (repeat_times == 0) { if (tid < numel) { - p_arr_val = static_cast(p_data[tid]); - g_arr_val = static_cast(g_data[tid]); + p_tmp = static_cast(p_data[tid]); + g_tmp = static_cast(g_data[tid]); } - s_buffer[0] += math::blockReduceSum(p_arr_val * p_arr_val, FINAL_MASK); - s_buffer[1] += math::blockReduceSum(g_arr_val * g_arr_val, FINAL_MASK); + s_buffer[0] += math::blockReduceSum(p_tmp * p_tmp, FINAL_MASK); + s_buffer[1] += math::blockReduceSum(g_tmp * g_tmp, FINAL_MASK); } else { - /* To avoid occupy too much temp buffer. Hence, slice the whole data into 2 - parts, the front of them whose quantity is excatly multiple of grid-thread - number, and this part of data is delt in for loop, the rest of data is delt - with another step to avoid visiting data address beyond bound. */ + /* Avoid occupy too much temp buffer. 
Slice the whole data into 2 parts, + the front of data, whose quantity is exactly a multiple of the grid-thread + number, is dealt with in the for loop; the rest is dealt with in another step. */ for (int i = 0; i < repeat_times; ++i) { - p_arr_val = static_cast(p_data[tid]); - g_arr_val = static_cast(g_data[tid]); + p_tmp = static_cast(p_data[tid]); + g_tmp = static_cast(g_data[tid]); tid += grid_stride; - s_buffer[0] += - math::blockReduceSum(p_arr_val * p_arr_val, FINAL_MASK); - s_buffer[1] += - math::blockReduceSum(g_arr_val * g_arr_val, FINAL_MASK); + s_buffer[0] += math::blockReduceSum(p_tmp * p_tmp, FINAL_MASK); + s_buffer[1] += math::blockReduceSum(g_tmp * g_tmp, FINAL_MASK); __syncthreads(); } MT p_val = 0; @@ -216,8 +205,7 @@ __global__ void L2NormKernel( g_buffer[blockIdx.x] = s_buffer[1]; } #if CUDA_VERSION >= 11000 - // Grid sync for completely writring partial result back to gloabl memory - cg->sync(); + cg->sync(); // Grid sync for writing partial result to global memory MT p_part_sum = threadIdx.x < gridDim.x ? p_buffer[threadIdx.x] : 0; MT g_part_sum = threadIdx.x < gridDim.x ? g_buffer[threadIdx.x] : 0; *p_n = Sqrt(math::blockReduceSum(p_part_sum, FINAL_MASK)); *g_n = Sqrt(math::blockReduceSum(g_part_sum, FINAL_MASK)); @@ -249,8 +237,7 @@ __forceinline__ __device__ void MomentumUpdate( } else { if (std::is_same::value || std::is_same::value) { - /* TODO(limingshu): once type(param) is just fp16 type, - pointer cast may damage memory accessing. */ + /* TODO(limingshu): pointer cast may damage memory access for fp16 */ VectorizeLarsUpdate( grad, reinterpret_cast(param), velocity, param_out, velocity_out, mu, local_lr, lars_weight_decay, rescale_grad, tid, @@ -265,10 +252,8 @@ } #if CUDA_VERSION >= 11000 - template struct LarsParamWarpper { - public: int64_t numel_arr[LARS_MAX_MERGED_OPS]; int repeat_arr[LARS_MAX_MERGED_OPS]; const T* __restrict__ p_arr[LARS_MAX_MERGED_OPS]; @@ -363,8 +348,7 @@ inline void SeparatedLarsMomentumOpCUDAKernel( param_data, grad_data, velocity_data, param_out_data, velocity_out_data, master_param_data, master_out_data, lr, p_buffer, g_buffer, mu, lars_coeff, weight_decay, epsilon, rescale_grad, 0, - lars_thread_config.grid_for_norm, numel, - is_amp); // 0 is just a placeholder. 
+ lars_thread_config.grid_for_norm, numel, is_amp); } template @@ -387,13 +371,13 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel { MT lars_coeff = static_cast(ctx.Attr("lars_coeff")); MT epsilon = static_cast(ctx.Attr("epsilon")); MT rescale_grad = static_cast(ctx.Attr("rescale_grad")); + auto weight_decay_arr = ctx.Attr>("lars_weight_decay"); auto grad = ctx.MultiInput("Grad"); auto param = ctx.MultiInput("Param"); auto velocity = ctx.MultiInput("Velocity"); auto param_out = ctx.MultiOutput("ParamOut"); auto velocity_out = ctx.MultiOutput("VelocityOut"); auto learning_rate = ctx.MultiInput("LearningRate"); - auto weight_decay_arr = ctx.Attr>("lars_weight_decay"); auto master_param = ctx.MultiInput("MasterParam"); auto master_param_out = ctx.MultiOutput("MasterParamOut"); @@ -401,26 +385,21 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel { int op_num = grad.size(); #if CUDA_VERSION >= 11000 if (op_num > 1) { + LarsParamWarpper lars_warpper; PADDLE_ENFORCE_LT( op_num, LARS_MAX_MERGED_OPS, platform::errors::InvalidArgument( "The maximum number of merged-ops supported is (%d), but" "lars op required for trainning this model is (%d)\n", LARS_MAX_MERGED_OPS, op_num)); - LarsParamWarpper lars_warpper; - /*Once model trainning with lars optimizer, whose principal implementation - is achieved by following two steps: + /* Implementation of lars optimizer consists of following two steps: 1. Figure out the L2 norm statistic result of grad data and param data. 2. Update param and velocity with usage of L2 norm statistic result. - Since the step1 is l2 norm statistic, grid level reduce is needed. To - achieve this and continuous calculation of step 2 in only one global - lanuch, essential basis is to control all grid-threads while running. - Instead of normal lanuch form, cuda provides `cudaLaunchCooperativeKernel` - api : + Step1 and step2 can be merged with api provided by nvida + cudaLaunchCooperativeKernel: - The thread quantity shall less than pyhsical SM limited threads - - Launches a device function where thread-block can cooperate and - synchronize as they execute. */ + - Launche as thread-block can synchronizlly execute. 
*/ cudaOccupancyMaxActiveBlocksPerMultiprocessor( &num_blocks_per_sm, MergedMomentumLarsKernel, LARS_BLOCK_SIZE, sizeof(MT) << 1); @@ -500,8 +479,6 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel { num_blocks_per_sm); int repeat_times = lars_thread_config.GetRepeatTimes(numel); int thresh = 0; - - // Uniform kernel parameter for cudaLaunchCooperativeKernel void* cuda_param[] = { reinterpret_cast(¶m_data), reinterpret_cast(&grad_data), diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.h b/paddle/fluid/operators/optimizers/lars_momentum_op.h index c5c83f6c664eb..da1f80eae9b02 100644 --- a/paddle/fluid/operators/optimizers/lars_momentum_op.h +++ b/paddle/fluid/operators/optimizers/lars_momentum_op.h @@ -30,7 +30,6 @@ class LarsMomentumOpKernel : public framework::OpKernel { auto learning_rate = ctx.MultiInput("LearningRate"); auto grad = ctx.MultiInput("Grad"); auto weight_decay_arr = ctx.Attr>("lars_weight_decay"); - T mu = static_cast(ctx.Attr("mu")); T lars_coeff = ctx.Attr("lars_coeff"); T epsilon = ctx.Attr("epsilon"); @@ -47,7 +46,6 @@ class LarsMomentumOpKernel : public framework::OpKernel { auto p = framework::EigenVector::Flatten(*(param[i])); auto v = framework::EigenVector::Flatten(*(velocity[i])); auto g = framework::EigenVector::Flatten(*(grad[i])); - framework::Tensor p_norm_t, g_norm_t; p_norm_t.Resize({1}); g_norm_t.Resize({1}); diff --git a/python/paddle/dataset/image.py b/python/paddle/dataset/image.py index 4fd7dc0d37ff8..c36213282c59c 100644 --- a/python/paddle/dataset/image.py +++ b/python/paddle/dataset/image.py @@ -39,10 +39,12 @@ if six.PY3: import subprocess import sys - if sys.platform == 'win32': - interpreter = sys.exec_prefix + "\\" + "python.exe" - else: - interpreter = sys.executable + import os + interpreter = sys.executable + # Note(zhouwei): if use Python/C 'PyRun_SimpleString', 'sys.executable' + # will be the C++ execubable on Windows + if sys.platform == 'win32' and 'python.exe' not in interpreter: + interpreter = sys.exec_prefix + os.sep + 'python.exe' import_cv2_proc = subprocess.Popen( [interpreter, "-c", "import cv2"], stdout=subprocess.PIPE, diff --git a/python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py b/python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py index 5832a17d70fb8..a95616a75c203 100644 --- a/python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py +++ b/python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py @@ -245,28 +245,31 @@ def do_test(use_nesterov=False, optimizer=""): def do_merge_test(optimizer=""): if optimizer is "Lars": suffix = "use Lars " - with self.scope_prog_guard(): - print("-----------------FP16 Merged Train {}-----------------". - format(suffix)) - train_loss_fp16_merge, test_loss_fp16_merge = train( - use_pure_fp16=True, - open_merge_option=True, - optimizer=optimizer) - with self.scope_prog_guard(): - print("-----------------FP32 Merged Train {}-----------------". - format(suffix)) - train_loss_fp32_merge, test_loss_fp32_merge = train( - use_pure_fp16=False, - open_merge_option=True, - optimizer=optimizer) - - with self.scope_prog_guard(): - print("-----------------FP32 Validation {}-----------------". - format(suffix)) - train_loss_fp32, test_loss_fp32 = train( - use_pure_fp16=False, - open_merge_option=False, - optimizer=optimizer) + with self.scope_prog_guard(): + print( + "-----------------FP16 Merged Train {}---------------". 
+ format(suffix)) + train_loss_fp16_merge, test_loss_fp16_merge = train( + use_pure_fp16=True, + open_merge_option=True, + optimizer=optimizer) + with self.scope_prog_guard(): + print( + "-----------------FP32 Merged Train {}---------------". + format(suffix)) + train_loss_fp32_merge, test_loss_fp32_merge = train( + use_pure_fp16=False, + open_merge_option=True, + optimizer=optimizer) + + with self.scope_prog_guard(): + print( + "-----------------FP32 Validation {}-----------------". + format(suffix)) + train_loss_fp32, test_loss_fp32 = train( + use_pure_fp16=False, + open_merge_option=False, + optimizer=optimizer) self.assertTrue( np.allclose( diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/error.py b/python/paddle/fluid/dygraph/dygraph_to_static/error.py index 2a975bf00d1d2..273961e27efba 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/error.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/error.py @@ -54,27 +54,9 @@ def attach_error_data(error, in_runtime=False): setattr(error, ERROR_DATA, error_data) - remove_static_file() return error -def remove_static_file(): - """ - Removes temporary files created during the transformation of dygraph to static graph. - """ - del_files = set() - for loc in global_origin_info_map: - static_filepath = loc[0] - del_files.add(static_filepath) - - filename, extension = os.path.splitext(static_filepath) - del_files.add(filename + ".pyc") - - for filepath in del_files: - if os.path.exists(filepath): - os.remove(filepath) - - class TraceBackFrame(OriginInfo): """ Traceback frame information. diff --git a/python/paddle/fluid/tests/unittests/npu/test_concat_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_concat_op_npu.py index 8f11d00ccabf6..f9eecefdfb237 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_concat_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_concat_op_npu.py @@ -18,7 +18,7 @@ import unittest import sys sys.path.append("..") -from op_test import OpTest +from op_test import OpTest, skip_check_grad_ci import paddle import paddle.fluid as fluid @@ -26,7 +26,7 @@ SEED = 2021 -class TestConcat(OpTest): +class TestConcatOp(OpTest): def setUp(self): self.set_npu() self.op_type = "concat" @@ -56,54 +56,161 @@ def init_dtype(self): def test_check_output(self): self.check_output_with_place(self.place) + def test_check_grad(self): + self.check_grad_with_place(self.place, ['x0', 'x2'], 'Out') + self.check_grad_with_place(self.place, ['x1'], 'Out') + self.check_grad_with_place(self.place, ['x2'], 'Out') + def init_test_data(self): self.x0 = np.random.random((1, 4, 50)).astype(self.dtype) self.x1 = np.random.random((2, 4, 50)).astype(self.dtype) self.x2 = np.random.random((3, 4, 50)).astype(self.dtype) self.axis = 0 + +class TestConcatOp2(TestConcatOp): + def init_test_data(self): + self.x0 = np.random.random((2, 3, 4, 5)).astype(self.dtype) + self.x1 = np.random.random((2, 3, 4, 5)).astype(self.dtype) + self.x2 = np.random.random((2, 3, 4, 5)).astype(self.dtype) + self.axis = 1 + + +@skip_check_grad_ci( + reason="The function 'check_grad' for large inputs is too slow.") +class TestConcatOp3(TestConcatOp): + def init_test_data(self): + self.x0 = np.random.random((1, 256, 170, 256)).astype(self.dtype) + self.x1 = np.random.random((1, 128, 170, 256)).astype(self.dtype) + self.x2 = np.random.random((1, 128, 170, 256)).astype(self.dtype) + self.axis = 1 + def test_check_grad(self): - self.check_grad_with_place(self.place, ['x0', 'x2'], 'Out') - self.check_grad_with_place(self.place, ['x1'], 'Out') - 
self.check_grad_with_place(self.place, ['x2'], 'Out') + pass + + +@skip_check_grad_ci( + reason="This test will meet fetch error when there is a null grad. The detailed information is in PR#17015." +) +class TestConcatOp4(TestConcatOp): + def init_test_data(self): + self.x0 = np.random.random((2, 3, 4, 5)).astype(self.dtype) + self.x1 = np.random.random((2, 3, 4, 5)).astype(self.dtype) + self.x2 = np.random.random((0, 3, 4, 5)).astype(self.dtype) + self.axis = 0 + + def test_check_grad(self): + pass + + +class TestConcatOp5(TestConcatOp): + def init_test_data(self): + self.x0 = np.random.random((5, 1, 4, 5)).astype(self.dtype) + self.x1 = np.random.random((5, 2, 4, 5)).astype(self.dtype) + self.x2 = np.random.random((5, 3, 4, 5)).astype(self.dtype) + self.axis = -3 + + +#----------------Concat Fp16---------------- +def create_test_fp16(parent): + class TestConcatFp16(parent): + def init_dtype(self): + self.dtype = np.float16 + + cls_name = "{0}_{1}".format(parent.__name__, "Fp16") + TestConcatFp16.__name__ = cls_name + globals()[cls_name] = TestConcatFp16 + + +create_test_fp16(TestConcatOp) +create_test_fp16(TestConcatOp2) +create_test_fp16(TestConcatOp3) +create_test_fp16(TestConcatOp4) +create_test_fp16(TestConcatOp5) + + +#----------------Concat Int64---------------- +def create_test_int64(parent): + class TestConcatInt64(parent): + def init_dtype(self): + self.dtype = np.int64 + def test_check_grad(self): + pass + + cls_name = "{0}_{1}".format(parent.__name__, "Int64") + TestConcatInt64.__name__ = cls_name + globals()[cls_name] = TestConcatInt64 + + +create_test_int64(TestConcatOp) +create_test_int64(TestConcatOp2) +create_test_int64(TestConcatOp3) +create_test_int64(TestConcatOp4) +create_test_int64(TestConcatOp5) + + +class TestConcatAPIWithLoDTensorArray(unittest.TestCase): + """ + Test concat api when the input(x) is a LoDTensorArray. + """ -class TestConcatFP16(OpTest): def setUp(self): self.set_npu() - self.op_type = "concat" self.place = paddle.NPUPlace(0) - self.init_dtype() - self.init_test_data() - - self.inputs = {'X': [('x0', self.x0), ('x1', self.x1), ('x2', self.x2)]} - self.attrs = {'axis': self.axis} - if self.axis < 0: - self.actual_axis = self.axis + len(self.x0.shape) - self.actual_axis = self.actual_axis if self.actual_axis > 0 else 0 + self.axis = 1 + self.iter_num = 3 + self.input_shape = [2, 3] + self.x = np.random.random(self.input_shape).astype("float32") + + def set_program(self, use_fluid_api): + paddle.enable_static() + if use_fluid_api: + self.program = fluid.Program() + with fluid.program_guard(self.program): + input = fluid.layers.assign(self.x) + tensor_array = fluid.layers.create_array(dtype='float32') + zero = fluid.layers.fill_constant( + shape=[1], value=0, dtype="int64") + + for i in range(self.iter_num): + fluid.layers.array_write(input, zero + i, tensor_array) + + self.out_var = fluid.layers.concat(tensor_array, axis=self.axis) else: - self.actual_axis = self.axis + self.program = paddle.static.Program() + with paddle.static.program_guard(self.program): + input = paddle.assign(self.x) + tensor_array = fluid.layers.create_array( + dtype='float32' + ) # Api create_array is not supported in paddle 2.0 yet. + zero = paddle.zeros(shape=[1], dtype="int64") - self.outputs = { - 'Out': np.concatenate( - (self.x0, self.x1, self.x2), axis=self.actual_axis) - } + for i in range(self.iter_num): + # Api array_write is not supported in paddle 2.0 yet. 
+ fluid.layers.array_write(input, zero + i, tensor_array) + + self.out_var = paddle.concat(tensor_array, axis=self.axis) def set_npu(self): self.__class__.use_npu = True - self.__class__.no_need_check_grad = True - - def init_dtype(self): - self.dtype = np.float16 - def test_check_output(self): - self.check_output_with_place(self.place) - - def init_test_data(self): - self.x0 = np.random.random((1, 4, 50)).astype(self.dtype) - self.x1 = np.random.random((2, 4, 50)).astype(self.dtype) - self.x2 = np.random.random((3, 4, 50)).astype(self.dtype) - self.axis = 0 + def test_fluid_api(self): + self._run_static_mode(use_fluid_api=True) + + def test_paddle_api(self): + self._run_static_mode(use_fluid_api=False) + + def _run_static_mode(self, use_fluid_api): + self.set_program(use_fluid_api) + self.assertTrue(self.out_var.shape[self.axis] == -1) + exe = fluid.Executor(self.place) + res = exe.run(self.program, fetch_list=self.out_var) + self.assertTrue( + np.array_equal( + res[0], + np.concatenate( + [self.x] * self.iter_num, axis=self.axis))) if __name__ == '__main__': diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 14a925ef3e285..f5f0b5ed0873c 100755 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -923,8 +923,6 @@ def mm(input, mat2, name=None): nontransposed, the prepended or appended dimension :math:`1` will be removed after matrix multiplication. - This op does not support broadcasting. See paddle.matmul. - Args: input (Tensor): The input tensor which is a Tensor. mat2 (Tensor): The input tensor which is a Tensor. @@ -949,9 +947,7 @@ def mm(input, mat2, name=None): """ if in_dygraph_mode(): - out = _varbase_creator(dtype=input.dtype) - _C_ops.matmul(input, mat2, out) - return out + return _C_ops.matmul_v2(input, mat2) def __check_input(x, y): var_names = {'x': x, 'y': y} @@ -991,7 +987,7 @@ def __check_input(x, y): helper = LayerHelper('mm', **locals()) out = helper.create_variable_for_type_inference(dtype=input.dtype) helper.append_op( - type='matmul', inputs={'X': input, + type='matmul_v2', inputs={'X': input, 'Y': mat2}, outputs={'Out': out}) return out
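Note on the last hunk (python/paddle/tensor/math.py): paddle.mm now dispatches to _C_ops.matmul_v2 in dygraph mode and appends a matmul_v2 op in static mode. A minimal sanity check of the dygraph path; the shapes and tolerance below are illustrative and not taken from this PR:

    import numpy as np
    import paddle

    # paddle.mm is now backed by matmul_v2 in dygraph mode; compare against numpy.
    x = paddle.to_tensor(np.random.rand(2, 3).astype('float32'))
    y = paddle.to_tensor(np.random.rand(3, 4).astype('float32'))
    out = paddle.mm(x, y)
    np.testing.assert_allclose(out.numpy(), np.matmul(x.numpy(), y.numpy()), rtol=1e-5)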