From d709ca2b94754879aa6e3092c8144af18fbd1a00 Mon Sep 17 00:00:00 2001
From: ZzSean <18818272991@163.com>
Date: Sat, 9 Oct 2021 05:53:40 +0000
Subject: [PATCH] fix

---
 .../operators/fused/cudnn_bn_add_relu_test.cc | 471 ++++--------------
 1 file changed, 110 insertions(+), 361 deletions(-)

diff --git a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc
index 7380b04840772..7229754cb8ed8 100644
--- a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc
+++ b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc
@@ -33,7 +33,6 @@ namespace op = paddle::operators;
 using Tensor = paddle::framework::Tensor;
 
 USE_OP(batch_norm);
-USE_CUDA_ONLY_OP(fused_bn_add_activation);
 
 template <typename T>
 void InitRandomTensor(const std::vector<int64_t> &dims,
@@ -41,7 +40,7 @@ void InitRandomTensor(const std::vector<int64_t> &dims,
   T *cpu_out_ptr = cpu_out->mutable_data<T>(framework::make_ddim(dims),
                                             platform::CPUPlace());
   std::default_random_engine random(0);
-  std::uniform_real_distribution<float> dis(-1.0, 1.0);
+  std::uniform_real_distribution<float> dis(0.0, 1.0);
   for (int i = 0; i < cpu_out->numel(); ++i) {
     cpu_out_ptr[i] = static_cast<T>(dis(random));
   }
 }
@@ -90,7 +89,7 @@ void CheckOutput(std::string name, const framework::Tensor &cpu_res,
     }
   }
   std::string error_type = is_relative_atol ? "relative" : "absolute";
-  LOG(INFO) << "[" << name << "] The dims is [" << cpu_res.dims()
+  LOG(INFO) << "[" << name << "], The dims is [" << cpu_res.dims()
             << "], maximum " << error_type << " error is " << max_diff << ": "
             << cpu_res_ptr[index] << " vs " << cpu_base_ptr[index];
 }
@@ -122,33 +121,13 @@ void ComputeSumAndSquareSum(const framework::Tensor &cpu_x,
     }
   }
 }
 
-template <typename T>
-void ComputeInplaceAdd(const framework::Tensor &cpu_x,
-                       framework::Tensor *cpu_y) {
-  EXPECT_EQ(cpu_x.dims(), cpu_y->dims());
-
-  const T *cpu_x_ptr = cpu_x.data<T>();
-  T *cpu_y_ptr = cpu_y->data<T>();
-  for (int64_t i = 0; i < cpu_x.numel(); ++i) {
-    cpu_y_ptr[i] += cpu_x_ptr[i];
-  }
-}
-
-template <typename T>
-void ComputeInplaceRelu(framework::Tensor *cpu_x) {
-  T *cpu_x_ptr = cpu_x->data<T>();
-  for (int64_t i = 0; i < cpu_x->numel(); ++i) {
-    cpu_x_ptr[i] =
-        cpu_x_ptr[i] > static_cast<T>(0) ? cpu_x_ptr[i] : static_cast<T>(0);
-  }
-}
-
+// get paddle batchnorm op results as baseline
 void ComputeBatchNormForward(const platform::CUDADeviceContext &ctx,
                              const Tensor &cpu_x, const Tensor &cpu_scale,
                              const Tensor &cpu_bias, Tensor *cpu_mean,
                              Tensor *cpu_var, Tensor *cpu_saved_mean,
                              Tensor *cpu_saved_var, Tensor *cpu_y,
-                             Tensor *saved_reserve_space) {
+                             Tensor *cpu_reserve_space) {
   framework::Scope scope;
   auto *x = scope.Var("X")->GetMutable<framework::LoDTensor>();
   auto *scale = scope.Var("Scale")->GetMutable<framework::LoDTensor>();
@@ -199,163 +178,68 @@ void ComputeBatchNormForward(const platform::CUDADeviceContext &ctx,
   TensorCopySync(*var, platform::CPUPlace(), cpu_var);
   TensorCopySync(*saved_mean, platform::CPUPlace(), cpu_saved_mean);
   TensorCopySync(*saved_var, platform::CPUPlace(), cpu_saved_var);
-  // reserved_space will stay on GPU and used in grad op.
-  saved_reserve_space->ShareDataWith(*reserve_space);
-}
-
-void ComputeFusedBNAddReluForward(const platform::CUDADeviceContext &ctx,
-                                  const Tensor &cpu_x, const Tensor &cpu_z,
-                                  const Tensor &cpu_scale,
-                                  const Tensor &cpu_bias, Tensor *cpu_mean,
-                                  Tensor *cpu_var, Tensor *cpu_saved_mean,
-                                  Tensor *cpu_saved_var, Tensor *cpu_y,
-                                  Tensor *saved_reserve_space) {
-  framework::Scope scope;
-  auto *x = scope.Var("X")->GetMutable<framework::LoDTensor>();
-  auto *z = scope.Var("Z")->GetMutable<framework::LoDTensor>();
-  auto *scale = scope.Var("Scale")->GetMutable<framework::LoDTensor>();
-  auto *bias = scope.Var("Bias")->GetMutable<framework::LoDTensor>();
-  auto *mean = scope.Var("Mean")->GetMutable<framework::LoDTensor>();
-  auto *var = scope.Var("Variance")->GetMutable<framework::LoDTensor>();
-  auto *y = scope.Var("Y")->GetMutable<framework::LoDTensor>();
-  auto *saved_mean = scope.Var("SavedMean")->GetMutable<framework::LoDTensor>();
-  auto *saved_var =
-      scope.Var("SavedVariance")->GetMutable<framework::LoDTensor>();
-  auto *reserve_space =
-      scope.Var("ReserveSpace")->GetMutable<framework::LoDTensor>();
-
-  auto place = ctx.GetPlace();
-  TensorCopySync(cpu_x, place, x);
-  TensorCopySync(cpu_z, place, z);
-  TensorCopySync(cpu_scale, place, scale);
-  TensorCopySync(cpu_bias, place, bias);
-  TensorCopySync(*cpu_mean, place, mean);
-  TensorCopySync(*cpu_var, place, var);
-
-  int64_t channels = x->dims()[3];
-  scale->Resize({channels});
-  bias->Resize({channels});
-  mean->Resize({channels});
-  var->Resize({channels});
-
-  framework::AttributeMap attrs;
-
-  auto op = framework::OpRegistry::CreateOp(
-      "fused_bn_add_activation",
-      {{"X", {"X"}}, {"Z", {"Z"}}, {"Scale", {"Scale"}}, {"Bias", {"Bias"}}},
-      {{"Y", {"Y"}},
-       {"MeanOut", {"Mean"}},
-       {"VarianceOut", {"Variance"}},
-       {"SavedMean", {"SavedMean"}},
-       {"SavedVariance", {"SavedVariance"}},
-       {"ReserveSpace", {"ReserveSpace"}}},
-      attrs);
-  op->Run(scope, ctx.GetPlace());
-
-  TensorCopySync(*y, platform::CPUPlace(), cpu_y);
-  TensorCopySync(*mean, platform::CPUPlace(), cpu_mean);
-  TensorCopySync(*var, platform::CPUPlace(), cpu_var);
-  TensorCopySync(*saved_mean, platform::CPUPlace(), cpu_saved_mean);
-  TensorCopySync(*saved_var, platform::CPUPlace(), cpu_saved_var);
-  // reserved_space will stay on GPU and used in grad op.
-  saved_reserve_space->ShareDataWith(*reserve_space);
+  TensorCopySync(*reserve_space, platform::CPUPlace(), cpu_reserve_space);
 }
 
 template <typename T>
 class CudnnBNAddReluTester {
  public:
-  CudnnBNAddReluTester(int batch_size, int height, int width, int channels,
-                       std::string act_type, bool fuse_add, bool has_shortcut) {
+  CudnnBNAddReluTester(int batch_size, int height, int width, int channels) {
     batch_size_ = batch_size;
     height_ = height;
     width_ = width;
     channels_ = channels;
     ele_count_ = batch_size_ * height_ * width_;
-    act_type_ = act_type;
-    fuse_add_ = fuse_add;
-    has_shortcut_ = has_shortcut;
     SetUp();
   }
 
   ~CudnnBNAddReluTester() {}
 
   void CheckForward(float diff, bool is_relative_atol = false) {
-    LOG(INFO) << "[CheckForward, diff=" << diff
-              << ", is_relative_atol=" << is_relative_atol
-              << "] act_type=" << act_type_ << ", fuse_add=" << fuse_add_
-              << ", has_shortcut=" << has_shortcut_;
     platform::CUDADeviceContext *ctx =
         static_cast<platform::CUDADeviceContext *>(
             platform::DeviceContextPool::Instance().Get(
                 platform::CUDAPlace(0)));
 
-    auto select = [&](Tensor *in) { return has_shortcut_ ? in : nullptr; };
-
-    framework::Tensor cpu_mean_base_x;
-    framework::Tensor cpu_var_base_x;
-    framework::Tensor cpu_mean_base_z;
-    framework::Tensor cpu_var_base_z;
-    if (!has_shortcut_ && fuse_add_ && (act_type_ == "relu")) {
-      BaselineForwardFusedBNAddRelu(
-          *ctx, &cpu_mean_base_x, &cpu_var_base_x, &cpu_saved_mean_base_x_,
-          &cpu_saved_var_base_x_, &cpu_y_base_, &saved_reserve_space_x_);
-    } else {
-      BaselineForward(
-          *ctx, &cpu_mean_base_x, &cpu_var_base_x, &cpu_saved_mean_base_x_,
-          &cpu_saved_var_base_x_, &cpu_y_base_, &saved_reserve_space_x_,
-          select(&cpu_mean_base_z), select(&cpu_var_base_z),
-          select(&cpu_saved_mean_base_z_), select(&cpu_saved_var_base_z_),
-          select(&saved_reserve_space_z_));
-    }
-
-    framework::Tensor cpu_mean_x;
-    framework::Tensor cpu_var_x;
+    framework::Tensor cpu_mean_base;
+    framework::Tensor cpu_var_base;
+    framework::Tensor cpu_saved_mean_base;
+    framework::Tensor cpu_saved_var_base;
+    framework::Tensor cpu_y_base;
+    framework::Tensor cpu_reserve_space_base;
+    BaselineForward(*ctx, &cpu_mean_base, &cpu_var_base, &cpu_saved_mean_base,
+                    &cpu_saved_var_base, &cpu_y_base, &cpu_reserve_space_base);
+
+    framework::Tensor cpu_mean;
+    framework::Tensor cpu_var;
+    framework::Tensor cpu_saved_mean;
+    framework::Tensor cpu_saved_var;
     framework::Tensor cpu_y;
-    framework::Tensor cpu_mean_z;
-    framework::Tensor cpu_var_z;
-    FusedForward(*ctx, &cpu_mean_x, &cpu_var_x, &cpu_saved_mean_x_,
-                 &cpu_saved_var_x_, &cpu_y, &cpu_bitmask_, select(&cpu_mean_z),
-                 select(&cpu_var_z), select(&cpu_saved_mean_z_),
-                 select(&cpu_saved_var_z_));
-
-    CheckOutput<float>("Mean", cpu_mean_x, cpu_mean_base_x, diff,
+    framework::Tensor cpu_bitmask;
+    FusedForward(*ctx, &cpu_mean, &cpu_var, &cpu_saved_mean, &cpu_saved_var,
                 &cpu_y, &cpu_bitmask);
+
+    CheckOutput<float>("Mean", cpu_mean, cpu_mean_base, diff, is_relative_atol);
+    CheckOutput<float>("Variance", cpu_var, cpu_var_base, diff,
                        is_relative_atol);
-    CheckOutput<float>("Variance", cpu_var_x, cpu_var_base_x, diff,
+    CheckOutput<float>("SavedMean", cpu_saved_mean, cpu_saved_mean_base, diff,
                        is_relative_atol);
-    CheckOutput<float>("SavedMean", cpu_saved_mean_x_, cpu_saved_mean_base_x_,
-                       diff, is_relative_atol);
-    CheckOutput<float>("SavedVariance", cpu_saved_var_x_, cpu_saved_var_base_x_,
-                       diff, is_relative_atol);
-    if (has_shortcut_) {
-      CheckOutput<float>("MeanZ", cpu_mean_z, cpu_mean_base_z, diff,
-                         is_relative_atol);
-      CheckOutput<float>("VarianceZ", cpu_var_z, cpu_var_base_z, diff,
-                         is_relative_atol);
-      CheckOutput<float>("SavedMeanZ", cpu_saved_mean_z_,
-                         cpu_saved_mean_base_z_, diff, is_relative_atol);
-      CheckOutput<float>("SavedVarianceZ", cpu_saved_var_z_,
-                         cpu_saved_var_base_z_, diff, is_relative_atol);
-    }
-    CheckOutput<T>("Y", cpu_y, cpu_y_base_, diff, is_relative_atol);
+    CheckOutput<float>("SavedVariance", cpu_saved_var, cpu_saved_var_base, diff,
+                       is_relative_atol);
+    CheckOutput<T>("Y", cpu_y, cpu_y_base, diff, is_relative_atol);
   }
 
  private:
   void SetUp() {
+    // Initialize input data
     InitRandomTensor<T>({batch_size_, height_, width_, channels_}, &cpu_x_);
-    InitRandomTensor<float>({channels_}, &cpu_bn_scale_x_);
-    InitRandomTensor<float>({channels_}, &cpu_bn_bias_x_);
-
-    if (has_shortcut_) {
-      InitRandomTensor<T>({batch_size_, height_, width_, channels_}, &cpu_z_);
-      InitRandomTensor<float>({channels_}, &cpu_bn_scale_z_);
-      InitRandomTensor<float>({channels_}, &cpu_bn_bias_z_);
-    } else {
-      if (fuse_add_) {
-        InitRandomTensor<T>({batch_size_, height_, width_, channels_}, &cpu_z_);
-      }
-    }
+    ComputeSumAndSquareSum<T>(cpu_x_, &cpu_sum_, &cpu_sum_of_square_);
 
-    InitRandomTensor<T>({batch_size_, height_, width_, channels_}, &cpu_dy_);
+    // scale and bias should be initialized randomly.
+    InitConstantTensor<float>({channels_}, static_cast<float>(1.0f),
+                              &cpu_bn_scale_);
+    InitConstantTensor<float>({channels_}, static_cast<float>(0.0f),
+                              &cpu_bn_bias_);
   }
 
   void InitMeanVar(Tensor *cpu_mean, Tensor *cpu_var, Tensor *cpu_saved_mean,
@@ -368,169 +252,71 @@ class CudnnBNAddReluTester {
                       cpu_saved_var);
   }
 
-  void BaselineForward(const platform::CUDADeviceContext &ctx,
-                       Tensor *cpu_mean_x, Tensor *cpu_var_x,
-                       Tensor *cpu_saved_mean_x, Tensor *cpu_saved_var_x,
-                       Tensor *cpu_y, Tensor *saved_reserve_space_x,
-                       Tensor *cpu_mean_z = nullptr,
-                       Tensor *cpu_var_z = nullptr,
-                       Tensor *cpu_saved_mean_z = nullptr,
-                       Tensor *cpu_saved_var_z = nullptr,
-                       Tensor *saved_reserve_space_z = nullptr) {
-    InitMeanVar(cpu_mean_x, cpu_var_x, cpu_saved_mean_x, cpu_saved_var_x);
-    ComputeBatchNormForward(ctx, cpu_x_, cpu_bn_scale_x_, cpu_bn_bias_x_,
-                            cpu_mean_x, cpu_var_x, cpu_saved_mean_x,
-                            cpu_saved_var_x, cpu_y, saved_reserve_space_x);
-    if (has_shortcut_) {
-      framework::Tensor cpu_z_out;
-      InitMeanVar(cpu_mean_z, cpu_var_z, cpu_saved_mean_z, cpu_saved_var_z);
-      ComputeBatchNormForward(
-          ctx, cpu_z_, cpu_bn_scale_z_, cpu_bn_bias_z_, cpu_mean_z, cpu_var_z,
-          cpu_saved_mean_z, cpu_saved_var_z, &cpu_z_out, saved_reserve_space_z);
-      ComputeInplaceAdd<T>(cpu_z_out, cpu_y);
-    } else {
-      if (fuse_add_) {
-        ComputeInplaceAdd<T>(cpu_z_, cpu_y);
-      }
-    }
-    if (act_type_ == "relu") {
-      ComputeInplaceRelu<T>(cpu_y);
-    }
-  }
-
-  void BaselineForwardFusedBNAddRelu(const platform::CUDADeviceContext &ctx,
-                                     Tensor *cpu_mean, Tensor *cpu_var,
-                                     Tensor *cpu_saved_mean,
-                                     Tensor *cpu_saved_var, Tensor *cpu_y,
-                                     Tensor *saved_reserve_space) {
+  void BaselineForward(const platform::CUDADeviceContext &ctx, Tensor *cpu_mean,
+                       Tensor *cpu_var, Tensor *cpu_saved_mean,
+                       Tensor *cpu_saved_var, Tensor *cpu_y,
+                       Tensor *cpu_reserve_space) {
     InitMeanVar(cpu_mean, cpu_var, cpu_saved_mean, cpu_saved_var);
-    ComputeFusedBNAddReluForward(
-        ctx, cpu_x_, cpu_z_, cpu_bn_scale_x_, cpu_bn_bias_x_, cpu_mean, cpu_var,
-        cpu_saved_mean, cpu_saved_var, cpu_y, saved_reserve_space);
-  }
-
-  void ComputeFusedBNStatsFinalize(const platform::CUDADeviceContext &ctx,
-                                   const Tensor &cpu_x,
-                                   const Tensor &cpu_bn_scale,
-                                   const Tensor &cpu_bn_bias, Tensor *sum,
-                                   Tensor *sum_of_square, Tensor *bn_scale,
-                                   Tensor *bn_bias, Tensor *mean, Tensor *var,
-                                   Tensor *saved_mean, Tensor *saved_var,
-                                   Tensor *equiv_scale, Tensor *equiv_bias) {
-    framework::Tensor cpu_sum;
-    framework::Tensor cpu_sum_of_square;
-    ComputeSumAndSquareSum<T>(cpu_x, &cpu_sum, &cpu_sum_of_square);
-
-    auto place = ctx.GetPlace();
-    TensorCopySync(cpu_sum, place, sum);
-    TensorCopySync(cpu_sum_of_square, place, sum_of_square);
-    TensorCopySync(cpu_bn_scale, place, bn_scale);
-    TensorCopySync(cpu_bn_bias, place, bn_bias);
-
-    bn_scale->Resize({1, 1, 1, channels_});
-    bn_bias->Resize({1, 1, 1, channels_});
-
-    // input
-    float *sum_ptr = sum->data<float>();
-    float *sum_of_square_ptr = sum_of_square->data<float>();
-    float *bn_scale_ptr = bn_scale->data<float>();
-    float *bn_bias_ptr = bn_bias->data<float>();
-
-    mean->Resize({1, 1, 1, channels_});
-    var->Resize({1, 1, 1, channels_});
-
-    // output
-    float *mean_ptr = mean->data<float>();
-    float *var_ptr = var->data<float>();
-    float *saved_mean_ptr =
-        saved_mean->mutable_data<float>({1, 1, 1, channels_}, place);
-    float *saved_var_ptr =
-        saved_var->mutable_data<float>({1, 1, 1, channels_}, place);
-    T *equiv_scale_ptr =
-        equiv_scale->mutable_data<T>({1, 1, 1, channels_}, place);
-    T *equiv_bias_ptr =
-        equiv_bias->mutable_data<T>({1, 1, 1, channels_}, place);
-
-    auto param_shape = framework::vectorize<int>(bn_scale->dims());
-    op::CudnnBNStatsFinalize<T> bn_op(ctx, param_shape);
-    bn_op.Forward(ctx, sum_ptr, sum_of_square_ptr, bn_scale_ptr, bn_bias_ptr,
-                  saved_mean_ptr, saved_var_ptr, mean_ptr, var_ptr,
-                  equiv_scale_ptr, equiv_bias_ptr, eps_, momentum_, ele_count_,
-                  true);
+    ComputeBatchNormForward(ctx, cpu_x_, cpu_bn_scale_, cpu_bn_bias_, cpu_mean,
                            cpu_var, cpu_saved_mean, cpu_saved_var, cpu_y,
                            cpu_reserve_space);
   }
 
   // Get forward results of CudnnBNStatsFinalize + CudnnScaleBiasAddRelu
-  void FusedForward(const platform::CUDADeviceContext &ctx, Tensor *cpu_mean_x,
-                    Tensor *cpu_var_x, Tensor *cpu_saved_mean_x,
-                    Tensor *cpu_saved_var_x, Tensor *cpu_y, Tensor *cpu_bitmask,
-                    Tensor *cpu_mean_z = nullptr, Tensor *cpu_var_z = nullptr,
-                    Tensor *cpu_saved_mean_z = nullptr,
-                    Tensor *cpu_saved_var_z = nullptr) {
+  void FusedForward(const platform::CUDADeviceContext &ctx, Tensor *cpu_mean,
+                    Tensor *cpu_var, Tensor *cpu_saved_mean,
+                    Tensor *cpu_saved_var, Tensor *cpu_y, Tensor *cpu_bitmask) {
     framework::Tensor x;
-    framework::Tensor sum_x;
-    framework::Tensor sum_of_square_x;
-    framework::Tensor bn_scale_x;
-    framework::Tensor bn_bias_x;
-
-    framework::Tensor z;
-    framework::Tensor sum_z;
-    framework::Tensor sum_of_square_z;
-    framework::Tensor bn_scale_z;
-    framework::Tensor bn_bias_z;
+    framework::Tensor sum;
+    framework::Tensor sum_of_square;
+    framework::Tensor bn_scale;
+    framework::Tensor bn_bias;
 
     auto place = ctx.GetPlace();
     TensorCopySync(cpu_x_, place, &x);
-    if (fuse_add_ || has_shortcut_) {
-      TensorCopySync(cpu_z_, place, &z);
-    }
-
-    framework::Tensor mean_x;
-    framework::Tensor var_x;
-    framework::Tensor saved_mean_x;
-    framework::Tensor saved_var_x;
-    framework::Tensor equiv_scale_x;
-    framework::Tensor equiv_bias_x;
+    TensorCopySync(cpu_sum_, place, &sum);
+    TensorCopySync(cpu_sum_of_square_, place, &sum_of_square);
+    TensorCopySync(cpu_bn_scale_, place, &bn_scale);
+    TensorCopySync(cpu_bn_bias_, place, &bn_bias);
 
-    framework::Tensor mean_z;
-    framework::Tensor var_z;
-    framework::Tensor saved_mean_z;
-    framework::Tensor saved_var_z;
-    framework::Tensor equiv_scale_z;
-    framework::Tensor equiv_bias_z;
+    bn_scale.Resize({1, 1, 1, channels_});
+    bn_bias.Resize({1, 1, 1, channels_});
 
+    T *x_ptr = x.data<T>();
+    float *sum_ptr = sum.data<float>();
+    float *sum_of_square_ptr = sum_of_square.data<float>();
+    float *bn_scale_ptr = bn_scale.data<float>();
+    float *bn_bias_ptr = bn_bias.data<float>();
+
+    framework::Tensor mean;
+    framework::Tensor var;
+    framework::Tensor saved_mean;
+    framework::Tensor saved_var;
+    framework::Tensor equiv_scale;
+    framework::Tensor equiv_bias;
     framework::Tensor y;
     framework::Tensor bitmask;
 
-    InitMeanVar(cpu_mean_x, cpu_var_x, cpu_saved_mean_x, cpu_saved_var_x);
-    TensorCopySync(*cpu_mean_x, place, &mean_x);
-    TensorCopySync(*cpu_var_x, place, &var_x);
-    if (has_shortcut_) {
-      InitMeanVar(cpu_mean_z, cpu_var_z, cpu_saved_mean_z, cpu_saved_var_z);
-      TensorCopySync(*cpu_mean_z, place, &mean_z);
-      TensorCopySync(*cpu_var_z, place, &var_z);
-    }
+    InitMeanVar(cpu_mean, cpu_var, cpu_saved_mean, cpu_saved_var);
+    TensorCopySync(*cpu_mean, place, &mean);
+    TensorCopySync(*cpu_var, place, &var);
 
-    // 1. BN Stats Finalize
-    ComputeFusedBNStatsFinalize(ctx, cpu_x_, cpu_bn_scale_x_, cpu_bn_bias_x_,
-                                &sum_x, &sum_of_square_x, &bn_scale_x,
-                                &bn_bias_x, &mean_x, &var_x, &saved_mean_x,
-                                &saved_var_x, &equiv_scale_x, &equiv_bias_x);
-    if (has_shortcut_) {
-      ComputeFusedBNStatsFinalize(ctx, cpu_z_, cpu_bn_scale_z_, cpu_bn_bias_z_,
-                                  &sum_z, &sum_of_square_z, &bn_scale_z,
-                                  &bn_bias_z, &mean_z, &var_z, &saved_mean_z,
-                                  &saved_var_z, &equiv_scale_z, &equiv_bias_z);
-    }
+    mean.Resize({1, 1, 1, channels_});
+    var.Resize({1, 1, 1, channels_});
 
-    T *x_ptr = x.data<T>();
-    T *z_ptr = (fuse_add_ || has_shortcut_) ? z.data<T>() : nullptr;
-    T *equiv_scale_x_ptr = equiv_scale_x.data<T>();
-    T *equiv_bias_x_ptr = equiv_bias_x.data<T>();
-    T *equiv_scale_z_ptr = has_shortcut_ ? equiv_scale_z.data<T>() : nullptr;
-    T *equiv_bias_z_ptr = has_shortcut_ ? equiv_bias_z.data<T>() : nullptr;
+    float *mean_ptr = mean.data<float>();
+    float *var_ptr = var.data<float>();
+    float *saved_mean_ptr =
+        saved_mean.mutable_data<float>({1, 1, 1, channels_}, place);
+    float *saved_var_ptr =
+        saved_var.mutable_data<float>({1, 1, 1, channels_}, place);
+    T *equiv_scale_ptr =
+        equiv_scale.mutable_data<T>({1, 1, 1, channels_}, place);
+    T *equiv_bias_ptr = equiv_bias.mutable_data<T>({1, 1, 1, channels_}, place);
     T *y_ptr =
         y.mutable_data<T>({batch_size_, height_, width_, channels_}, place);
 
+    // bitmask
     int c = channels_;
     int64_t nhw = ele_count_;
     int32_t c_int32_elems = ((c + 63) & ~63) / 32;
@@ -539,26 +325,27 @@ class CudnnBNAddReluTester {
         {nhw_int32_elems, c_int32_elems, 1}, place);
 
     auto data_shape = framework::vectorize<int>(x.dims());
-    auto param_shape = framework::vectorize<int>(bn_scale_x.dims());
+    auto param_shape = framework::vectorize<int>(bn_scale.dims());
     auto bitmask_shape = framework::vectorize<int>(bitmask.dims());
 
-    // 2. Scale Bias + Relu
-    op::CudnnScaleBiasAddRelu<T> sbar_op(ctx, act_type_, fuse_add_,
-                                         has_shortcut_, data_shape, param_shape,
-                                         bitmask_shape);
-    sbar_op.Forward(ctx, x_ptr, equiv_scale_x_ptr, equiv_bias_x_ptr, y_ptr,
-                    bitmask_ptr, z_ptr, equiv_scale_z_ptr, equiv_bias_z_ptr);
-
-    TensorCopySync(mean_x, platform::CPUPlace(), cpu_mean_x);
-    TensorCopySync(var_x, platform::CPUPlace(), cpu_var_x);
-    TensorCopySync(saved_mean_x, platform::CPUPlace(), cpu_saved_mean_x);
-    TensorCopySync(saved_var_x, platform::CPUPlace(), cpu_saved_var_x);
-    if (has_shortcut_) {
-      TensorCopySync(mean_z, platform::CPUPlace(), cpu_mean_z);
-      TensorCopySync(var_z, platform::CPUPlace(), cpu_var_z);
-      TensorCopySync(saved_mean_z, platform::CPUPlace(), cpu_saved_mean_z);
-      TensorCopySync(saved_var_z, platform::CPUPlace(), cpu_saved_var_z);
-    }
+    // 1. BN Stats Finalize
+    op::CudnnBNStatsFinalize<T> bn_op(ctx, param_shape);
+    bn_op.Forward(ctx, sum_ptr, sum_of_square_ptr, bn_scale_ptr, bn_bias_ptr,
+                  saved_mean_ptr, saved_var_ptr, mean_ptr, var_ptr,
+                  equiv_scale_ptr, equiv_bias_ptr, eps_, momentum_, ele_count_,
+                  true);
+
+    // 2. Scale Bias + Relu (not fused add)
+    std::string act_type = "";
+    op::CudnnScaleBiasAddRelu<T> sbar_op(
+        ctx, act_type, false, false, data_shape, param_shape, bitmask_shape);
+    sbar_op.Forward(ctx, x_ptr, equiv_scale_ptr, equiv_bias_ptr, y_ptr,
+                    bitmask_ptr);
+
+    TensorCopySync(mean, platform::CPUPlace(), cpu_mean);
+    TensorCopySync(var, platform::CPUPlace(), cpu_var);
+    TensorCopySync(saved_mean, platform::CPUPlace(), cpu_saved_mean);
+    TensorCopySync(saved_var, platform::CPUPlace(), cpu_saved_var);
     TensorCopySync(y, platform::CPUPlace(), cpu_y);
     TensorCopySync(bitmask, platform::CPUPlace(), cpu_bitmask);
   }
@@ -570,62 +357,24 @@ class CudnnBNAddReluTester {
   int channels_;
   int ele_count_;
 
-  std::string act_type_;
-  bool fuse_add_;
-  bool has_shortcut_;
-
   // Forward input
   framework::Tensor cpu_x_;
-  framework::Tensor cpu_bn_scale_x_;
-  framework::Tensor cpu_bn_bias_x_;
-  framework::Tensor cpu_z_;
-  framework::Tensor cpu_bn_scale_z_;
-  framework::Tensor cpu_bn_bias_z_;
-
-  // Backward input
-  framework::Tensor cpu_dy_;
-  framework::Tensor cpu_bitmask_;
-  framework::Tensor cpu_saved_mean_x_;
-  framework::Tensor cpu_saved_var_x_;
-  framework::Tensor cpu_saved_mean_z_;
-  framework::Tensor cpu_saved_var_z_;
-  framework::Tensor cpu_saved_mean_base_x_;
-  framework::Tensor cpu_saved_var_base_x_;
-  framework::Tensor saved_reserve_space_x_;
-  framework::Tensor cpu_saved_mean_base_z_;
-  framework::Tensor cpu_saved_var_base_z_;
-  framework::Tensor saved_reserve_space_z_;
-  framework::Tensor cpu_y_base_;
+  framework::Tensor cpu_sum_;
+  framework::Tensor cpu_sum_of_square_;
+  framework::Tensor cpu_bn_scale_;
+  framework::Tensor cpu_bn_bias_;
 
   double eps_ = 1e-5;
   float momentum_ = 0.9;
 };
 
-TEST(CudnnBNAddReluFp16, BNAdd) {
-  int batch_size = 4;
-  int height = 8;
-  int width = 8;
-  int channels = 64;
-  std::string act_type = "";
-  bool has_shortcut = false;
-  FLAGS_cudnn_batchnorm_spatial_persistent = true;
-  for (auto fuse_add : {false, true}) {
-    CudnnBNAddReluTester<paddle::platform::float16> test(
-        batch_size, height, width, channels, act_type, fuse_add, has_shortcut);
-    test.CheckForward(2e-3);
-  }
-}
-
-TEST(CudnnBNAddReluFp16, HasShortcut) {
+TEST(CudnnBNAddReluForward, GPUCudnnBNAddReluForwardFp16) {
   int batch_size = 4;
   int height = 8;
   int width = 8;
   int channels = 64;
-  std::string act_type = "";
-  bool fuse_add = false;
-  bool has_shortcut = true;
   FLAGS_cudnn_batchnorm_spatial_persistent = true;
-  CudnnBNAddReluTester<paddle::platform::float16> test(
-      batch_size, height, width, channels, act_type, fuse_add, has_shortcut);
-  test.CheckForward(5e-3);
+  CudnnBNAddReluTester<paddle::platform::float16> test(batch_size, height,
                                                       width, channels);
+  test.CheckForward(2e-3);
 }
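
Note for reviewers: after this change the test only exercises the non-fused path (act_type = "", fuse_add = false, has_shortcut = false), so FusedForward reduces to CudnnBNStatsFinalize followed by CudnnScaleBiasAddRelu. As a reference, below is a minimal CPU-side sketch of the standard batch-norm statistics folding that the finalize step is expected to perform; the helper name ComputeEquivScaleBiasCPU and its signature are illustrative assumptions, not part of the Paddle API.

#include <cmath>
#include <vector>

// Reference-only sketch: fold per-channel sums into equiv_scale/equiv_bias,
// which the fused path then applies as y = equiv_scale * x + equiv_bias
// (optionally followed by ReLU when act_type is "relu").
void ComputeEquivScaleBiasCPU(const std::vector<float> &sum,
                              const std::vector<float> &sum_of_square,
                              const std::vector<float> &gamma,
                              const std::vector<float> &beta,
                              float ele_count, float eps,
                              std::vector<float> *equiv_scale,
                              std::vector<float> *equiv_bias) {
  size_t channels = sum.size();
  equiv_scale->resize(channels);
  equiv_bias->resize(channels);
  for (size_t c = 0; c < channels; ++c) {
    // Per-channel mean/variance from sums accumulated over N*H*W elements.
    float mean = sum[c] / ele_count;
    float var = sum_of_square[c] / ele_count - mean * mean;
    float inv_std = 1.0f / std::sqrt(var + eps);
    (*equiv_scale)[c] = gamma[c] * inv_std;
    (*equiv_bias)[c] = beta[c] - gamma[c] * mean * inv_std;
  }
}

With cpu_bn_scale_ fixed to 1 and cpu_bn_bias_ fixed to 0 as in SetUp, this folding reduces to plain normalization, (x - mean) / sqrt(var + eps), which is why the fused outputs can be compared directly against the batch_norm op baseline within the 2e-3 tolerance used by CheckForward.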