
Commit

… prim_paddle
JiabinYang committed Jan 8, 2023
2 parents a457bdb + 67fc8e9 commit ea5a9f7
Showing 40 changed files with 957 additions and 404 deletions.
4 changes: 2 additions & 2 deletions paddle/fluid/distributed/auto_parallel/dist_attr.h
@@ -288,8 +288,8 @@ class OperatorDistAttr {
std::string impl_type_ = kDefault;
int64_t impl_idx_ = 0;
bool is_recompute_ = false;
- std::string execution_stream_;
- int64_t scheduling_priority_;  // lower value, higher priority, default to 0
+ std::string execution_stream_ = kDefault;
+ int64_t scheduling_priority_ = 0;  // lower value, higher priority
std::map<std::string, bool> annotated_;
};

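The change above gives execution_stream_ and scheduling_priority_ explicit defaults instead of leaving them uninitialized. A minimal sketch of the "lower value, higher priority" contract, assuming a scheduler that simply sorts ops by priority (illustrative only, not Paddle's actual executor):

#include <algorithm>
#include <cstdint>
#include <string>
#include <vector>

struct OpAttrSketch {
  std::string execution_stream = "default";  // stand-in for kDefault
  int64_t scheduling_priority = 0;           // default priority
};

int main() {
  std::vector<OpAttrSketch> ops = {{"default", 5}, {"default", -1}, {"default", 0}};
  // Lower value runs first, so the dispatch order here is -1, 0, 5.
  std::stable_sort(ops.begin(), ops.end(),
                   [](const OpAttrSketch& a, const OpAttrSketch& b) {
                     return a.scheduling_priority < b.scheduling_priority;
                   });
  return 0;
}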
4 changes: 4 additions & 0 deletions paddle/fluid/framework/fleet/ps_gpu_wrapper.cc
@@ -1398,9 +1398,11 @@ void PSGPUWrapper::build_task() {

void PSGPUWrapper::BeginPass() {
platform::Timer timer;
+ #if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS)
if (FLAGS_gpugraph_storage_mode == GpuGraphStorageMode::WHOLE_HBM) {
return;
}
+ #endif
timer.Start();
if (current_task_) {
PADDLE_THROW(
@@ -1426,9 +1428,11 @@ void PSGPUWrapper::BeginPass() {
}

void PSGPUWrapper::EndPass() {
+ #if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS)
if (FLAGS_gpugraph_storage_mode == GpuGraphStorageMode::WHOLE_HBM) {
return;
}
+ #endif
platform::Timer stagetime;
stagetime.Start();
HbmToSparseTable();
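Both BeginPass and EndPass now share the same guard: a compile-time feature gate wrapped around a runtime storage-mode check, so builds without the GPU-graph feature pay no cost, and builds with it skip per-pass staging when the whole graph already lives in HBM. A hedged sketch of the pattern with stand-in names (the real pieces are FLAGS_gpugraph_storage_mode and the PADDLE_WITH_* macros):

enum class GpuGraphStorageModeSketch { WHOLE_HBM, MEM_EMBEDDING };

// Stand-in for FLAGS_gpugraph_storage_mode.
static GpuGraphStorageModeSketch g_storage_mode =
    GpuGraphStorageModeSketch::WHOLE_HBM;

void BeginPassSketch() {
#if defined(WITH_GPU_GRAPH_SKETCH)  // stand-in for the PADDLE_WITH_* gate
  if (g_storage_mode == GpuGraphStorageModeSketch::WHOLE_HBM) {
    return;  // embeddings already resident in HBM: no per-pass staging
  }
#endif
  // ... normal per-pass staging work ...
}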
2 changes: 0 additions & 2 deletions paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -254,8 +254,6 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
"conv_elementwise_add_fuse_pass", //
#endif //
"transpose_flatten_concat_fuse_pass", //
"constant_folding_pass", //
"auto_mixed_precision_pass", //
"conv2d_fusion_layout_transfer_pass", //
"auto_mixed_precision_pass", //
"inplace_op_var_pass", // should be the last pass.
6 changes: 6 additions & 0 deletions paddle/fluid/operators/quantize_linear_op.cc
@@ -200,6 +200,12 @@ class QuantizeLinearOpMaker : public framework::OpProtoAndCheckerMaker {
"(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true.")
.SetDefault(true);
AddAttr<bool>(
"only_observer",
"(bool, default false) Whether to only observer or not. If "
"only_observer=false, it will calculate fake quant or dequant output. "
"If only_observer=true, it will only calibrate scale information.")
.SetDefault(false);
AddComment(R"DOC(
The scale of QuantizeLinear operator is a vector.
In detail, each channel of the input X has a scale value.
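A hedged sketch of the only_observer semantics the new attribute describes: the scale is always calibrated, but fake quantization is applied only when only_observer is false. This is simplified to a single per-tensor scale with abs-max calibration; QuantizeLinearSketch and its helpers are illustrative, not Paddle's functors:

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

std::vector<float> QuantizeLinearSketch(const std::vector<float>& x,
                                        float* scale, int bit_length,
                                        bool only_observer) {
  // Calibrate: track the running abs-max as the scale.
  for (float v : x) *scale = std::max(*scale, std::fabs(v));
  if (only_observer) return x;  // observe only: output is a copy of the input

  const float bin_cnt = std::pow(2.0f, bit_length - 1) - 1.0f;  // 127 for int8
  std::vector<float> out(x.size());
  for (std::size_t i = 0; i < x.size(); ++i) {
    const float clipped = std::max(std::min(x[i], *scale), -*scale);
    out[i] = std::round(clipped / *scale * bin_cnt);  // fake quant
  }
  return out;
}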
39 changes: 31 additions & 8 deletions paddle/fluid/operators/quantize_linear_op.h
@@ -61,6 +61,7 @@ class QuantizeLinearKernel : public framework::OpKernel<T> {
int bin_cnt = std::pow(2, bit_length - 1) - 1;
int quant_axis = context.Attr<int>("quant_axis");
bool is_test = context.Attr<bool>("is_test");
+ bool only_observer = context.Attr<bool>("only_observer");
auto& dev_ctx = context.template device_context<DeviceContext>();

if (quant_axis < 0) {
@@ -91,23 +92,39 @@ class QuantizeLinearKernel : public framework::OpKernel<T> {
out_state,
out_accum,
out_scale);
-   ClipAndFakeQuantFunctor<DeviceContext, T>()(
-       dev_ctx, *in, *out_scale, bin_cnt, round_type, out);
+   if (only_observer) {
+     framework::TensorCopy(*in, context.GetPlace(), dev_ctx, out);
+   } else {
+     ClipAndFakeQuantFunctor<DeviceContext, T>()(
+         dev_ctx, *in, *out_scale, bin_cnt, round_type, out);
+   }
} else {
-   ClipAndFakeQuantFunctor<DeviceContext, T>()(
-       dev_ctx, *in, *in_scale, bin_cnt, round_type, out);
+   if (only_observer) {
+     framework::TensorCopy(*in, context.GetPlace(), dev_ctx, out);
+   } else {
+     ClipAndFakeQuantFunctor<DeviceContext, T>()(
+         dev_ctx, *in, *in_scale, bin_cnt, round_type, out);
+   }
}
} else {
if (!is_test) {
auto* out_scale = context.Output<phi::DenseTensor>("OutScale");
T* out_scale_data = out_scale->mutable_data<T>(context.GetPlace());
FindChannelAbsMaxFunctor<DeviceContext, T>()(
dev_ctx, *in, quant_axis, out_scale_data);
-   ChannelClipAndFakeQuantFunctor<DeviceContext, T>()(
-       dev_ctx, *in, *out_scale, bin_cnt, round_type, quant_axis, out);
+   if (only_observer) {
+     framework::TensorCopy(*in, context.GetPlace(), dev_ctx, out);
+   } else {
+     ChannelClipAndFakeQuantFunctor<DeviceContext, T>()(
+         dev_ctx, *in, *out_scale, bin_cnt, round_type, quant_axis, out);
+   }
} else {
-   ChannelClipAndFakeQuantFunctor<DeviceContext, T>()(
-       dev_ctx, *in, *in_scale, bin_cnt, round_type, quant_axis, out);
+   if (only_observer) {
+     framework::TensorCopy(*in, context.GetPlace(), dev_ctx, out);
+   } else {
+     ChannelClipAndFakeQuantFunctor<DeviceContext, T>()(
+         dev_ctx, *in, *in_scale, bin_cnt, round_type, quant_axis, out);
+   }
}
}
}
@@ -132,6 +149,12 @@ class DeQuantizeLinearKernel : public framework::OpKernel<T> {
int bit_length = context.Attr<int>("bit_length");
auto quant_axis = context.Attr<int>("quant_axis");
dev_ctx.template Alloc<D>(out, out->numel() * sizeof(D));
+ bool only_observer = context.Attr<bool>("only_observer");
+
+ if (only_observer) {
+   framework::TensorCopy(*in, context.GetPlace(), dev_ctx, out);
+   return;
+ }

if (quant_axis < 0) {
float max_range = (std::pow(2, bit_length - 1) - 1);
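The dequantize path mirrors this: only_observer short-circuits to a copy of the input, otherwise each value is rescaled by scale / max_range with max_range = 2^(bit_length - 1) - 1, as in the per-tensor branch above. A sketch under those assumptions (illustrative names, not the real functor):

#include <cstddef>
#include <vector>

std::vector<float> DequantizeLinearSketch(const std::vector<float>& x,
                                          float scale, int bit_length,
                                          bool only_observer) {
  if (only_observer) return x;  // calibration only: identity output
  const float max_range =
      static_cast<float>((1 << (bit_length - 1)) - 1);  // 127 for int8
  std::vector<float> out(x.size());
  for (std::size_t i = 0; i < x.size(); ++i) {
    out[i] = x[i] * scale / max_range;
  }
  return out;
}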
2 changes: 2 additions & 0 deletions paddle/fluid/pybind/inference_api.cc
@@ -676,6 +676,8 @@ void BindAnalysisConfig(py::module *m) {
py::arg("device_id") = 0,
py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32)
.def("exp_enable_use_cutlass", &AnalysisConfig::Exp_EnableUseCutlass)
.def("exp_disable_mixed_precision_ops",
&AnalysisConfig::Exp_DisableMixedPrecisionOps)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
.def("set_exec_stream",
[](AnalysisConfig &self, phi::CUDAStream &stream) {
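From C++, the newly bound method lives on AnalysisConfig. A hedged usage sketch; the black-list-of-op-types signature is assumed from the binding above, so check paddle_analysis_config.h for the exact form:

#include <string>
#include <unordered_set>
#include "paddle/fluid/inference/api/paddle_analysis_config.h"

void ConfigureSketch() {
  paddle::AnalysisConfig config;
  config.EnableUseGpu(100 /*memory_pool_init_size_mb*/, 0 /*device_id*/);
  // Assumed: takes a set of op type names to keep out of mixed precision.
  config.Exp_DisableMixedPrecisionOps({"softmax", "layer_norm"});
}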
2 changes: 1 addition & 1 deletion paddle/phi/backends/stream.h
@@ -44,7 +44,7 @@ class Stream {
using Callback = std::function<void()>;

Stream() = default;
- // For compatiable
+ // For compatible
Stream(const Place& place, stream_t stream);
~Stream();
const stream_t& raw_stream() const;
4 changes: 4 additions & 0 deletions paddle/phi/backends/xpu/xpu2_op_list.cc
@@ -66,6 +66,10 @@ XPUOpMap& get_kl2_ops() {
phi::DataType::INT64})},
{"bilinear_interp_v2", XPUKernelSet({phi::DataType::FLOAT32})},
{"bilinear_interp_v2_grad", XPUKernelSet({phi::DataType::FLOAT32})},
{"bitwise_and", XPUKernelSet({phi::DataType::BOOL})},
{"bitwise_not", XPUKernelSet({phi::DataType::BOOL})},
{"bitwise_or", XPUKernelSet({phi::DataType::BOOL})},
{"bitwise_xor", XPUKernelSet({phi::DataType::BOOL})},
{"broadcast", XPUKernelSet({phi::DataType::FLOAT32})},
{"c_allgather",
XPUKernelSet({phi::DataType::FLOAT16,
59 changes: 33 additions & 26 deletions paddle/phi/infermeta/unary.cc
@@ -1785,20 +1785,22 @@ void KthvalueInferMeta(const MetaTensor& x,
MetaConfig config) {
auto input_dims = x.dims();
const int& dim_size = input_dims.size();
- PADDLE_ENFORCE_LT(axis,
+ PADDLE_ENFORCE_LE(axis,
dim_size,
phi::errors::InvalidArgument(
"the axis must be [-%d, %d), but received %d .",
dim_size,
dim_size,
axis));
-   PADDLE_ENFORCE_GE(axis,
-       -dim_size,
-       phi::errors::InvalidArgument(
-           "the axis must be [-%d, %d), but received %d .",
-           dim_size,
-           dim_size,
-           axis));
+   if (dim_size > 0) {
+     PADDLE_ENFORCE_GE(axis,
+         -dim_size,
+         phi::errors::InvalidArgument(
+             "the axis must be [-%d, %d), but received %d .",
+             dim_size,
+             dim_size,
+             axis));
+   }
if (axis < 0) axis += dim_size;
PADDLE_ENFORCE_GE(
k,
@@ -1807,9 +1809,9 @@ void KthvalueInferMeta(const MetaTensor& x,
"the k in the kthvalue must >= 1, but received %d .", k));
PADDLE_ENFORCE_GE(
input_dims.size(),
-     1,
-     phi::errors::InvalidArgument("input of kthvalue must have >= 1d shape"));
- if (config.is_runtime) {
+     0,
+     phi::errors::InvalidArgument("input of kthvalue must have >= 0d shape"));
+ if (dim_size > 0 && config.is_runtime) {
PADDLE_ENFORCE_GE(
input_dims[axis],
k,
Expand All @@ -1822,7 +1824,7 @@ void KthvalueInferMeta(const MetaTensor& x,
for (int64_t i = 0; i < axis; i++) {
dimvec.emplace_back(input_dims[i]);
}
- if (keepdim) {
+ if (keepdim && dim_size > 0) {
dimvec.emplace_back(static_cast<int64_t>(1));
}
for (int64_t i = axis + 1; i < dim_size; i++) {
@@ -2071,33 +2073,38 @@ void ModeInferMeta(const MetaTensor& x,
MetaTensor* indices) {
auto input_dims = x.dims();
const int& dim_size = input_dims.size();
-   PADDLE_ENFORCE_EQ(
-       (axis < dim_size) && (axis >= (-1 * dim_size)),
-       true,
-       errors::InvalidArgument(
-           "the axis of ModeOp must be [-%d, %d), but you set axis is %d",
-           dim_size,
-           dim_size,
-           axis));
+   PADDLE_ENFORCE_LE(axis,
+       dim_size,
+       phi::errors::InvalidArgument(
+           "the axis must be [-%d, %d), but received %d .",
+           dim_size,
+           dim_size,
+           axis));
+   if (dim_size > 0) {
+     PADDLE_ENFORCE_GE(axis,
+         -dim_size,
+         phi::errors::InvalidArgument(
+             "the axis must be [-%d, %d), but received %d .",
+             dim_size,
+             dim_size,
+             axis));
+   }
PADDLE_ENFORCE_GE(
input_dims.size(),
-     1,
-     errors::InvalidArgument("input of ModeOp must have >= 1d shape"));
+     0,
+     errors::InvalidArgument("input of ModeOp must have >= 0d shape"));
if (axis < 0) axis += dim_size;
std::vector<int64_t> dimvec;
for (int64_t i = 0; i < axis; i++) {
dimvec.emplace_back(input_dims[i]);
}
- if (keepdim) {
+ if (keepdim && dim_size > 0) {
dimvec.emplace_back(static_cast<int64_t>(1));
}
for (int64_t i = axis + 1; i < dim_size; i++) {
dimvec.emplace_back(input_dims[i]);
}
DDim dims = phi::make_ddim(dimvec);
- PADDLE_ENFORCE_GE(input_dims.size(),
-     1,
-     errors::InvalidArgument("input shape should >= 1d"));
out->set_dims(dims);
out->share_lod(x);
out->set_dtype(x.dtype());
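The net effect of these InferMeta changes is that kthvalue and mode now accept 0-D inputs: the reduced axis is dropped (or kept as size 1 under keepdim, only when the input has rank > 0), so a 0-D input yields a 0-D output instead of failing the old ">= 1d" check. A standalone sketch of that shape rule:

#include <cstdint>
#include <vector>

std::vector<int64_t> ReducedDimsSketch(const std::vector<int64_t>& in_dims,
                                       int axis, bool keepdim) {
  const int dim_size = static_cast<int>(in_dims.size());
  if (axis < 0) axis += dim_size;  // normalize a negative axis
  std::vector<int64_t> out;
  for (int i = 0; i < axis; ++i) out.push_back(in_dims[i]);
  if (keepdim && dim_size > 0) out.push_back(1);
  for (int i = axis + 1; i < dim_size; ++i) out.push_back(in_dims[i]);
  return out;
}
// ReducedDimsSketch({2, 3, 4}, 1, false) -> {2, 4}
// ReducedDimsSketch({2, 3, 4}, 1, true)  -> {2, 1, 4}
// ReducedDimsSketch({}, 0, false)        -> {} (0-D in, 0-D out)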
10 changes: 9 additions & 1 deletion paddle/phi/kernels/cpu/kthvalue_grad_kernel.cc
@@ -55,6 +55,14 @@ void KthvalueGradKernel(const Context& dev_ctx,
DenseTensor* d_x) {
auto in_dims = x.dims();
auto out_dims = indices.dims();
+ T* x_grad_data = dev_ctx.template Alloc<T>(d_x);
+
+ // For 0D Tensor
+ if (in_dims.size() == 0) {
+   phi::funcs::set_constant(dev_ctx, d_x, 1.0);
+   return;
+ }

axis = (axis < 0) ? (in_dims.size() + axis) : axis;
if (!keepdim) {
std::vector<int> tmp_out_shape;
@@ -67,7 +75,7 @@ void KthvalueGradKernel(const Context& dev_ctx,
}
out_dims = phi::make_ddim(tmp_out_shape);
}
- T* x_grad_data = dev_ctx.template Alloc<T>(d_x);

if (axis == in_dims.size() - 1) {
const int64_t input_height =
phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
14 changes: 14 additions & 0 deletions paddle/phi/kernels/cpu/kthvalue_kernel.cc
@@ -82,8 +82,22 @@ void KthvalueKernel(const Context& dev_ctx,
DenseTensor* indices) {
const auto& in_dims = x.dims();
if (axis < 0) axis += in_dims.size();

T* output_data = dev_ctx.template Alloc<T>(output);
int64_t* indices_data = dev_ctx.template Alloc<int64_t>(indices);
+ // For 0D Tensor
+ if (in_dims.size() == 0) {
+   PADDLE_ENFORCE_EQ(k,
+       1,
+       phi::errors::InvalidArgument(
+           "the k in the kthvalue must be less than or equal to the "
+           "number of elements of the input X, but received %d .",
+           k));
+
+   phi::Copy<Context>(dev_ctx, x, dev_ctx.GetPlace(), false, output);
+   phi::funcs::set_constant(dev_ctx, indices, 0);
+   return;
+ }
auto out_dims = output->dims();
if (axis == in_dims.size() - 1) {
const int64_t& input_height =
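For the 0-D fast paths added in these CPU kernels: a scalar has exactly one element, so only k == 1 is valid, the forward result is the value itself with index 0, and the backward pass fills the gradient buffer with the identity derivative of 1. A worked sketch of that contract (plain scalars standing in for 0-D tensors):

#include <cassert>
#include <cstdint>

float KthvalueScalarSketch(float x, int k, int64_t* index) {
  assert(k == 1);  // a 0-D tensor has exactly one element
  *index = 0;
  return x;        // forward is the identity
}

float KthvalueScalarGradSketch() {
  return 1.0f;     // identity op: the gradient buffer is filled with 1
}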
10 changes: 9 additions & 1 deletion paddle/phi/kernels/cpu/mode_grad_kernel.cc
@@ -17,6 +17,7 @@
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/mode.h"

namespace phi {
@@ -32,9 +33,17 @@ void ModeGradKernel(const Context& dev_ctx,
auto in_dims = x.dims();
auto out_dims = indices.dims();

+ T* x_grad_data = dev_ctx.template Alloc<T>(x_grad);

// axis < 0, get the real axis
axis = (axis < 0) ? (in_dims.size() + axis) : axis;

+ // For 0D Tensor
+ if (in_dims.size() == 0) {
+   phi::funcs::set_constant(dev_ctx, x_grad, 1.0);
+   return;
+ }

if (!keepdim) {
std::vector<int> tmp_out_shape;
for (int i = 0; i < axis; i++) {
@@ -46,7 +55,6 @@ void ModeGradKernel(const Context& dev_ctx,
}
out_dims = phi::make_ddim(tmp_out_shape);
}
- T* x_grad_data = dev_ctx.template Alloc<T>(x_grad);

if (axis == in_dims.size() - 1) {
// allocate the memory for the input_grad
8 changes: 8 additions & 0 deletions paddle/phi/kernels/cpu/mode_kernel.cc
@@ -16,6 +16,7 @@

#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/mode.h"

namespace phi {
@@ -34,6 +35,13 @@ void ModeKernel(const Context& dev_ctx,

T* output_data = dev_ctx.template Alloc<T>(out);
int64_t* indices_data = dev_ctx.template Alloc<int64_t>(indices);

+ if (in_dims.size() == 0) {
+   phi::Copy<Context>(dev_ctx, x, dev_ctx.GetPlace(), false, out);
+   phi::funcs::set_constant(dev_ctx, indices, 0);
+   return;
+ }

// if axis is not the last dim, transpose it to the last dim, do the
// calculation, then transpose it back to original axis.
if (axis == in_dims.size() - 1) {
10 changes: 9 additions & 1 deletion paddle/phi/kernels/gpu/kthvalue_grad_kernel.cu
@@ -16,6 +16,7 @@

#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/top_k_function_cuda.h"

namespace phi {
@@ -43,8 +44,15 @@ void KthvalueGradKernel(const Context& dev_ctx,
DenseTensor* d_x) {
const auto& in_dims = x.dims();
auto out_dims = indices.dims();
- if (axis < 0) axis += in_dims.size();
T* x_grad_data = dev_ctx.template Alloc<T>(d_x);
+ // For 0D Tensor
+ if (in_dims.size() == 0) {
+   phi::funcs::set_constant(dev_ctx, d_x, 1.0);
+   return;
+ }
+
+ if (axis < 0) axis += in_dims.size();

const T* out_grad_data = d_out.data<T>();
const int64_t* indices_data = indices.data<int64_t>();
int pre, n, post;
… (remaining changed files not shown)
