Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -246,8 +246,10 @@ void ApplyCinnPass(::pir::Program* program,
.file_name("original_programs.py")
.dump_symbolic_shape(FLAGS_logging_pir_py_code_dump_symbolic_dims)
.SaveIfFlagEnabled();
ApplyShapeOptimizationPass(program, CreatePassManager);
ApplyPdToCinnPass(program, CreatePassManager);
// TODO(Hongqing-work): move ApplyShapeOptimizationPass before
// ApplyPdToCinnPass after fixing infer shape bug.
ApplyShapeOptimizationPass(program, CreatePassManager);
ApplyCinnPreprocessPass(program, CreatePassManager);
ApplyBuildGroupOpPass(program, CreatePassManager);
PirToPyCodeConverter(program)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
#include "paddle/fluid/framework/feed_hook.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/pir/dialect/kernel/ir/kernel_attribute.h"
#include "paddle/fluid/pir/dialect/kernel/ir/kernel_op.h"
#include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h"
#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h"
#include "paddle/fluid/pir/dialect/operator/utils/utils.h"
Expand Down Expand Up @@ -75,23 +76,34 @@ void VisitFeedName(const pir::Program& program,
const DoEachFeadNameT& DoEachFeadName) {
auto module_op = program.module_op();
const auto& block = module_op.block();
const auto& IsDataOp = [](const pir::Operation& op) -> bool {
return op.isa<paddle::dialect::DataOp>();
};
const auto& GetDataOpName = [](const pir::Operation& op) -> std::string {
auto GetDataOpName =
[](const pir::Operation& op) -> std::optional<std::string> {
if (!op.isa<paddle::dialect::DataOp>()) return std::nullopt;
return op.attributes().at("name").dyn_cast<pir::StrAttribute>().AsString();
};
const auto& IsFeedOp = [](const pir::Operation& op) -> bool {
return op.isa<paddle::dialect::FeedOp>();
auto GetFeedOpName =
[](const pir::Operation& op) -> std::optional<std::string> {
if (!op.isa<paddle::dialect::FeedOp>()) return std::nullopt;
return op.attributes().at("name").dyn_cast<pir::StrAttribute>().AsString();
};
const auto& GetFeedOpName = [](const pir::Operation& op) -> std::string {
auto GetPhiFeedOpName =
[](const pir::Operation& op) -> std::optional<std::string> {
if (!op.isa<paddle::dialect::PhiKernelOp>()) return std::nullopt;
const auto& attributes = op.attributes();
const auto& op_name_it = attributes.find("op_name");
if (op_name_it == attributes.end()) return std::nullopt;
const auto& op_name =
op_name_it->second.dyn_cast<pir::StrAttribute>().AsString();
if (op_name != "pd_op.feed") return std::nullopt;
return op.attributes().at("name").dyn_cast<pir::StrAttribute>().AsString();
};
for (const auto& op : block) {
if (IsDataOp(op)) {
DoEachFeadName(GetDataOpName(op));
} else if (IsFeedOp(op)) {
DoEachFeadName(GetFeedOpName(op));
if (const auto& name = GetDataOpName(op)) {
DoEachFeadName(name.value());
} else if (const auto& name = GetFeedOpName(op)) {
DoEachFeadName(name.value());
} else if (const auto& name = GetPhiFeedOpName(op)) {
DoEachFeadName(name.value());
} else {
// Do nothing.
}
Expand Down Expand Up @@ -1431,34 +1443,48 @@ std::optional<pir::ShapeConstraintIRAnalysis*> GetNullShapeAnalysis(
return std::nullopt;
}

// Truncates the file at `file_path` at most once per process lifetime, so
// that subsequent append-mode logging starts from an empty file. Does
// nothing unless FLAGS_logging_trunc_pir_py_code is enabled.
// NOTE(review): "Loggin" looks like a typo for "Logging"; renaming would
// require updating the caller, so the name is left unchanged here.
void TryTruncateLogginFile(const std::string& file_path) {
if (!FLAGS_logging_trunc_pir_py_code) return;
// The mutex serializes access to the `once_flags` map; std::call_once then
// guarantees the truncation body runs at most once per distinct path.
static std::mutex mutex;
std::unique_lock<std::mutex> lock(mutex);
static std::unordered_map<std::string, std::once_flag> once_flags;
std::call_once(once_flags[file_path], [&] {
// Opening with std::ios::trunc discards any existing file content.
std::ofstream ofs;
ofs.open(file_path.c_str(), std::ios::out | std::ios::trunc);
ofs.close();
});
}

} // namespace

void PirToPyCodeConverter::SaveIfFlagEnabled() const {
if (program_ == nullptr) return;
if (file_name_.empty()) return;
if (FLAGS_logging_pir_py_code_dir == "") return;
if (FLAGS_logging_pir_py_code_dir.empty()) return;
const std::string file_path =
FLAGS_logging_pir_py_code_dir + "/" + file_name_;
ShapeAnalysisGetterT ShapeAnalysisGetter =
(dump_symbolic_shape_ ? GetShapeAnalysisFromManager
: GetNullShapeAnalysis);
PirToPyCodeConverterHelper converter_helper(program_, ShapeAnalysisGetter);
const std::string content = converter_helper.Convert();
static std::mutex mutex;
std::unique_lock<std::mutex> lock(mutex);
if (FLAGS_logging_trunc_pir_py_code) {
static std::unordered_map<std::string, std::once_flag> once_flags;
std::call_once(once_flags[file_path], [&] {
std::ofstream ofs;
ofs.open(file_path.c_str(), std::ios::out | std::ios::trunc);
ofs.close();
});
}
std::ofstream ofs;
ofs.open(file_path.c_str(), std::ios::out | std::ios::app);
if (!ofs.is_open()) return;
ofs << content << std::endl;
ofs.close();
TryTruncateLogginFile(file_path);
const auto MutOnceFlag = [&]() -> std::once_flag* {
static std::mutex mutex;
std::unique_lock<std::mutex> lock(mutex);
using FileName = std::string;
using FileName2OnceFlag = std::unordered_map<FileName, std::once_flag>;
using ProgramId = int64_t;
static std::unordered_map<ProgramId, FileName2OnceFlag> once_flags;
return &once_flags[program_->id()][file_name_];
};
std::call_once(*MutOnceFlag(), [&] {
ShapeAnalysisGetterT ShapeAnalysisGetter =
(dump_symbolic_shape_ ? GetShapeAnalysisFromManager
: GetNullShapeAnalysis);
PirToPyCodeConverterHelper converter_helper(program_, ShapeAnalysisGetter);
const std::string content = converter_helper.Convert();
std::ofstream ofs;
ofs.open(file_path.c_str(), std::ios::out | std::ios::app);
if (!ofs.is_open()) return;
ofs << content << std::endl;
ofs.close();
});
}

void DumpExecProgram(const pir::Program& program,
Expand Down
12 changes: 12 additions & 0 deletions paddle/common/flags.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1758,6 +1758,18 @@ PHI_DEFINE_EXPORTED_string(
"If default, "
"dlopen will search mkl from LD_LIBRARY_PATH");

/**
 * Whether to apply a global algorithm search in blaslt FLAG
 * Name: enable_blaslt_global_search
 * Since Version: 3.0.0
 * Value Range: bool, default=false
 * Example: FLAGS_enable_blaslt_global_search=true enables the global search.
 * Note: If true, a global search is applied when selecting blaslt algorithms.
 */
PHI_DEFINE_EXPORTED_bool(enable_blaslt_global_search,
false,
"Whether to use global search in blaslt.");

PHI_DEFINE_EXPORTED_string(op_dir, // NOLINT
"",
"Specify path for loading user-defined op library.");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,13 +53,24 @@ inline bool IsInterpretercoreFastGCEnabled() {
// When using cuda graph, fast GC must be used. Because
// `EventQuery` method in event GC cannot be used in
// cuda graph.
PADDLE_ENFORCE_EQ(memory::allocation::AllocatorFacade::Instance()
.IsStreamSafeCUDAAllocatorUsed() == true &&
memory::allocation::AllocatorFacade::Instance()
.IsCUDAMallocAsyncAllocatorUsed() == true,
false,
platform::errors::InvalidArgument(
"StreamSafeAllocator and AsyncAllocator shouldn't be "
"True together."));
PADDLE_ENFORCE_EQ(memory::allocation::AllocatorFacade::Instance()
.IsStreamSafeCUDAAllocatorUsed() == false &&
memory::allocation::AllocatorFacade::Instance()
.IsCUDAMallocAsyncAllocatorUsed() == false &&
FLAGS_new_executor_use_cuda_graph,
false,
platform::errors::InvalidArgument(
"When FLAGS_new_executor_use_cuda_graph is true, "
"IsStreamSafeCUDAAllocatorUsed must be true, but "
"Either IsStreamSafeCUDAAllocatorUsed or "
"IsCUDAMallocAsyncAllocatorUsed must be true, but "
"got false."));
return (memory::allocation::AllocatorFacade::Instance()
.IsStreamSafeCUDAAllocatorUsed() &&
Expand Down
5 changes: 5 additions & 0 deletions paddle/fluid/inference/api/analysis_predictor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@
#include "paddle/pir/include/pass/pass_registry.h"

COMMON_DECLARE_bool(pir_apply_inplace_pass);
COMMON_DECLARE_bool(enable_pir_api);

namespace paddle {
namespace {
Expand Down Expand Up @@ -390,6 +391,10 @@ AnalysisPredictor::AnalysisPredictor(const AnalysisConfig &config)
if (config_.shape_range_info_collected()) {
config_.SwitchIrOptim(false);
}
if (FLAGS_enable_pir_api) {
config_.EnableNewExecutor(true);
config_.EnableNewIR(true);
}
if (config_.new_executor_enabled()) {
config_.EnableMemoryOptim(false);
if (config_.new_ir_enabled()) {
Expand Down
10 changes: 10 additions & 0 deletions paddle/fluid/memory/allocation/allocator_facade.cc
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,11 @@ class AllocatorFacadePrivate {
// application, treating it separately can avoid lots of overhead of
// acquiring default stream and applying read-write lock.
if (FLAGS_use_cuda_malloc_async_allocator) {
PADDLE_ENFORCE_EQ(FLAGS_use_cuda_managed_memory,
false,
platform::errors::InvalidArgument(
"Async allocator cannot be used with CUDA "
"managed memory."));
WrapCUDAMallocAsyncAllocatorForDefault();
is_cuda_malloc_async_allocator_used_ = true;
} else {
Expand Down Expand Up @@ -871,6 +876,11 @@ class AllocatorFacadePrivate {
"the allocator strategy %d is unsupported for multi-stream",
static_cast<int>(strategy_)));
if (FLAGS_use_cuda_malloc_async_allocator) {
PADDLE_ENFORCE_EQ(
FLAGS_use_cuda_managed_memory,
false,
platform::errors::InvalidArgument(
"Async allocator cannot be used with CUDA managed memory."));
VLOG(8) << "[CUDAMallocAsyncAllocator] Init CUDA allocator for stream "
<< stream << " in place " << p;
InitCUDAMallocAsyncAllocator(p, stream);
Expand Down
17 changes: 17 additions & 0 deletions paddle/fluid/operators/save_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,23 @@ PD_REGISTER_KERNEL(save,
kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND);
}

// Registers the `save` kernel for the XPU backend over the listed dtypes.
// InputAt(0) is marked ALL_BACKEND so the tensor to be saved is accepted
// regardless of the backend it currently resides on.
#ifdef PADDLE_WITH_XPU
PD_REGISTER_KERNEL(save,
XPU,
ALL_LAYOUT,
ops::SaveKernel,
float,
double,
int,
uint8_t,
int8_t,
int64_t,
phi::dtype::float16,
phi::dtype::bfloat16) {
kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND);
}
#endif

PD_REGISTER_KERNEL(save_sr,
CPU,
ALL_LAYOUT,
Expand Down
16 changes: 4 additions & 12 deletions paddle/fluid/pir/dialect/operator/ir/manual_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -252,27 +252,19 @@ bool AddNOpInferSymbolicShape(pir::Operation *op,
"should be larger than 0. But received X's dimensions %d.",
inputs_shape.size()));
symbol::TensorShapeOrDataDimExprs candidate_shape = inputs_shape.front();
size_t candidate_idx = 0;
for (size_t i = 1; i < inputs_shape.size(); ++i) {
// 0D tensor
if (inputs_shape[i].shape().size() == 0) {
continue;
}
if (candidate_shape.shape().size() == 0) {
candidate_shape = inputs_shape[i];
candidate_idx = i;
continue;
}
PADDLE_ENFORCE_EQ(candidate_shape,
inputs_shape[i],
common::errors::InvalidArgument(
"The input tensor X of AddNOp must"
" have same shape. But received X[%d]'s shape = "
"[%s], X[%d]'s shape = [%s].",
candidate_idx,
candidate_shape,
i,
inputs_shape[i]));
for (size_t j = 0; j < candidate_shape.shape().size(); ++j) {
infer_context->AddEqualCstr(candidate_shape.shape()[j],
inputs_shape[i].shape()[j]);
}
}
infer_context->SetShapeOrDataForValue(
op->result(0), symbol::ShapeOrDataDimExprs{candidate_shape});
Expand Down
9 changes: 9 additions & 0 deletions paddle/phi/backends/xpu/xpu2_op_list.cc
Original file line number Diff line number Diff line change
Expand Up @@ -822,6 +822,15 @@ XPUOpMap& get_kl2_ops() {
{"roll_grad", XPUKernelSet({phi::DataType::FLOAT32})},
{"rsqrt", XPUKernelSet({phi::DataType::FLOAT32})},
{"rsqrt_grad", XPUKernelSet({phi::DataType::FLOAT32})},
{"save",
XPUKernelSet({phi::DataType::FLOAT32,
phi::DataType::FLOAT64,
phi::DataType::INT32,
phi::DataType::UINT8,
phi::DataType::INT8,
phi::DataType::INT64,
phi::DataType::FLOAT16,
phi::DataType::BFLOAT16})},
{"scale",
XPUKernelSet({phi::DataType::FLOAT32,
phi::DataType::FLOAT16,
Expand Down
13 changes: 12 additions & 1 deletion paddle/phi/backends/xpu/xpu3_op_list.cc
Original file line number Diff line number Diff line change
Expand Up @@ -668,7 +668,9 @@ XPUOpMap& get_kl3_ops() {
phi::DataType::BFLOAT16,
phi::DataType::FLOAT16})},
{"mean_grad",
XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
XPUKernelSet({phi::DataType::FLOAT32,
phi::DataType::FLOAT16,
phi::DataType::BFLOAT16})},
{"mean",
XPUKernelSet({phi::DataType::FLOAT32,
phi::DataType::FLOAT16,
Expand Down Expand Up @@ -828,6 +830,15 @@ XPUOpMap& get_kl3_ops() {
{"roll_grad", XPUKernelSet({phi::DataType::FLOAT32})},
{"rsqrt", XPUKernelSet({phi::DataType::FLOAT32})},
{"rsqrt_grad", XPUKernelSet({phi::DataType::FLOAT32})},
{"save",
XPUKernelSet({phi::DataType::FLOAT32,
phi::DataType::FLOAT64,
phi::DataType::INT32,
phi::DataType::UINT8,
phi::DataType::INT8,
phi::DataType::INT64,
phi::DataType::FLOAT16,
phi::DataType::BFLOAT16})},
{"scale",
XPUKernelSet({phi::DataType::FLOAT32,
phi::DataType::FLOAT16,
Expand Down
7 changes: 6 additions & 1 deletion paddle/phi/kernels/activation_kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,6 @@ DECLARE_ACTIVATION_KERNEL(Log)
DECLARE_ACTIVATION_KERNEL(Log2)
DECLARE_ACTIVATION_KERNEL(Log10)
DECLARE_ACTIVATION_KERNEL(Log1p)
DECLARE_ACTIVATION_KERNEL(Round)
DECLARE_ACTIVATION_KERNEL(Floor)
DECLARE_ACTIVATION_KERNEL(Ceil)
DECLARE_ACTIVATION_KERNEL(Negative)
Expand Down Expand Up @@ -98,6 +97,12 @@ void Relu6Kernel(const Context& dev_ctx,
const DenseTensor& x,
DenseTensor* out);

// Rounds each element of `x` to `decimals` decimal places, writing the
// result to `out`. Declared here with an explicit `decimals` parameter
// instead of via DECLARE_ACTIVATION_KERNEL, which only supports
// attribute-free activations.
template <typename T, typename Context>
void RoundKernel(const Context& dev_ctx,
const DenseTensor& x,
const int decimals,
DenseTensor* out);

template <typename T, typename Context>
void SwishKernel(const Context& dev_ctx,
const DenseTensor& x,
Expand Down
14 changes: 13 additions & 1 deletion paddle/phi/kernels/cpu/activation_kernel.cc
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,6 @@ DEFINE_CPU_ACTIVATION_KERNEL(Rsqrt, RsqrtFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Softsign, SoftsignFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Sigmoid, SigmoidFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(LogSigmoid, LogSigmoidFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Round, RoundFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Floor, FloorFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Ceil, CeilFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Negative, NegativeFunctor)
Expand Down Expand Up @@ -161,6 +160,19 @@ void Relu6Kernel(const Context& dev_ctx,
ActivationImpl<T, T, Context, funcs::Relu6Functor<T>>(
dev_ctx, x, out, functor);
}

template <typename T, typename Context>
void RoundKernel(const Context& dev_ctx,
const DenseTensor& x,
const int decimals,
DenseTensor* out) {
funcs::RoundFunctor<T> functor;
auto attrs = functor.GetAttrs();
*(attrs[0].second) = decimals;
ActivationImpl<T, T, Context, funcs::RoundFunctor<T>>(
dev_ctx, x, out, functor);
}

} // namespace phi
PD_REGISTER_KERNEL(relu, CPU, ALL_LAYOUT, phi::ReluKernel, float, double) {}

Expand Down
Loading