From 021197ed2179d6f33c394155dfb0fb044edc1847 Mon Sep 17 00:00:00 2001 From: tc20042008 <156998525+tc20042008@users.noreply.github.com> Date: Thu, 4 Jul 2024 10:07:44 +0800 Subject: [PATCH 01/16] [CINN] Dump pir program only once (#65681) * dump FeedOp tensor meta * dump pir program only once --------- Co-authored-by: jiahy0825 --- .../transforms/pir_to_py_code_converter.cc | 90 ++++++++++++------- 1 file changed, 58 insertions(+), 32 deletions(-) diff --git a/paddle/cinn/hlir/dialect/operator/transforms/pir_to_py_code_converter.cc b/paddle/cinn/hlir/dialect/operator/transforms/pir_to_py_code_converter.cc index 8ad51581c1a740..473f1c9de1b485 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/pir_to_py_code_converter.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/pir_to_py_code_converter.cc @@ -32,6 +32,7 @@ #include "paddle/fluid/framework/feed_hook.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/pir/dialect/kernel/ir/kernel_attribute.h" +#include "paddle/fluid/pir/dialect/kernel/ir/kernel_op.h" #include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" #include "paddle/fluid/pir/dialect/operator/utils/utils.h" @@ -75,23 +76,34 @@ void VisitFeedName(const pir::Program& program, const DoEachFeadNameT& DoEachFeadName) { auto module_op = program.module_op(); const auto& block = module_op.block(); - const auto& IsDataOp = [](const pir::Operation& op) -> bool { - return op.isa(); - }; - const auto& GetDataOpName = [](const pir::Operation& op) -> std::string { + auto GetDataOpName = + [](const pir::Operation& op) -> std::optional { + if (!op.isa()) return std::nullopt; return op.attributes().at("name").dyn_cast().AsString(); }; - const auto& IsFeedOp = [](const pir::Operation& op) -> bool { - return op.isa(); + auto GetFeedOpName = + [](const pir::Operation& op) -> std::optional { + if (!op.isa()) return std::nullopt; + return op.attributes().at("name").dyn_cast().AsString(); }; - const auto& GetFeedOpName = [](const pir::Operation& op) -> std::string { + auto GetPhiFeedOpName = + [](const pir::Operation& op) -> std::optional { + if (!op.isa()) return std::nullopt; + const auto& attributes = op.attributes(); + const auto& op_name_it = attributes.find("op_name"); + if (op_name_it == attributes.end()) return std::nullopt; + const auto& op_name = + op_name_it->second.dyn_cast().AsString(); + if (op_name != "pd_op.feed") return std::nullopt; return op.attributes().at("name").dyn_cast().AsString(); }; for (const auto& op : block) { - if (IsDataOp(op)) { - DoEachFeadName(GetDataOpName(op)); - } else if (IsFeedOp(op)) { - DoEachFeadName(GetFeedOpName(op)); + if (const auto& name = GetDataOpName(op)) { + DoEachFeadName(name.value()); + } else if (const auto& name = GetFeedOpName(op)) { + DoEachFeadName(name.value()); + } else if (const auto& name = GetPhiFeedOpName(op)) { + DoEachFeadName(name.value()); } else { // Do nothing. 
} @@ -1431,34 +1443,48 @@ std::optional GetNullShapeAnalysis( return std::nullopt; } +void TryTruncateLogginFile(const std::string& file_path) { + if (!FLAGS_logging_trunc_pir_py_code) return; + static std::mutex mutex; + std::unique_lock lock(mutex); + static std::unordered_map once_flags; + std::call_once(once_flags[file_path], [&] { + std::ofstream ofs; + ofs.open(file_path.c_str(), std::ios::out | std::ios::trunc); + ofs.close(); + }); +} + } // namespace void PirToPyCodeConverter::SaveIfFlagEnabled() const { if (program_ == nullptr) return; if (file_name_.empty()) return; - if (FLAGS_logging_pir_py_code_dir == "") return; + if (FLAGS_logging_pir_py_code_dir.empty()) return; const std::string file_path = FLAGS_logging_pir_py_code_dir + "/" + file_name_; - ShapeAnalysisGetterT ShapeAnalysisGetter = - (dump_symbolic_shape_ ? GetShapeAnalysisFromManager - : GetNullShapeAnalysis); - PirToPyCodeConverterHelper converter_helper(program_, ShapeAnalysisGetter); - const std::string content = converter_helper.Convert(); - static std::mutex mutex; - std::unique_lock lock(mutex); - if (FLAGS_logging_trunc_pir_py_code) { - static std::unordered_map once_flags; - std::call_once(once_flags[file_path], [&] { - std::ofstream ofs; - ofs.open(file_path.c_str(), std::ios::out | std::ios::trunc); - ofs.close(); - }); - } - std::ofstream ofs; - ofs.open(file_path.c_str(), std::ios::out | std::ios::app); - if (!ofs.is_open()) return; - ofs << content << std::endl; - ofs.close(); + TryTruncateLogginFile(file_path); + const auto MutOnceFlag = [&]() -> std::once_flag* { + static std::mutex mutex; + std::unique_lock lock(mutex); + using FileName = std::string; + using FileName2OnceFlag = std::unordered_map; + using ProgramId = int64_t; + static std::unordered_map once_flags; + return &once_flags[program_->id()][file_name_]; + }; + std::call_once(*MutOnceFlag(), [&] { + ShapeAnalysisGetterT ShapeAnalysisGetter = + (dump_symbolic_shape_ ? 
GetShapeAnalysisFromManager + : GetNullShapeAnalysis); + PirToPyCodeConverterHelper converter_helper(program_, ShapeAnalysisGetter); + const std::string content = converter_helper.Convert(); + std::ofstream ofs; + ofs.open(file_path.c_str(), std::ios::out | std::ios::app); + if (!ofs.is_open()) return; + ofs << content << std::endl; + ofs.close(); + }); } void DumpExecProgram(const pir::Program& program, From 5b2e91444539c39eaa921433ae976a00dce2d408 Mon Sep 17 00:00:00 2001 From: Frank Lin Date: Thu, 4 Jul 2024 10:41:36 +0800 Subject: [PATCH 02/16] Store allocation ptr in vector (#65024) Co-authored-by: lawrence910426 --- paddle/phi/kernels/funcs/segmented_array.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/paddle/phi/kernels/funcs/segmented_array.h b/paddle/phi/kernels/funcs/segmented_array.h index 4b4b1b59db66eb..24046da52aeeeb 100644 --- a/paddle/phi/kernels/funcs/segmented_array.h +++ b/paddle/phi/kernels/funcs/segmented_array.h @@ -112,7 +112,7 @@ struct ArraySetterBase { void* src, size_t num_bytes, bool use_cuda_graph = false) { - allocation = phi::memory_utils::Alloc( + auto allocation = phi::memory_utils::Alloc( ctx.GetPlace(), num_bytes, phi::Stream(reinterpret_cast(ctx.stream()))); @@ -129,10 +129,13 @@ struct ArraySetterBase { num_bytes, phi::gpuMemcpyHostToDevice, ctx.stream()); - return allocation->ptr(); + + auto ptr = allocation->ptr(); + allocations.emplace_back(std::move(allocation)); + return ptr; } - phi::Allocator::AllocationPtr allocation{nullptr}; + std::vector allocations; }; template From f461d862eee70f824243ef45ed998166f455ba71 Mon Sep 17 00:00:00 2001 From: zhangyikun02 <48021248+zhangyk0314@users.noreply.github.com> Date: Thu, 4 Jul 2024 11:30:02 +0800 Subject: [PATCH 03/16] [XPU] mean_grad support bf16 for XPU (#65684) --- paddle/phi/backends/xpu/xpu3_op_list.cc | 4 +++- paddle/phi/kernels/xpu/reduce_mean_grad_kernel.cc | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/paddle/phi/backends/xpu/xpu3_op_list.cc b/paddle/phi/backends/xpu/xpu3_op_list.cc index 54f56f2bd93613..e27587c8596f02 100644 --- a/paddle/phi/backends/xpu/xpu3_op_list.cc +++ b/paddle/phi/backends/xpu/xpu3_op_list.cc @@ -668,7 +668,9 @@ XPUOpMap& get_kl3_ops() { phi::DataType::BFLOAT16, phi::DataType::FLOAT16})}, {"mean_grad", - XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::BFLOAT16})}, {"mean", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16, diff --git a/paddle/phi/kernels/xpu/reduce_mean_grad_kernel.cc b/paddle/phi/kernels/xpu/reduce_mean_grad_kernel.cc index 37ace904b2b807..de5b4718e98603 100644 --- a/paddle/phi/kernels/xpu/reduce_mean_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/reduce_mean_grad_kernel.cc @@ -89,4 +89,5 @@ PD_REGISTER_KERNEL(mean_grad, ALL_LAYOUT, phi::ReduceMeanGradKernel, float, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} From a30c8a5ab063e3e0521267fb7610823681adf7b0 Mon Sep 17 00:00:00 2001 From: Frank Lin Date: Thu, 4 Jul 2024 11:45:06 +0800 Subject: [PATCH 04/16] Disabling Unrelated Tests When Enabling CUDA Async Allocator in CI (#65094) * Either stream safe or async allocator * Ignore if not enabled * fix: ignore cuda managed * fix: disable async allocator * fix: either async or stream safe * fix useless if --------- Co-authored-by: lawrence910426 --- .../garbage_collector/garbage_collector.h | 13 +++++++++- .../memory/allocation/allocator_facade.cc | 10 +++++++ 
test/cpp/fluid/memory/CMakeLists.txt | 5 ++-- .../memory/stream_safe_cuda_alloc_test.cu | 26 +++++++++++++++++++ .../test_auto_growth_allocator_gpu.py | 2 ++ 5 files changed, 53 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.h b/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.h index 9c9b40631eaa93..6208130a67ca75 100644 --- a/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.h +++ b/paddle/fluid/framework/new_executor/garbage_collector/garbage_collector.h @@ -53,13 +53,24 @@ inline bool IsInterpretercoreFastGCEnabled() { // When using cuda graph, fast GC must be used. Because // `EventQuery` method in event GC cannot be used in // cuda graph. + PADDLE_ENFORCE_EQ(memory::allocation::AllocatorFacade::Instance() + .IsStreamSafeCUDAAllocatorUsed() == true && + memory::allocation::AllocatorFacade::Instance() + .IsCUDAMallocAsyncAllocatorUsed() == true, + false, + platform::errors::InvalidArgument( + "StreamSafeAllocator and AsyncAllocator shouldn't be " + "True together.")); PADDLE_ENFORCE_EQ(memory::allocation::AllocatorFacade::Instance() .IsStreamSafeCUDAAllocatorUsed() == false && + memory::allocation::AllocatorFacade::Instance() + .IsCUDAMallocAsyncAllocatorUsed() == false && FLAGS_new_executor_use_cuda_graph, false, platform::errors::InvalidArgument( "When FLAGS_new_executor_use_cuda_graph is true, " - "IsStreamSafeCUDAAllocatorUsed must be true, but " + "Either IsStreamSafeCUDAAllocatorUsed or " + "IsCUDAMallocAsyncAllocatorUsed must be true, but " "got false.")); return (memory::allocation::AllocatorFacade::Instance() .IsStreamSafeCUDAAllocatorUsed() && diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index eef6c1a1e8c4ac..b81bfd0400d99f 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -264,6 +264,11 @@ class AllocatorFacadePrivate { // application, treating it separately can avoid lots of overhead of // acquiring default stream and applying read-write lock. 
if (FLAGS_use_cuda_malloc_async_allocator) { + PADDLE_ENFORCE_EQ(FLAGS_use_cuda_managed_memory, + false, + platform::errors::InvalidArgument( + "Async allocator cannot be used with CUDA " + "managed memory.")); WrapCUDAMallocAsyncAllocatorForDefault(); is_cuda_malloc_async_allocator_used_ = true; } else { @@ -871,6 +876,11 @@ class AllocatorFacadePrivate { "the allocator strategy %d is unsupported for multi-stream", static_cast(strategy_))); if (FLAGS_use_cuda_malloc_async_allocator) { + PADDLE_ENFORCE_EQ( + FLAGS_use_cuda_managed_memory, + false, + platform::errors::InvalidArgument( + "Async allocator cannot be used with CUDA managed memory.")); VLOG(8) << "[CUDAMallocAsyncAllocator] Init CUDA allocator for stream " << stream << " in place " << p; InitCUDAMallocAsyncAllocator(p, stream); diff --git a/test/cpp/fluid/memory/CMakeLists.txt b/test/cpp/fluid/memory/CMakeLists.txt index 5bb36f73982287..a7c2e6df4411c6 100644 --- a/test/cpp/fluid/memory/CMakeLists.txt +++ b/test/cpp/fluid/memory/CMakeLists.txt @@ -116,8 +116,9 @@ if(WITH_TESTING AND TEST cuda_managed_memory_test) cuda_managed_memory_test PROPERTIES ENVIRONMENT - "FLAGS_use_cuda_managed_memory=true;FLAGS_allocator_strategy=auto_growth" - TIMEOUT 50) + "FLAGS_use_cuda_managed_memory=true;FLAGS_use_cuda_malloc_async_allocator=false;FLAGS_allocator_strategy=auto_growth" + TIMEOUT + 50) endif() if(WITH_GPU AND WITH_TESTING) diff --git a/test/cpp/fluid/memory/stream_safe_cuda_alloc_test.cu b/test/cpp/fluid/memory/stream_safe_cuda_alloc_test.cu index b0bebf5202eee2..91e896c803bec0 100644 --- a/test/cpp/fluid/memory/stream_safe_cuda_alloc_test.cu +++ b/test/cpp/fluid/memory/stream_safe_cuda_alloc_test.cu @@ -33,6 +33,14 @@ #include #endif +#define RETURN_IF_NOT_ENABLED \ + { \ + if (!memory::allocation::AllocatorFacade::Instance() \ + .IsStreamSafeCUDAAllocatorUsed()) { \ + return; \ + } \ + } + namespace paddle { namespace memory { @@ -54,6 +62,8 @@ void CheckMemLeak(const platform::CUDAPlace &place) { } TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) { + RETURN_IF_NOT_ENABLED; + platform::CUDAPlace place = platform::CUDAPlace(); size_t alloc_size = 256; @@ -81,6 +91,8 @@ TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) { } TEST(StreamSafeCUDAAllocInterfaceTest, GetAllocatorInterfaceTest) { + RETURN_IF_NOT_ENABLED; + platform::CUDAPlace place = platform::CUDAPlace(); size_t alloc_size = 256; @@ -104,6 +116,8 @@ TEST(StreamSafeCUDAAllocInterfaceTest, GetAllocatorInterfaceTest) { } TEST(StreamSafeCUDAAllocInterfaceTest, GetAllocatorWithDefaultStreamTest) { + RETURN_IF_NOT_ENABLED; + auto &instance = allocation::AllocatorFacade::Instance(); platform::CUDAPlace place = platform::CUDAPlace(); const std::shared_ptr allocator_implicit_stream = @@ -118,6 +132,8 @@ TEST(StreamSafeCUDAAllocInterfaceTest, GetAllocatorWithDefaultStreamTest) { } TEST(StreamSafeCUDAAllocInterfaceTest, ZeroSizeRecordStreamTest) { + RETURN_IF_NOT_ENABLED; + platform::CUDAPlace place = platform::CUDAPlace(); std::shared_ptr zero_size_allocation = AllocShared(place, 0); EXPECT_EQ(zero_size_allocation->ptr(), nullptr); @@ -139,6 +155,8 @@ TEST(StreamSafeCUDAAllocInterfaceTest, ZeroSizeRecordStreamTest) { } TEST(StreamSafeCUDAAllocInterfaceTest, GetStreamInterfaceTest) { + RETURN_IF_NOT_ENABLED; + platform::CUDAPlace place = platform::CUDAPlace(); size_t alloc_size = 256; @@ -176,6 +194,8 @@ TEST(StreamSafeCUDAAllocInterfaceTest, GetStreamInterfaceTest) { } TEST(StreamSafeCUDAAllocRetryTest, RetryTest) { + RETURN_IF_NOT_ENABLED; + platform::CUDAPlace 
place = platform::CUDAPlace(); gpuStream_t stream1, stream2; #ifdef PADDLE_WITH_CUDA @@ -403,17 +423,23 @@ class StreamSafeCUDAAllocTest : public ::testing::Test { }; TEST_F(StreamSafeCUDAAllocTest, CUDAMutilStreamTest) { + RETURN_IF_NOT_ENABLED; + MultiStreamRun(); CheckResult(); } TEST_F(StreamSafeCUDAAllocTest, CUDAMutilThreadMutilStreamTest) { + RETURN_IF_NOT_ENABLED; + MultiThreadMultiStreamRun(); CheckResult(); } #if (defined(PADDLE_WITH_CUDA) && (CUDA_VERSION >= 11000)) TEST_F(StreamSafeCUDAAllocTest, CUDAGraphTest) { + RETURN_IF_NOT_ENABLED; + MultiStreamRun(); CUDAGraphRun(); CheckResult(); diff --git a/test/legacy_test/test_auto_growth_allocator_gpu.py b/test/legacy_test/test_auto_growth_allocator_gpu.py index 3ac11c1baf86fb..c20c825032d6ac 100644 --- a/test/legacy_test/test_auto_growth_allocator_gpu.py +++ b/test/legacy_test/test_auto_growth_allocator_gpu.py @@ -25,6 +25,8 @@ { 'FLAGS_allocator_strategy': 'auto_growth', 'FLAGS_auto_growth_chunk_size_in_mb': 10, + # Async allocator does not support auto growth allocator. + 'FLAGS_use_cuda_malloc_async_allocator': 0, } ) From 216bfcc1b6747d1add85a4c09e25a2452c66d6ef Mon Sep 17 00:00:00 2001 From: houj04 <35131887+houj04@users.noreply.github.com> Date: Thu, 4 Jul 2024 13:51:37 +0800 Subject: [PATCH 05/16] [XPU] ut for save and load op (#65656) --- paddle/fluid/operators/save_op.cc | 17 ++++ paddle/phi/backends/xpu/xpu2_op_list.cc | 9 ++ paddle/phi/backends/xpu/xpu3_op_list.cc | 9 ++ test/cpp/fluid/CMakeLists.txt | 3 + test/cpp/fluid/save_load_op_test_xpu.cc | 123 ++++++++++++++++++++++++ 5 files changed, 161 insertions(+) create mode 100644 test/cpp/fluid/save_load_op_test_xpu.cc diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc index f025d278074215..8b0f0eb45ffa5c 100644 --- a/paddle/fluid/operators/save_op.cc +++ b/paddle/fluid/operators/save_op.cc @@ -105,6 +105,23 @@ PD_REGISTER_KERNEL(save, kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); } +#ifdef PADDLE_WITH_XPU +PD_REGISTER_KERNEL(save, + XPU, + ALL_LAYOUT, + ops::SaveKernel, + float, + double, + int, + uint8_t, + int8_t, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16) { + kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); +} +#endif + PD_REGISTER_KERNEL(save_sr, CPU, ALL_LAYOUT, diff --git a/paddle/phi/backends/xpu/xpu2_op_list.cc b/paddle/phi/backends/xpu/xpu2_op_list.cc index 5a371aa14116ed..a5681c7eaeef19 100644 --- a/paddle/phi/backends/xpu/xpu2_op_list.cc +++ b/paddle/phi/backends/xpu/xpu2_op_list.cc @@ -822,6 +822,15 @@ XPUOpMap& get_kl2_ops() { {"roll_grad", XPUKernelSet({phi::DataType::FLOAT32})}, {"rsqrt", XPUKernelSet({phi::DataType::FLOAT32})}, {"rsqrt_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"save", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT64, + phi::DataType::INT32, + phi::DataType::UINT8, + phi::DataType::INT8, + phi::DataType::INT64, + phi::DataType::FLOAT16, + phi::DataType::BFLOAT16})}, {"scale", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16, diff --git a/paddle/phi/backends/xpu/xpu3_op_list.cc b/paddle/phi/backends/xpu/xpu3_op_list.cc index e27587c8596f02..f3abe1726a5053 100644 --- a/paddle/phi/backends/xpu/xpu3_op_list.cc +++ b/paddle/phi/backends/xpu/xpu3_op_list.cc @@ -830,6 +830,15 @@ XPUOpMap& get_kl3_ops() { {"roll_grad", XPUKernelSet({phi::DataType::FLOAT32})}, {"rsqrt", XPUKernelSet({phi::DataType::FLOAT32})}, {"rsqrt_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"save", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT64, + 
phi::DataType::INT32, + phi::DataType::UINT8, + phi::DataType::INT8, + phi::DataType::INT64, + phi::DataType::FLOAT16, + phi::DataType::BFLOAT16})}, {"scale", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16, diff --git a/test/cpp/fluid/CMakeLists.txt b/test/cpp/fluid/CMakeLists.txt index 17d71d85c0d00a..a6b6ce43dfb7e3 100644 --- a/test/cpp/fluid/CMakeLists.txt +++ b/test/cpp/fluid/CMakeLists.txt @@ -28,6 +28,9 @@ paddle_test(assign_op_test SRCS assign_op_test.cc) paddle_test(scatter_test SRCS scatter_test.cc DEPS common) paddle_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc) paddle_test(save_load_op_test SRCS save_load_op_test.cc) +if(WITH_XPU) + paddle_test(save_load_op_test_xpu SRCS save_load_op_test_xpu.cc) +endif() paddle_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc) if(WITH_CINN) set(CINN_DEPS python) diff --git a/test/cpp/fluid/save_load_op_test_xpu.cc b/test/cpp/fluid/save_load_op_test_xpu.cc new file mode 100644 index 00000000000000..9541889c7e0c10 --- /dev/null +++ b/test/cpp/fluid/save_load_op_test_xpu.cc @@ -0,0 +1,123 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/float16.h" +#include "paddle/phi/core/kernel_registry.h" + +template +int SaveLoadOpTest(Place place, int dim_1, int dim_2) { + // use cpu place for ground truth + paddle::platform::CPUPlace cpu_place; + std::vector ground_truth_cpu(dim_1 * dim_2); + for (int i = 0; i < dim_1 * dim_2; i++) { + ground_truth_cpu[i] = static_cast(i); + } + + // scope, var, tensor and lod + paddle::framework::Scope scope; + auto var = scope.Var("test_var"); + auto tensor = var->GetMutable(); + tensor->Resize({dim_1, dim_2}); + paddle::framework::LoD expect_lod; + expect_lod.resize(1); + for (int i = 0; i < dim_1; i++) { + expect_lod[0].push_back(i); + } + tensor->set_lod(expect_lod); + T* src_mutable = tensor->mutable_data(place); + // copy cpu data to tensor + paddle::memory::Copy(place, + src_mutable, + cpu_place, + ground_truth_cpu.data(), + sizeof(T) * ground_truth_cpu.size()); + + // run save op + paddle::framework::AttributeMap attrs; + attrs.insert({"file_path", std::string("tensor.save")}); + auto save_op = paddle::framework::OpRegistry::CreateOp( + "save", {{"X", {"test_var"}}}, {}, attrs); + save_op->Run(scope, place); + + // result var and tensor + auto load_var = scope.Var("out_var"); + auto target = load_var->GetMutable(); + + // run load op + auto load_op = paddle::framework::OpRegistry::CreateOp( + "load", {}, {{"Out", {"out_var"}}}, attrs); + load_op->Run(scope, place); + + // copy result tensor data to cpu + T* actual = target->data(); + std::vector actual_cpu(dim_1 * dim_2); + paddle::memory::Copy(cpu_place, + actual_cpu.data(), + place, + actual, + sizeof(T) * ground_truth_cpu.size()); + + // check result: data + for (int i = 0; i < dim_1 * dim_2; i++) { + if (actual_cpu[i] != 
ground_truth_cpu[i]) { + return 1; + } + } + + // check result: lod + auto& actual_lod = target->lod(); + if (expect_lod.size() != actual_lod.size()) { + return 1; + } + for (size_t i = 0; i < expect_lod.size(); ++i) { // NOLINT + for (size_t j = 0; j < expect_lod[i].size(); ++j) { + if (expect_lod[i][j] != actual_lod[i][j]) { + return 1; + } + } + } + return 0; +} + +TEST(SaveLoadOp, XPU) { + paddle::platform::XPUPlace xpu_place(0); + paddle::platform::CPUPlace cpu_place; + int r = 0; + + r = SaveLoadOpTest(xpu_place, 3, 10); + EXPECT_EQ(r, 0); + r = SaveLoadOpTest(cpu_place, 3, 10); + EXPECT_EQ(r, 0); + + r = SaveLoadOpTest(xpu_place, 2, 128); + EXPECT_EQ(r, 0); + r = SaveLoadOpTest(cpu_place, 2, 128); + EXPECT_EQ(r, 0); + + r = SaveLoadOpTest( + xpu_place, 2, 128); + EXPECT_EQ(r, 0); + r = SaveLoadOpTest( + cpu_place, 2, 128); + EXPECT_EQ(r, 0); + + r = SaveLoadOpTest( + xpu_place, 4, 32); + EXPECT_EQ(r, 0); + r = SaveLoadOpTest( + cpu_place, 4, 32); + EXPECT_EQ(r, 0); +} From 5884585f165ace589b2b9051ae317f67c88ca3bf Mon Sep 17 00:00:00 2001 From: Hongqing-work <76149632+Hongqing-work@users.noreply.github.com> Date: Thu, 4 Jul 2024 14:12:56 +0800 Subject: [PATCH 06/16] [CINN]revert of move ShapeOptimization before PdToCinn (#65675) --- paddle/cinn/hlir/dialect/operator/transforms/add_cinn_pass.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/paddle/cinn/hlir/dialect/operator/transforms/add_cinn_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/add_cinn_pass.cc index 56db312570df6d..e9c7bbb41c3305 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/add_cinn_pass.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/add_cinn_pass.cc @@ -246,8 +246,10 @@ void ApplyCinnPass(::pir::Program* program, .file_name("original_programs.py") .dump_symbolic_shape(FLAGS_logging_pir_py_code_dump_symbolic_dims) .SaveIfFlagEnabled(); - ApplyShapeOptimizationPass(program, CreatePassManager); ApplyPdToCinnPass(program, CreatePassManager); + // TODO(Hongqing-work): move ApplyShapeOptimizationPass before + // ApplyPdToCinnPass after fixing infer shape bug. 
+ ApplyShapeOptimizationPass(program, CreatePassManager); ApplyCinnPreprocessPass(program, CreatePassManager); ApplyBuildGroupOpPass(program, CreatePassManager); PirToPyCodeConverter(program) From 091044b7ddecb851f2a02e680549ac296f1afc64 Mon Sep 17 00:00:00 2001 From: Yuanle Liu Date: Thu, 4 Jul 2024 14:21:55 +0800 Subject: [PATCH 07/16] inference use FLAGS_enable_pir_api control pir mode (#65596) * inference use FLAGS_enable_pir_api control pir mode * fix ut * fix --- paddle/fluid/inference/api/analysis_predictor.cc | 5 +++++ test/cpp/jit/layer_test.cc | 14 ++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 948a4b24f29c71..2f43ae37d4d8e6 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -139,6 +139,7 @@ #include "paddle/pir/include/pass/pass_registry.h" COMMON_DECLARE_bool(pir_apply_inplace_pass); +COMMON_DECLARE_bool(enable_pir_api); namespace paddle { namespace { @@ -390,6 +391,10 @@ AnalysisPredictor::AnalysisPredictor(const AnalysisConfig &config) if (config_.shape_range_info_collected()) { config_.SwitchIrOptim(false); } + if (FLAGS_enable_pir_api) { + config_.EnableNewExecutor(true); + config_.EnableNewIR(true); + } if (config_.new_executor_enabled()) { config_.EnableMemoryOptim(false); if (config_.new_ir_enabled()) { diff --git a/test/cpp/jit/layer_test.cc b/test/cpp/jit/layer_test.cc index 42fd976f6dbdd3..57c7bd9dedfbd9 100644 --- a/test/cpp/jit/layer_test.cc +++ b/test/cpp/jit/layer_test.cc @@ -55,6 +55,8 @@ PD_DECLARE_KERNEL(mean, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(scale, GPU, ALL_LAYOUT); #endif +COMMON_DECLARE_bool(enable_pir_api); + namespace paddle { namespace jit { using DenseTensor = phi::DenseTensor; @@ -77,6 +79,9 @@ TEST(CpuLayerTest, Function) { } TEST(CpuLayerTest, Construct) { + if (FLAGS_enable_pir_api) { + return; + } auto place = phi::CPUPlace(); std::string path = "./multi_program_load/export"; paddle::platform::Timer timer; @@ -125,6 +130,9 @@ TEST(CpuLayerTest, Construct) { } TEST(CpuLayerTest, Clone) { + if (FLAGS_enable_pir_api) { + return; + } auto place = phi::CPUPlace(); std::string path = "./multi_program_load/export"; @@ -161,6 +169,9 @@ TEST(CpuLayerTest, Clone) { #if defined(PADDLE_WITH_CUDA) TEST(GpuLayerTest, Construct) { + if (FLAGS_enable_pir_api) { + return; + } auto place = phi::GPUPlace(); std::string path = "./multi_program_load/export"; @@ -189,6 +200,9 @@ TEST(GpuLayerTest, Construct) { } TEST(GpuLayerTest, Clone) { + if (FLAGS_enable_pir_api) { + return; + } auto place = phi::GPUPlace(); std::string path = "./multi_program_load/export"; From 218e62d8acf1167bccc2fee1e9097f269961a5e6 Mon Sep 17 00:00:00 2001 From: Botao Zhou <1095497213@qq.com> Date: Thu, 4 Jul 2024 14:30:27 +0800 Subject: [PATCH 08/16] [Hackathon 6th No.28] Enhance paddle.round functionality -part (#64436) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add decimals for round * set default value * fix * fix round inplace * add round inplace func * empty * fix round on onednn * fix * remove redundant comments * re-run * change calculation process * fix bug * fix bug * fix bug * fix bug * fix bug * fix bug * fix bug * fix coverage * add attr in yaml file --- paddle/phi/kernels/activation_kernel.h | 7 +++-
paddle/phi/kernels/cpu/activation_kernel.cc | 14 +++++++- paddle/phi/kernels/funcs/activation_functor.h | 33 +++++++++++++++-- paddle/phi/kernels/gpu/activation_kernel.cu | 14 +++++++- .../phi/kernels/onednn/activation_kernel.cc | 26 ++++++++++++-- paddle/phi/ops/yaml/backward.yaml | 2 +- paddle/phi/ops/yaml/op_version.yaml | 8 +++++ paddle/phi/ops/yaml/ops.yaml | 3 +- python/paddle/tensor/ops.py | 22 +++++++++--- test/legacy_test/test_activation_op.py | 36 +++++++++++++++++-- 10 files changed, 150 insertions(+), 15 deletions(-) diff --git a/paddle/phi/kernels/activation_kernel.h b/paddle/phi/kernels/activation_kernel.h index bf3cb325160d36..4e94260bc6d129 100644 --- a/paddle/phi/kernels/activation_kernel.h +++ b/paddle/phi/kernels/activation_kernel.h @@ -68,7 +68,6 @@ DECLARE_ACTIVATION_KERNEL(Log) DECLARE_ACTIVATION_KERNEL(Log2) DECLARE_ACTIVATION_KERNEL(Log10) DECLARE_ACTIVATION_KERNEL(Log1p) -DECLARE_ACTIVATION_KERNEL(Round) DECLARE_ACTIVATION_KERNEL(Floor) DECLARE_ACTIVATION_KERNEL(Ceil) DECLARE_ACTIVATION_KERNEL(Negative) @@ -98,6 +97,12 @@ void Relu6Kernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out); +template +void RoundKernel(const Context& dev_ctx, + const DenseTensor& x, + const int decimals, + DenseTensor* out); + template void SwishKernel(const Context& dev_ctx, const DenseTensor& x, diff --git a/paddle/phi/kernels/cpu/activation_kernel.cc b/paddle/phi/kernels/cpu/activation_kernel.cc index fda8493c9f4523..22b63e6a0ecdf9 100644 --- a/paddle/phi/kernels/cpu/activation_kernel.cc +++ b/paddle/phi/kernels/cpu/activation_kernel.cc @@ -93,7 +93,6 @@ DEFINE_CPU_ACTIVATION_KERNEL(Rsqrt, RsqrtFunctor) DEFINE_CPU_ACTIVATION_KERNEL(Softsign, SoftsignFunctor) DEFINE_CPU_ACTIVATION_KERNEL(Sigmoid, SigmoidFunctor) DEFINE_CPU_ACTIVATION_KERNEL(LogSigmoid, LogSigmoidFunctor) -DEFINE_CPU_ACTIVATION_KERNEL(Round, RoundFunctor) DEFINE_CPU_ACTIVATION_KERNEL(Floor, FloorFunctor) DEFINE_CPU_ACTIVATION_KERNEL(Ceil, CeilFunctor) DEFINE_CPU_ACTIVATION_KERNEL(Negative, NegativeFunctor) @@ -161,6 +160,19 @@ void Relu6Kernel(const Context& dev_ctx, ActivationImpl>( dev_ctx, x, out, functor); } + +template +void RoundKernel(const Context& dev_ctx, + const DenseTensor& x, + const int decimals, + DenseTensor* out) { + funcs::RoundFunctor functor; + auto attrs = functor.GetAttrs(); + *(attrs[0].second) = decimals; + ActivationImpl>( + dev_ctx, x, out, functor); +} + } // namespace phi PD_REGISTER_KERNEL(relu, CPU, ALL_LAYOUT, phi::ReluKernel, float, double) {} diff --git a/paddle/phi/kernels/funcs/activation_functor.h b/paddle/phi/kernels/funcs/activation_functor.h index 8502c385f7b531..13e6bf0471c2d3 100644 --- a/paddle/phi/kernels/funcs/activation_functor.h +++ b/paddle/phi/kernels/funcs/activation_functor.h @@ -2949,9 +2949,23 @@ struct FloorFunctor : public BaseActivationFunctor { // round(x) = [x] template struct RoundFunctor : public BaseActivationFunctor { + int decimals; + + std::vector> GetAttrs() { + return {{"deciamls", &decimals}}; + } + template void operator()(Device d, X x, Out out) const { - out.device(d) = x.round(); + if (decimals == 0) { + out.device(d) = x.round(); + } else if (decimals > 0) { + auto ten_pow_deciamls = static_cast(std::pow(10, decimals)); + out.device(d) = (x * ten_pow_deciamls).round() / ten_pow_deciamls; + } else { + auto ten_pow_deciamls = static_cast(std::pow(10, -decimals)); + out.device(d) = (x / ten_pow_deciamls).round() * ten_pow_deciamls; + } } }; @@ -5161,11 +5175,26 @@ struct CudaFloorFunctor : public BaseActivationFunctor { template struct 
CudaRoundFunctor : public BaseActivationFunctor { using MPType = typename phi::dtype::MPTypeTrait::Type; + int decimals; + std::vector> GetAttrs() { + return {{"deciamls", &decimals}}; + } // round(x) = round(x) __device__ __forceinline__ T operator()(const T arg_x) const { MPType x = static_cast(arg_x); - return static_cast(round(x)); + + if (decimals == 0) { + return static_cast(round(x)); + } else if (decimals > 0) { + float ten_pow_deciamls = powf(10., decimals); + return static_cast(round(x * static_cast(ten_pow_deciamls)) / + ten_pow_deciamls); + } else { + float ten_pow_deciamls = powf(10., -decimals); + return static_cast(round(x / static_cast(ten_pow_deciamls)) * + ten_pow_deciamls); + } } }; diff --git a/paddle/phi/kernels/gpu/activation_kernel.cu b/paddle/phi/kernels/gpu/activation_kernel.cu index aa874c5e0dd81c..0ad0cb9f8c8f6c 100644 --- a/paddle/phi/kernels/gpu/activation_kernel.cu +++ b/paddle/phi/kernels/gpu/activation_kernel.cu @@ -110,7 +110,6 @@ DEFINE_GPU_ACTIVATION_KERNEL(Rsqrt, CudaRsqrtFunctor) DEFINE_GPU_ACTIVATION_KERNEL(Softsign, CudaSoftsignFunctor) DEFINE_GPU_ACTIVATION_KERNEL(Sigmoid, CudaSigmoidFunctor) DEFINE_GPU_ACTIVATION_KERNEL(LogSigmoid, CudaLogSigmoidFunctor) -DEFINE_GPU_ACTIVATION_KERNEL(Round, CudaRoundFunctor) DEFINE_GPU_ACTIVATION_KERNEL(Floor, CudaFloorFunctor) DEFINE_GPU_ACTIVATION_KERNEL(Ceil, CudaCeilFunctor) @@ -187,6 +186,19 @@ void Relu6Kernel(const Context& dev_ctx, ActivationGPUImpl>( dev_ctx, x, out, functor); } + +template +void RoundKernel(const Context& dev_ctx, + const DenseTensor& x, + const int decimals, + DenseTensor* out) { + funcs::CudaRoundFunctor functor; + auto attrs = functor.GetAttrs(); + *(attrs[0].second) = decimals; + ActivationGPUImpl>( + dev_ctx, x, out, functor); +} + } // namespace phi #ifdef PADDLE_WITH_HIP diff --git a/paddle/phi/kernels/onednn/activation_kernel.cc b/paddle/phi/kernels/onednn/activation_kernel.cc index a4757eab71c41e..247f2df5140d1b 100644 --- a/paddle/phi/kernels/onednn/activation_kernel.cc +++ b/paddle/phi/kernels/onednn/activation_kernel.cc @@ -21,7 +21,6 @@ #include "paddle/phi/common/place.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/activation_functor.h" - namespace phi { #define DEFINE_ONEDNN_ACTIVATION_KERNEL(name, functor_class) \ @@ -149,7 +148,30 @@ DEFINE_ONEDNN_ACTIVATION_KERNEL(Sqrt, SqrtOneDNNFunctor) DEFINE_ONEDNN_ACTIVATION_KERNEL(Tanh, TanhOneDNNFunctor) // round eltwise primitive doesn't support BF16, nor does it support grad -DEFINE_ONEDNN_ACTIVATION_KERNEL(Round, RoundOneDNNFunctor) +template +void RoundKernel(const Context& dev_ctx, + const DenseTensor& x, + const int decimals, + DenseTensor* out) { + float ten_pow_deciamls = std::pow(10, decimals); + + DenseTensor out1; + DenseTensorMeta meta_out(x.dtype(), x.dims()); + out1.set_meta(meta_out); + out1.set_lod(x.lod()); + out1.set_mem_desc(x.mem_desc()); + dev_ctx.template Alloc(&out1); + + for (int i = 0; i < x.numel(); i++) { + out1.data()[i] = x.data()[i] * ten_pow_deciamls; + } + RoundOneDNNFunctor functor; + functor(dev_ctx, out1, 0, 0, out); + + for (int i = 0; i < x.numel(); i++) { + out->data()[i] = out->data()[i] * (1 / ten_pow_deciamls); + } +} DEFINE_ONEDNN_ACT_KERNEL_WITH_ONE_ATTRS(Elu, EluOneDNNFunctor, alpha) DEFINE_ONEDNN_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, ReluOneDNNFunctor, alpha) diff --git a/paddle/phi/ops/yaml/backward.yaml b/paddle/phi/ops/yaml/backward.yaml index ac445e5e486d39..51175fbe6422e9 100644 --- a/paddle/phi/ops/yaml/backward.yaml +++ 
b/paddle/phi/ops/yaml/backward.yaml @@ -2609,7 +2609,7 @@ no_need_buffer : x - backward_op : round_grad - forward : round(Tensor x) -> Tensor(out) + forward : round(Tensor x, int decimals = 0 ) -> Tensor(out) args : (Tensor out_grad) output : Tensor(x_grad) infer_meta : diff --git a/paddle/phi/ops/yaml/op_version.yaml b/paddle/phi/ops/yaml/op_version.yaml index b6081079c4a328..3f4c8c20b414d8 100644 --- a/paddle/phi/ops/yaml/op_version.yaml +++ b/paddle/phi/ops/yaml/op_version.yaml @@ -494,6 +494,14 @@ - add_input : ShiftsTensor comment : The number of places by which the elements of the tensor are shifted. +- op : round + version : + - checkpoint : Add a new attribute [deciamls] to round + action : + - add_attr : decimals + comment : The number of decimal places rounded + default : 0.0 + - op : softmax_with_cross_entropy version : - checkpoint : Add a new attribute [use_softmax] diff --git a/paddle/phi/ops/yaml/ops.yaml b/paddle/phi/ops/yaml/ops.yaml index 50ec6fb78a97d7..c76cd20a149747 100755 --- a/paddle/phi/ops/yaml/ops.yaml +++ b/paddle/phi/ops/yaml/ops.yaml @@ -3788,10 +3788,11 @@ interfaces : paddle::dialect::InferSymbolicShapeInterface - op : round - args : (Tensor x) + args : (Tensor x, int decimals = 0 ) output : Tensor(out) infer_meta : func : UnchangedInferMeta + param : [x] kernel : func : round inplace : (x -> out) diff --git a/python/paddle/tensor/ops.py b/python/paddle/tensor/ops.py index b28d46fea5aee9..5439b8eb1a4a4c 100644 --- a/python/paddle/tensor/ops.py +++ b/python/paddle/tensor/ops.py @@ -35,7 +35,6 @@ 'rsqrt_', 'ceil_', 'floor_', - 'round_', 'reciprocal_', 'sigmoid_', 'abs_', @@ -687,7 +686,7 @@ def reciprocal(x: Tensor, name: str | None = None) -> Tensor: return out -def round(x: Tensor, name: str | None = None) -> Tensor: +def round(x: Tensor, decimals: int = 0, name: str | None = None) -> Tensor: """ Round the values in the input to the nearest integer value. @@ -704,6 +703,7 @@ def round(x: Tensor, name: str | None = None) -> Tensor: Args: x (Tensor): Input of Round operator, an N-D Tensor, with data type float32, float64 or float16. + decimals(int): Rounded decimal place (default: 0). name (str|None, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: @@ -721,17 +721,31 @@ def round(x: Tensor, name: str | None = None) -> Tensor: [-1., -0., 1., 2.]) """ if in_dynamic_or_pir_mode(): - return _C_ops.round(x) + return _C_ops.round(x, decimals) else: check_variable_and_dtype( x, 'x', ['float16', 'uint16', 'float32', 'float64'], 'round' ) helper = LayerHelper('round', **locals()) + attrs = { + 'decimals': int(decimals), + } out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type='round', inputs={"X": x}, outputs={"Out": out}) + helper.append_op( + type='round', inputs={"X": x}, outputs={"Out": out}, attrs=attrs + ) return out +@inplace_apis_in_dygraph_only +def round_(x, decimals=0, name=None): + r""" + Inplace version of ``round`` API, the output Tensor will be inplaced with input ``x``. + Please refer to :ref:`api_paddle_round`. + """ + return _C_ops.round_(x, decimals) + + def rsqrt(x: Tensor, name: str | None = None) -> Tensor: """ Rsqrt Activation Operator. 
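The ops.py hunk above adds a decimals argument to paddle.round (and an in-place round_ variant); as the functor changes earlier in this patch show, the kernel scales by 10^decimals, rounds, then scales back. A minimal usage sketch of the new parameter follows; the input values and the results noted in the comments are illustrative assumptions, not taken from the patch: a positive decimals keeps that many decimal places, a negative decimals rounds to the left of the decimal point.

import paddle

x = paddle.to_tensor([12.341, 67.897], dtype='float32')
a = paddle.round(x)               # default decimals=0, same behavior as before -> roughly [12., 68.]
b = paddle.round(x, decimals=2)   # keep two decimal places -> roughly [12.34, 67.90]
c = paddle.round(x, decimals=-1)  # round to the nearest ten -> roughly [10., 70.]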
diff --git a/test/legacy_test/test_activation_op.py b/test/legacy_test/test_activation_op.py index 1760f6d2ff2692..fb679e7091bff9 100644 --- a/test/legacy_test/test_activation_op.py +++ b/test/legacy_test/test_activation_op.py @@ -2593,18 +2593,23 @@ def setUp(self): self.python_api = paddle.round self.init_dtype() self.init_shape() + self.init_decimals() np.random.seed(1024) - x = np.random.uniform(-1, 1, self.shape).astype(self.dtype) - out = np.round(x) + x = np.random.uniform(-1, 1, self.shape).astype(self.dtype) * 100 + out = np.round(x, decimals=self.decimals) self.inputs = {'X': OpTest.np_dtype_to_base_dtype(x)} self.outputs = {'Out': out} + self.attrs = {'decimals': self.decimals} self.convert_input_output() def init_shape(self): self.shape = [10, 12] + def init_decimals(self): + self.decimals = 0 + def test_check_output(self): self.check_output( check_pir=True, check_pir_onednn=self.check_pir_onednn @@ -2619,6 +2624,33 @@ def init_shape(self): self.shape = [] +class TestRound_decimals1(TestRound): + def init_decimals(self): + self.decimals = 2 + + def test_round_api(self): + with dynamic_guard(): + for device in devices: + if device == 'cpu' or ( + device == 'gpu' and paddle.is_compiled_with_cuda() + ): + x_np = ( + np.random.uniform(-1, 1, self.shape).astype(self.dtype) + * 100 + ) + out_expect = np.round(x_np, decimals=self.decimals) + x_paddle = paddle.to_tensor( + x_np, dtype=self.dtype, place=device + ) + y = paddle.round(x_paddle, decimals=self.decimals) + np.testing.assert_allclose(y.numpy(), out_expect, rtol=1e-3) + + +class TestRound_decimals2(TestRound_decimals1): + def init_decimals(self): + self.decimals = -1 + + class TestRelu(TestActivation): def setUp(self): self.op_type = "relu" From 90a67e8473ec6514790466e6830c554b5e074e16 Mon Sep 17 00:00:00 2001 From: ming1753 <61511741+ming1753@users.noreply.github.com> Date: Thu, 4 Jul 2024 15:08:26 +0800 Subject: [PATCH 09/16] [Inference] Refine global search optimization for cuBLASLt and apply it in INT8 GEMM. (#65597) * [Inference] Refine global search optimization for cuBLASLt and apply it in INT8 GEMM --- paddle/common/flags.cc | 12 + .../kernels/funcs/blas/blaslt_gemm_search.h | 701 +++++++++++++++++ .../phi/kernels/funcs/blas/blaslt_impl.cu.h | 153 +++- .../fp8_gemm_with_cublasLt/cublaslt_gemm.h | 710 +----------------- 4 files changed, 871 insertions(+), 705 deletions(-) create mode 100644 paddle/phi/kernels/funcs/blas/blaslt_gemm_search.h diff --git a/paddle/common/flags.cc b/paddle/common/flags.cc index bf119fb1fff119..253c1a266e2ddb 100644 --- a/paddle/common/flags.cc +++ b/paddle/common/flags.cc @@ -1758,6 +1758,18 @@ PHI_DEFINE_EXPORTED_string( "If default, " "dlopen will search mkl from LD_LIBRARY_PATH"); +/** + * Apply global search in blaslt FLAG + * Name: enable_blaslt_global_search + * Since Version: 3.0.0 + * Value Range: bool, default=false + * Example: + * Note: If True, will apply global search in blaslt. + */ +PHI_DEFINE_EXPORTED_bool(enable_blaslt_global_search, + false, + "Whether to use global search in blaslt."); + PHI_DEFINE_EXPORTED_string(op_dir, // NOLINT "", "Specify path for loading user-defined op library."); diff --git a/paddle/phi/kernels/funcs/blas/blaslt_gemm_search.h b/paddle/phi/kernels/funcs/blas/blaslt_gemm_search.h new file mode 100644 index 00000000000000..92166603f6940b --- /dev/null +++ b/paddle/phi/kernels/funcs/blas/blaslt_gemm_search.h @@ -0,0 +1,701 @@ +/* Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include + +#include "paddle/phi/backends/dynload/cublasLt.h" +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/common/memory_utils.h" +#include "paddle/phi/core/dense_tensor.h" + +#include "paddle/phi/api/include/context_pool.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/allocator.h" + +namespace phi { +namespace funcs { +namespace cublaslt_internal { + +const std::array split_k_candidates = {2, 3, 4, 5, 6, 8, 12, 16, 32}; + +struct CublasLtAlgoSelectorParam { + cublasLtMatmulAlgo_t algo; + int m; + int n; + int k; + int algo_id; + int swizzle; + int custom_option; + int tile; + int split_k_val; + int reduction_scheme; + int stages; + void* workspace; + size_t workspace_size; + float time; +}; + +inline bool compare_algo_time(const CublasLtAlgoSelectorParam& param_a, + const CublasLtAlgoSelectorParam& param_b) { + return (param_a.time < param_b.time); +} + +class CublasLtAlgoCache { + public: + static CublasLtAlgoCache& Instance() { + static CublasLtAlgoCache instance(100); + return instance; + } + + template + void RunAndMeasureAlgo(cublasLtHandle_t handle, + cublasLtMatmulDesc_t matmul_desc, + cublasLtMatrixLayout_t a_desc, + cublasLtMatrixLayout_t b_desc, + cublasLtMatrixLayout_t bias_desc, + cublasLtMatrixLayout_t c_desc, + void* alpha, + void* beta, + const InT* a, + const InT* b, + const OutT* bias, + OutT* c, + CublasLtAlgoSelectorParam& param, // NOLINT + cudaEvent_t& start_event, // NOLINT + cudaEvent_t& stop_event, // NOLINT + cudaStream_t stream) { + cublasStatus_t status; + cublasLtMatmulHeuristicResult_t heuristic_result; + status = dynload::cublasLtMatmulAlgoCheck(handle, + matmul_desc, + a_desc, + b_desc, + bias_desc, + c_desc, + ¶m.algo, + &heuristic_result); + PADDLE_ENFORCE_GPU_SUCCESS(status); + if (status != CUBLAS_STATUS_SUCCESS) { + param.time = std::numeric_limits::max(); + return; + } + + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(start_event, stream)); + int repeats = search_times_; + + for (int loop = 0; loop < repeats; loop++) { + status = dynload::cublasLtMatmul(handle, + matmul_desc, + alpha, + a, + a_desc, + b, + b_desc, + beta, + bias, + bias_desc, + c, + c_desc, + ¶m.algo, + param.workspace, + param.workspace_size, + stream); + if (status != CUBLAS_STATUS_SUCCESS) { + param.time = std::numeric_limits::max(); + return; + } + } + + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(stop_event, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); + + float time; + PADDLE_ENFORCE_GPU_SUCCESS( + cudaEventElapsedTime(&time, start_event, stop_event)); + + param.time = time / repeats; + } + + template + cublasLtMatmulAlgo_t* CublasLtAlgoSelect(cublasLtHandle_t handle, + int m, + int n, + int k, + int batch_count, + const InT* a, + const InT* b, + const OutT* bias, + OutT* c, + void* alpha, + void* beta, + cublasLtMatmulDesc_t matmul_desc, + cublasLtMatrixLayout_t a_desc, + 
cublasLtMatrixLayout_t b_desc, + cublasLtMatrixLayout_t bias_desc, + cublasLtMatrixLayout_t c_desc, + cublasComputeType_t compute_type, + cudaDataType_t scale_type, + cudaDataType_t a_type, + cudaDataType_t b_type, + cudaDataType_t bias_type, + cudaDataType_t c_type, + cudaStream_t stream) { + // If we don't have config file and we donot search, here return nullptr + if (!has_config_file_ && search_times_ <= 0) { + return nullptr; + } + + // VLOG(0) << "m n k: " << m << " " << n << " " << k; + + int64_t seed = 0; + std::hash hash_fn; + + HashMatmulDesc(matmul_desc, &seed, hash_fn); + HashMatrixLayoutDesc(a_desc, &seed, hash_fn); + HashMatrixLayoutDesc(b_desc, &seed, hash_fn); + HashMatrixLayoutDesc(bias_desc, &seed, hash_fn); + HashMatrixLayoutDesc(c_desc, &seed, hash_fn); + + cublasLtMatmulAlgo_t ret; + { + std::lock_guard lock(cache_mutex_); + auto it = map_.find(seed); + if (it != map_.end()) { + VLOG(3) << "CublasLtAlgoSelect Found in cache"; + return &(it->second); + } else { + // if we have cache but not found algo, and we don't want to search, + // here return nullptr + if (search_times_ <= 0) { + return nullptr; + } + } + } + VLOG(3) << "CublasLtAlgoSelect Not Found in cache"; + + // Get Ids + // https://docs.nvidia.com/cuda/cublas/index.html#cublasLtMatmulAlgoGetIds + cublasStatus_t status = CUBLAS_STATUS_SUCCESS; + // std::vector algo_ids(requested_algo_count_); + int algo_ids[requested_algo_count_]; // NOLINT + + int num_algo_ids; + status = dynload::cublasLtMatmulAlgoGetIds(handle, + compute_type, + scale_type, + a_type, + b_type, + bias_type, + c_type, + requested_algo_count_, + algo_ids, + &num_algo_ids); + PADDLE_ENFORCE_GPU_SUCCESS(status); + + // Traverse all posssible algo combinations + int step = 0; + int limit = 20000; + std::vector params; + + for (int idx = 0; idx < num_algo_ids; idx++) { + cublasLtMatmulAlgo_t algo; + + /* Initialize algo structure with given Algp ID */ + // https://docs.nvidia.com/cuda/cublas/index.html#cublasLtMatmulAlgoInit + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulAlgoInit(handle, + compute_type, + scale_type, + a_type, + b_type, + bias_type, + c_type, + algo_ids[idx], + &algo)); + + // Query the tiles enums supported by that algo which is used to alloc + // enough space to store it + // https://docs.nvidia.com/cuda/cublas/index.html#cublasLtMatmulAlgoCapGetAttribute + size_t attr_size = 0; + + int batch_support; + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulAlgoCapGetAttribute( + &algo, + CUBLASLT_ALGO_CAP_STRIDED_BATCH_SUPPORT, + &batch_support, + sizeof(batch_support), + &attr_size)); + if (batch_count > 1 && batch_support == 0) { + continue; + } + + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulAlgoCapGetAttribute( + &algo, CUBLASLT_ALGO_CAP_TILE_IDS, nullptr, 0, &attr_size)); + + int num_tiles = static_cast(attr_size / sizeof(int)); + std::vector tiles(num_tiles == 0 ? 1 : num_tiles); + if (num_tiles == 0) { + tiles[0] = CUBLASLT_MATMUL_TILE_UNDEFINED; + num_tiles = 1; + } else { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulAlgoCapGetAttribute( + &algo, + CUBLASLT_ALGO_CAP_TILE_IDS, + tiles.data(), + sizeof(int) * num_tiles, + &attr_size)); + } + + // Query the stages enums supported by that algo (cuda must >= 11.0) + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulAlgoCapGetAttribute( + &algo, CUBLASLT_ALGO_CAP_STAGES_IDS, nullptr, 0, &attr_size)); + int num_stages = static_cast(attr_size / sizeof(int)); + std::vector stages(num_stages == 0 ? 
1 : num_stages); + if (num_stages == 0) { + stages[0] = CUBLASLT_MATMUL_STAGES_UNDEFINED; + num_stages = 1; + } else { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulAlgoCapGetAttribute( + &algo, + CUBLASLT_ALGO_CAP_STAGES_IDS, + stages.data(), + sizeof(int) * num_stages, + &attr_size)); + } + + // Retrieve Other Algo Capabilities attributes + int splitk_support, red_mask, swizzling_max, custom_option_max; + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulAlgoCapGetAttribute( + &algo, + CUBLASLT_ALGO_CAP_SPLITK_SUPPORT, + &splitk_support, + sizeof(splitk_support), + &attr_size)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulAlgoCapGetAttribute( + &algo, + CUBLASLT_ALGO_CAP_REDUCTION_SCHEME_MASK, + &red_mask, + sizeof(red_mask), + &attr_size)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulAlgoCapGetAttribute( + &algo, + CUBLASLT_ALGO_CAP_CTA_SWIZZLING_SUPPORT, + &swizzling_max, + sizeof(swizzling_max), + &attr_size)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulAlgoCapGetAttribute( + &algo, + CUBLASLT_ALGO_CAP_CUSTOM_OPTION_MAX, + &custom_option_max, + sizeof(custom_option_max), + &attr_size)); + + /* Loop over the different tiles */ + for (int tile_id = 0; tile_id < num_tiles && step < limit; tile_id++) { + /* Loop over different stages count */ + for (int stage_id = 0; stage_id < num_stages && step < limit; + stage_id++) { + /* Loop over the different custom option if any */ + for (int custom_option = 0; + custom_option <= custom_option_max && step < limit; + custom_option++) { + /* Loop over the CTAs swizzling support */ + for (int k = 0; k <= swizzling_max && step < limit; k++) { + int splir_k_trial = 0; + if (splitk_support) { + splir_k_trial += + sizeof(split_k_candidates) / sizeof(split_k_candidates[0]); + } + + for (int l = 0; (l < (1 + splir_k_trial)) && (step < limit); + l++) { + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cublasLtMatmulAlgoConfigSetAttribute( + &algo, + CUBLASLT_ALGO_CONFIG_TILE_ID, + &tiles[tile_id], + sizeof(tiles[tile_id]))); + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cublasLtMatmulAlgoConfigSetAttribute( + &algo, + CUBLASLT_ALGO_CONFIG_STAGES_ID, + &stages[stage_id], + sizeof(stages[stage_id]))); + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cublasLtMatmulAlgoConfigSetAttribute( + &algo, + CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, + &custom_option, + sizeof(custom_option))); + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cublasLtMatmulAlgoConfigSetAttribute( + &algo, + CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, + &k, + sizeof(k))); + int split_k_val = 1; + int reduction_scheme = CUBLASLT_REDUCTION_SCHEME_NONE; + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cublasLtMatmulAlgoConfigSetAttribute( + &algo, + CUBLASLT_ALGO_CONFIG_SPLITK_NUM, + &split_k_val, + sizeof(split_k_val))); + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cublasLtMatmulAlgoConfigSetAttribute( + &algo, + CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, + &reduction_scheme, + sizeof(int))); + if (l > 0) { // Split-K case + split_k_val = split_k_candidates[l - 1]; + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cublasLtMatmulAlgoConfigSetAttribute( + &algo, + CUBLASLT_ALGO_CONFIG_SPLITK_NUM, + &split_k_candidates[l - 1], + sizeof(split_k_candidates[l - 1]))); + for (reduction_scheme = 1; + reduction_scheme < + static_cast(CUBLASLT_REDUCTION_SCHEME_MASK) && + (step < limit); + reduction_scheme = reduction_scheme << 1) { + if (reduction_scheme & red_mask) { + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cublasLtMatmulAlgoConfigSetAttribute( + &algo, + CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, + &reduction_scheme, + 
sizeof(reduction_scheme))); + + cublasLtMatmulHeuristicResult_t heurResult; + status = dynload::cublasLtMatmulAlgoCheck(handle, + matmul_desc, + a_desc, + b_desc, + bias_desc, + c_desc, + &algo, + &heurResult); + if (status == CUBLAS_STATUS_SUCCESS) { + size_t temp_storage_bytes = heurResult.workspaceSize; + auto d_temp_storage = phi::memory_utils::Alloc( + phi::GPUPlace( + phi::backends::gpu::GetCurrentDeviceId()), + temp_storage_bytes); + + CublasLtAlgoSelectorParam algo_select_params; + algo_select_params.algo = algo; + algo_select_params.m = m; + algo_select_params.n = n; + algo_select_params.k = k; + algo_select_params.algo_id = algo_ids[idx]; + algo_select_params.tile = tiles[tile_id]; + algo_select_params.swizzle = k; + algo_select_params.custom_option = custom_option; + algo_select_params.split_k_val = split_k_val; + algo_select_params.reduction_scheme = reduction_scheme; + algo_select_params.stages = stages[stage_id]; + algo_select_params.workspace_size = temp_storage_bytes; + algo_select_params.workspace = d_temp_storage->ptr(); + params.emplace_back(algo_select_params); + step++; + } + } // end if + } + } else { + // Prepare algos + cublasLtMatmulHeuristicResult_t heurResult; + // https://docs.nvidia.com/cuda/cublas/index.html#cublasLtMatmulAlgoCheck + status = dynload::cublasLtMatmulAlgoCheck(handle, + matmul_desc, + a_desc, + b_desc, + bias_desc, + c_desc, + &algo, + &heurResult); + if (status == CUBLAS_STATUS_SUCCESS) { + size_t temp_storage_bytes = heurResult.workspaceSize; + auto d_temp_storage = phi::memory_utils::Alloc( + phi::GPUPlace(backends::gpu::GetCurrentDeviceId()), + temp_storage_bytes); + CublasLtAlgoSelectorParam algo_select_params; + algo_select_params.algo = algo; + algo_select_params.m = m; + algo_select_params.n = n; + algo_select_params.k = k; + algo_select_params.algo_id = algo_ids[idx]; + algo_select_params.tile = tiles[tile_id]; + algo_select_params.swizzle = k; + algo_select_params.custom_option = custom_option; + algo_select_params.split_k_val = split_k_val; + algo_select_params.reduction_scheme = reduction_scheme; + algo_select_params.stages = stages[stage_id]; + algo_select_params.workspace_size = temp_storage_bytes; + algo_select_params.workspace = d_temp_storage->ptr(); + params.emplace_back(algo_select_params); + step++; + } + } + } + } + } + } + } + } + cudaEvent_t start_event; + cudaEvent_t stop_event; + + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventCreate(&start_event)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventCreate(&stop_event)); + + if (step == 0) { + VLOG(3) << "No algo can be used"; + return nullptr; + } + + VLOG(3) << "CublasLtAlgoSelect Start testRun " << step << " " + << params.size(); + + for (int i = 0; i < step; i++) { + RunAndMeasureAlgo(handle, + matmul_desc, + a_desc, + b_desc, + bias_desc, + c_desc, + alpha, + beta, + a, + b, + bias, + c, + params[i], + start_event, + stop_event, + stream); + } + std::sort(params.begin(), params.end(), compare_algo_time); + + size_t res_id = 0; + while (params[res_id].time == 0) res_id++; + + if (res_id >= params.size()) { + VLOG(3) << "No algo can be used"; + return nullptr; + } + + VLOG(3) << "algo selected"; + + ret = params[res_id].algo; + std::lock_guard lock(cache_mutex_); + auto& algo_in_map = map_[seed]; + algo_in_map = ret; + return &algo_in_map; + } + + // Serialize map_ to cache file + void serialize_algo_cache_file() { + if (search_times_ > 0) { + int dev; + cudaGetDevice(&dev); + if (dev == 0) { + std::ofstream outfile; + outfile.open(config_filename_, std::ios::out | std::ios::trunc); + 
outfile << dynload::cublasLtGetCudartVersion() << std::endl; + + for (const auto& p : map_) { + outfile << p.first << " "; + for (size_t i : p.second.data) { + outfile << i << " "; + } + outfile << std::endl; + } + outfile.close(); + } + } + } + ~CublasLtAlgoCache() { serialize_algo_cache_file(); } + + private: + explicit CublasLtAlgoCache(int search_times) + : search_times_(search_times), has_config_file_(true) { + // Init map_ from cache file + std::ifstream infile; + infile.open(config_filename_); + if (!infile.is_open()) { + has_config_file_ = false; + VLOG(3) << "No CublasLtAlgoCache file found"; + return; + } + size_t cublaslt_version, real_cublaslt_version; + int64_t seed = 0; + std::array algo_data; + infile >> cublaslt_version; + VLOG(1) << "cublaslt_version " << cublaslt_version; + + if (dynload::cublasLtGetCudartVersion() != cublaslt_version) { + LOG(INFO) << config_filename_ + << " is not compatible with current cublaslt_version " + << real_cublaslt_version; + return; + } + + while (!infile.eof()) { + infile >> seed >> algo_data[0] >> algo_data[1] >> algo_data[2] >> + algo_data[3] >> algo_data[4] >> algo_data[5] >> algo_data[6] >> + algo_data[7]; + + for (int i = 0; i < 8; ++i) { + map_[seed].data[i] = algo_data[i]; + } + } + infile.close(); + } + + std::string config_filename_{"./paddle_cublaslt_cache"}; + std::unordered_map map_; + int search_times_; + static constexpr int requested_algo_count_ = 100; + std::mutex cache_mutex_; + bool has_config_file_; + + inline int64_t RoundToNextHighPowOfTwo(int64_t n, int64_t min_val) { + n--; + n |= (n >> 1); + n |= (n >> 2); + n |= (n >> 4); + n |= (n >> 8); + n |= (n >> 16); + return std::max(min_val, (n + 1)); + } + + void HashMatmulDesc(cublasLtMatmulDesc_t desc, + int64_t* seed, + const std::hash& hash_fn) { + size_t size_to_write; + int trans_a, trans_b; + uint32_t epilogue; + // int8_t fast_accum; + + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cublasLtMatmulDescGetAttribute(desc, + CUBLASLT_MATMUL_DESC_TRANSA, + &trans_a, + sizeof(trans_a), + &size_to_write)); + HashValue(seed, hash_fn, static_cast(trans_a)); + + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cublasLtMatmulDescGetAttribute(desc, + CUBLASLT_MATMUL_DESC_TRANSB, + &trans_b, + sizeof(trans_b), + &size_to_write)); + HashValue(seed, hash_fn, static_cast(trans_b)); + + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cublasLtMatmulDescGetAttribute(desc, + CUBLASLT_MATMUL_DESC_EPILOGUE, + &epilogue, + sizeof(epilogue), + &size_to_write)); + HashValue(seed, hash_fn, static_cast(epilogue)); + + // PADDLE_ENFORCE_GPU_SUCCESS( + // dyl::cublasLtMatmulDescGetAttribute(desc, + // CUBLASLT_MATMUL_DESC_FAST_ACCUM, + // &fast_accum, + // sizeof(fast_accum), + // &size_to_write)); + // HashValue(seed, hash_fn, static_cast(fast_accum)); + } + + void HashMatrixLayoutDesc(cublasLtMatrixLayout_t desc, + int64_t* seed, + const std::hash& hash_fn) { + size_t size_to_write; + uint32_t dtype; + int32_t batch; + uint64_t row, col; + int64_t ld, batch_offset; + + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cublasLtMatrixLayoutGetAttribute(desc, + CUBLASLT_MATRIX_LAYOUT_TYPE, + &dtype, + sizeof(dtype), + &size_to_write)); + HashValue(seed, hash_fn, static_cast(dtype)); + + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatrixLayoutGetAttribute( + desc, + CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, + &batch, + sizeof(batch), + &size_to_write)); + HashValue(seed, hash_fn, static_cast(batch)); + + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatrixLayoutGetAttribute( + desc, CUBLASLT_MATRIX_LAYOUT_ROWS, &row, sizeof(row), 
&size_to_write)); + HashValue(seed, hash_fn, RoundToNextHighPowOfTwo(row, 32)); + + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatrixLayoutGetAttribute( + desc, CUBLASLT_MATRIX_LAYOUT_COLS, &col, sizeof(col), &size_to_write)); + HashValue(seed, hash_fn, RoundToNextHighPowOfTwo(col, 32)); + + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatrixLayoutGetAttribute( + desc, CUBLASLT_MATRIX_LAYOUT_LD, &ld, sizeof(ld), &size_to_write)); + HashValue(seed, hash_fn, RoundToNextHighPowOfTwo(ld, 32)); + + // PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutGetAttribute( + // desc, CUBLASLT_MATRIX_LAYOUT_ROWS, &row, sizeof(row), + // &size_to_write)); + // HashValue(seed, hash_fn, row); + + // PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutGetAttribute( + // desc, CUBLASLT_MATRIX_LAYOUT_COLS, &col, sizeof(col), + // &size_to_write)); + // HashValue(seed, hash_fn, col); + + // PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutGetAttribute( + // desc, CUBLASLT_MATRIX_LAYOUT_LD, &ld, sizeof(ld), &size_to_write)); + // HashValue(seed, hash_fn, ld); + + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatrixLayoutGetAttribute( + desc, + CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, + &batch_offset, + sizeof(batch_offset), + &size_to_write)); + HashValue(seed, hash_fn, static_cast(batch_offset)); + } + + void HashValue(int64_t* seed, + const std::hash& hash_fn, + int64_t value) { + *seed ^= hash_fn(value) + 0x9e3779b9 + (*seed << 6) + (*seed >> 2); + } +}; + +} // namespace cublaslt_internal +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h b/paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h index 08d05d50b8bc70..5ffc7767f05847 100644 --- a/paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h +++ b/paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h @@ -28,8 +28,10 @@ limitations under the License. 
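A standalone, illustrative sketch (not taken from this patch) of the cache-key scheme used by HashMatmulDesc/HashMatrixLayoutDesc above: attribute values are folded into a single int64 seed with a boost-style hash combiner, after rounding matrix extents up to the next power of two (minimum 32) so that nearby shapes map to the same cache entry. The attribute values in main() are made-up examples; only the combiner and rounding rule mirror the header above.

#include <algorithm>
#include <cstdint>
#include <functional>
#include <iostream>

// Same rounding rule as in the header above: next power of two, clamped to >= min_val.
int64_t RoundToNextHighPowOfTwo(int64_t n, int64_t min_val) {
  n--;
  n |= (n >> 1);
  n |= (n >> 2);
  n |= (n >> 4);
  n |= (n >> 8);
  n |= (n >> 16);
  return std::max(min_val, n + 1);
}

// Boost-style hash combiner, same formula as HashValue in the header above.
void HashValue(int64_t* seed, const std::hash<int64_t>& hash_fn, int64_t value) {
  *seed ^= hash_fn(value) + 0x9e3779b9 + (*seed << 6) + (*seed >> 2);
}

int main() {
  std::hash<int64_t> hash_fn;
  int64_t seed = 0;
  // Hypothetical stand-ins for the attributes the real code reads via cuBLASLt
  // (transA, transB, epilogue, then the rounded matrix extents).
  const int64_t values[] = {0, 1, 1,
                            RoundToNextHighPowOfTwo(1000, 32),   // rows -> 1024
                            RoundToNextHighPowOfTwo(500, 32)};   // cols -> 512
  for (int64_t v : values) {
    HashValue(&seed, hash_fn, v);
  }
  std::cout << "cache key seed = " << seed << "\n";
  return 0;
}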
*/ #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/kernels/autotune/gpu_timer.h" #include "paddle/phi/kernels/autotune/switch_autotune.h" +#include "paddle/phi/kernels/funcs/blas/blaslt_gemm_search.h" COMMON_DECLARE_int64(cublaslt_exhaustive_search_times); +COMMON_DECLARE_bool(enable_blaslt_global_search); #endif namespace phi { @@ -197,6 +199,14 @@ struct MatmulDescriptor { cublasLtMatrixLayout_t out_desc{nullptr}; cublasLtMatmulAlgo_t* algo{nullptr}; bool is_cached{false}; + int64_t M_{-1}; + int64_t N_{-1}; + int64_t K_{-1}; + cublasComputeType_t compute_type_; + cudaDataType_t scale_type_; + cudaDataType_t x_type_; + cudaDataType_t y_type_; + cudaDataType_t out_type_; MatmulDescriptor() {} MatmulDescriptor(const MatmulDescriptor& obj) { @@ -276,6 +286,15 @@ struct MatmulDescriptor { SetBatchAndStride(y_desc, batch_size, stride_y); SetBatchAndStride(out_desc, batch_size, stride_out); } + + M_ = M; + N_ = N; + K_ = K; + compute_type_ = compute_type; + scale_type_ = scale_type; + x_type_ = mat_type; + y_type_ = mat_type; + out_type_ = out_mat_type; } cublasLtMatmulAlgo_t* SetAlgo() { @@ -668,27 +687,48 @@ struct CublasLtBase { cublasLtHandle_t cublaslt_handle = ctx.cublaslt_handle(); size_t workspace_size = static_cast(4) * 1024 * 1024; - phi::Allocator::AllocationPtr workspace = GetWorkspace(ctx, workspace_size); - - if (planner != nullptr) { - if (phi::autotune::AutoTuneStatus::Instance().UseAutoTune() && - (!desc->is_cached)) { - SearchBestAlgo(ctx, - cublaslt_handle, - desc, - static_cast(&alpha), - static_cast(&beta), - y_ptr, - x_ptr, - out_ptr, - workspace->ptr(), - workspace_size); - MatmulDescriptor* best_desc = new MatmulDescriptor(*desc); - VLOG(6) << best_desc->GetDescResultString( - "[Searched CublasltDescriptor] "); - - auto& cache = phi::autotune::AutoTuneCache::Instance().GetMatmul(); - cache.SetSubKey(sub_key, reinterpret_cast(best_desc)); + phi::Allocator::AllocationPtr workspace = nullptr; + + if (FLAGS_enable_blaslt_global_search && planner != nullptr && + !desc->is_cached) { + SearchBestAlgoGlobal(ctx, + cublaslt_handle, + desc, + static_cast(&alpha), + static_cast(&beta), + y_ptr, + x_ptr, + out_ptr, + workspace, + workspace_size); + MatmulDescriptor* best_desc = new MatmulDescriptor(*desc); + VLOG(6) << best_desc->GetDescResultString( + "[Searched CublasltDescriptor] "); + + auto& cache = phi::autotune::AutoTuneCache::Instance().GetMatmul(); + cache.SetSubKey(sub_key, reinterpret_cast(best_desc)); + } else { + workspace = GetWorkspace(ctx, workspace_size); + if (planner != nullptr) { + if (phi::autotune::AutoTuneStatus::Instance().UseAutoTune() && + (!desc->is_cached)) { + SearchBestAlgo(ctx, + cublaslt_handle, + desc, + static_cast(&alpha), + static_cast(&beta), + y_ptr, + x_ptr, + out_ptr, + workspace->ptr(), + workspace_size); + MatmulDescriptor* best_desc = new MatmulDescriptor(*desc); + VLOG(6) << best_desc->GetDescResultString( + "[Searched CublasltDescriptor] "); + + auto& cache = phi::autotune::AutoTuneCache::Instance().GetMatmul(); + cache.SetSubKey(sub_key, reinterpret_cast(best_desc)); + } } } @@ -712,6 +752,77 @@ struct CublasLtBase { ctx.stream())); } + static void SearchBestAlgoGlobal( + const phi::GPUContext& ctx, + const cublasLtHandle_t& lt_handle, + MatmulDescriptor* desc, + const void* alpha, + const void* beta, + const void* y_data, + const void* x_data, + void* out_data, + phi::Allocator::AllocationPtr& workspace, // NOLINT + size_t& workspace_size) { // NOLINT + void* bias_ptr = nullptr; + cublasLtMatmulAlgo_t* algo = + 
cublaslt_internal::CublasLtAlgoCache::Instance().CublasLtAlgoSelect( + lt_handle, + desc->M_, + desc->N_, + desc->K_, + 1, + y_data, + x_data, + bias_ptr, + out_data, + const_cast(alpha), + const_cast(beta), + desc->op_desc, + desc->y_desc, + desc->x_desc, + desc->out_desc, + desc->out_desc, + desc->compute_type_, + desc->scale_type_, + desc->y_type_, + desc->x_type_, + desc->out_type_, + desc->out_type_, + ctx.stream()); + if (algo == nullptr) { + LOG(WARNING) << "CublasLtAlgoSelect failed, result is empty! We attempt " + "to use Heuristic search."; + workspace_size = static_cast(64) * 1024 * 1024; + workspace = GetWorkspace(ctx, workspace_size); + SearchBestAlgo(ctx, + lt_handle, + desc, + static_cast(&alpha), + static_cast(&beta), + y_data, + x_data, + out_data, + workspace->ptr(), + workspace_size); + } else { + cublasLtMatmulHeuristicResult_t heurResult; + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cublasLtMatmulAlgoCheck(ctx.cublaslt_handle(), + desc->op_desc, + desc->y_desc, + desc->x_desc, + desc->out_desc, + desc->out_desc, + algo, + &heurResult)); + cublasLtMatmulAlgo_t* best_algo = desc->SetAlgo(); + *best_algo = *algo; + workspace_size = heurResult.workspaceSize; + workspace = phi::memory_utils::Alloc( + phi::GPUPlace(backends::gpu::GetCurrentDeviceId()), workspace_size); + } + } + static void SearchBestAlgo(const phi::GPUContext& ctx, const cublasLtHandle_t& lt_handle, MatmulDescriptor* desc, diff --git a/paddle/phi/kernels/fusion/fp8_gemm/fp8_gemm_with_cublasLt/cublaslt_gemm.h b/paddle/phi/kernels/fusion/fp8_gemm/fp8_gemm_with_cublasLt/cublaslt_gemm.h index e661a6af7d0e75..c679c4d02f57eb 100644 --- a/paddle/phi/kernels/fusion/fp8_gemm/fp8_gemm_with_cublasLt/cublaslt_gemm.h +++ b/paddle/phi/kernels/fusion/fp8_gemm/fp8_gemm_with_cublasLt/cublaslt_gemm.h @@ -31,6 +31,7 @@ limitations under the License. 
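A rough, self-contained sketch (placeholder types only, not the real cuBLASLt/Paddle API) of the control flow SearchBestAlgoGlobal above implements: try the exhaustive, cache-backed selection first; if it yields nothing, fall back to the heuristic search with a larger 64 MB workspace; otherwise keep the searched algorithm and size the workspace from the heuristic-check result.

#include <cstddef>
#include <iostream>
#include <optional>

struct Algo {
  int id = -1;
  std::size_t workspace_bytes = 0;
};

// Stand-in for CublasLtAlgoCache::Instance().CublasLtAlgoSelect(...): empty
// when neither the cache nor an exhaustive search produces an algorithm.
std::optional<Algo> ExhaustiveSelect(bool have_result) {
  if (!have_result) return std::nullopt;
  return Algo{7, std::size_t{32} << 20};  // algo id + workspace from AlgoCheck
}

// Stand-in for the heuristic SearchBestAlgo fallback.
Algo HeuristicSelect(std::size_t workspace_bytes) {
  return Algo{0, workspace_bytes};
}

Algo SelectAlgoGlobal(bool exhaustive_found) {
  if (auto algo = ExhaustiveSelect(exhaustive_found)) {
    return *algo;  // use the searched algo and the workspace it was checked with
  }
  // "CublasLtAlgoSelect failed, result is empty": retry with heuristic search
  // and a 64 MB workspace, mirroring the fallback branch above.
  return HeuristicSelect(std::size_t{64} << 20);
}

int main() {
  const Algo algo = SelectAlgoGlobal(/*exhaustive_found=*/false);
  std::cout << "algo id " << algo.id << ", workspace " << algo.workspace_bytes
            << " bytes\n";
  return 0;
}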
*/ #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/allocator.h" +#include "paddle/phi/kernels/funcs/blas/blaslt_gemm_search.h" namespace dyl = phi::dynload; @@ -48,666 +49,6 @@ namespace cutlass_internal { "refer https://docs.nvidia.com/cuda/cublas/index.html to get more " \ "information")) -const int split_k_candidates[] = {2, 3, 4, 5, 6, 8, 12, 16, 32}; - -struct CublasLtAlgoSelectorParam { - cublasLtMatmulAlgo_t algo; - int m; - int n; - int k; - int algo_id; - int swizzle; - int custom_option; - int tile; - int split_k_val; - int reduction_scheme; - int stages; - void* workspace; - size_t workspace_size; - float time; -}; - -inline bool compare_algo_time(const CublasLtAlgoSelectorParam& param_a, - const CublasLtAlgoSelectorParam& param_b) { - return (param_a.time < param_b.time); -} - -class CublasLtAlgoCache { - public: - static CublasLtAlgoCache& Instance() { - static CublasLtAlgoCache instance(100); - return instance; - } - - template - void TestMatmulRun(cublasLtHandle_t handle, - cublasLtMatmulDesc_t matmul_desc, - cublasLtMatrixLayout_t a_desc, - cublasLtMatrixLayout_t b_desc, - cublasLtMatrixLayout_t bias_desc, - cublasLtMatrixLayout_t c_desc, - void* alpha, - void* beta, - const InT* a, - const InT* b, - const OutT* bias, - OutT* c, - CublasLtAlgoSelectorParam& param, // NOLINT - cudaEvent_t& start_event, // NOLINT - cudaEvent_t& stop_event, // NOLINT - cudaStream_t stream) { - cublasStatus_t status; - cublasLtMatmulHeuristicResult_t heuristic_result; - status = dyl::cublasLtMatmulAlgoCheck(handle, - matmul_desc, - a_desc, - b_desc, - bias_desc, - c_desc, - ¶m.algo, - &heuristic_result); - PADDLE_CUBLASLT_STATUS_CHECK(cublasLtMatmulAlgoCheck); - if (status != CUBLAS_STATUS_SUCCESS) { - param.time = std::numeric_limits::max(); - return; - } - - PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(start_event, stream)); - int repeats = search_times_; - - for (int loop = 0; loop < repeats; loop++) { - status = dyl::cublasLtMatmul(handle, - matmul_desc, - alpha, - a, - a_desc, - b, - b_desc, - beta, - bias, - bias_desc, - c, - c_desc, - ¶m.algo, - param.workspace, - param.workspace_size, - stream); - if (status != CUBLAS_STATUS_SUCCESS) { - param.time = std::numeric_limits::max(); - return; - } - } - - PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(stop_event, stream)); - PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); - - float time; - PADDLE_ENFORCE_GPU_SUCCESS( - cudaEventElapsedTime(&time, start_event, stop_event)); - - param.time = time / repeats; - } - - template - cublasLtMatmulAlgo_t* CublasLtAlgoSelect(cublasLtHandle_t handle, - int m, - int n, - int k, - int batch_count, - const InT* a, - const InT* b, - const OutT* bias, - OutT* c, - void* alpha, - void* beta, - cublasLtMatmulDesc_t matmul_desc, - cublasLtMatrixLayout_t a_desc, - cublasLtMatrixLayout_t b_desc, - cublasLtMatrixLayout_t bias_desc, - cublasLtMatrixLayout_t c_desc, - cublasComputeType_t compute_type, - cudaDataType_t scale_type, - cudaDataType_t a_type, - cudaDataType_t b_type, - cudaDataType_t bias_type, - cudaDataType_t c_type, - cudaStream_t stream) { - // If we don't have config file and we donot search, here return nullptr - if (!has_config_file_ && search_times_ <= 0) { - return nullptr; - } - - // VLOG(0) << "m n k" << m << " " << n << " " << k; - - int64_t seed = 0; - std::hash hash_fn; - - HashMatmulDesc(matmul_desc, &seed, hash_fn); - HashMatrixLayoutDesc(a_desc, &seed, hash_fn); - HashMatrixLayoutDesc(b_desc, &seed, hash_fn); - 
HashMatrixLayoutDesc(bias_desc, &seed, hash_fn); - HashMatrixLayoutDesc(c_desc, &seed, hash_fn); - - cublasLtMatmulAlgo_t ret; - { - std::lock_guard lock(cache_mutex_); - auto it = map_.find(seed); - if (it != map_.end()) { - VLOG(3) << "CublasLtAlgoSelect Found in cache"; - return &(it->second); - } else { - // if we have cache but not found algo, and we don't want to search, - // here return nullptr - if (search_times_ <= 0) { - return nullptr; - } - } - } - VLOG(3) << "CublasLtAlgoSelect Not Found in cache"; - - // Get Ids - // https://docs.nvidia.com/cuda/cublas/index.html#cublasLtMatmulAlgoGetIds - cublasStatus_t status = CUBLAS_STATUS_SUCCESS; - // std::vector algo_ids(requested_algo_count_); - int algo_ids[requested_algo_count_]; // NOLINT - - int num_algo_ids; - status = dyl::cublasLtMatmulAlgoGetIds(handle, - compute_type, - scale_type, - a_type, - b_type, - bias_type, - c_type, - requested_algo_count_, - algo_ids, - &num_algo_ids); - PADDLE_CUBLASLT_STATUS_CHECK(cublasLtMatmulAlgoGetIds); - - // Traverse all posssible algo combinations - int step = 0; - int limit = 20000; - std::vector params; - - for (int idx = 0; idx < num_algo_ids; idx++) { - cublasLtMatmulAlgo_t algo; - - /* Initialize algo structure with given Algp ID */ - // https://docs.nvidia.com/cuda/cublas/index.html#cublasLtMatmulAlgoInit - status = dyl::cublasLtMatmulAlgoInit(handle, - compute_type, - scale_type, - a_type, - b_type, - bias_type, - c_type, - algo_ids[idx], - &algo); - PADDLE_CUBLASLT_STATUS_CHECK(cublasLtMatmulAlgoInit); - - // Query the tiles enums supported by that algo which is used to alloc - // enough space to store it - // https://docs.nvidia.com/cuda/cublas/index.html#cublasLtMatmulAlgoCapGetAttribute - size_t attr_size = 0; - - int batch_support; - status = dyl::cublasLtMatmulAlgoCapGetAttribute( - &algo, - CUBLASLT_ALGO_CAP_STRIDED_BATCH_SUPPORT, - &batch_support, - sizeof(batch_support), - &attr_size); - PADDLE_CUBLASLT_STATUS_CHECK(cublasLtMatmulAlgoCapGetAttribute); - if (batch_count > 1 && batch_support == 0) { - continue; - } - - status = dyl::cublasLtMatmulAlgoCapGetAttribute( - &algo, CUBLASLT_ALGO_CAP_TILE_IDS, nullptr, 0, &attr_size); - PADDLE_CUBLASLT_STATUS_CHECK(cublasLtMatmulAlgoCapGetAttribute); - - int num_tiles = static_cast(attr_size / sizeof(int)); - std::vector tiles(num_tiles == 0 ? 1 : num_tiles); - if (num_tiles == 0) { - tiles[0] = CUBLASLT_MATMUL_TILE_UNDEFINED; - num_tiles = 1; - } else { - status = - dyl::cublasLtMatmulAlgoCapGetAttribute(&algo, - CUBLASLT_ALGO_CAP_TILE_IDS, - tiles.data(), - sizeof(int) * num_tiles, - &attr_size); - PADDLE_CUBLASLT_STATUS_CHECK(cublasLtMatmulAlgoCapGetAttribute); - } - - // Query the stages enums supported by that algo (cuda must >= 11.0) - status = dyl::cublasLtMatmulAlgoCapGetAttribute( - &algo, CUBLASLT_ALGO_CAP_STAGES_IDS, nullptr, 0, &attr_size); - PADDLE_CUBLASLT_STATUS_CHECK(cublasLtMatmulAlgoCapGetAttribute); - int num_stages = static_cast(attr_size / sizeof(int)); - std::vector stages(num_stages == 0 ? 
1 : num_stages); - if (num_stages == 0) { - stages[0] = CUBLASLT_MATMUL_STAGES_UNDEFINED; - num_stages = 1; - } else { - status = - dyl::cublasLtMatmulAlgoCapGetAttribute(&algo, - CUBLASLT_ALGO_CAP_STAGES_IDS, - stages.data(), - sizeof(int) * num_stages, - &attr_size); - PADDLE_CUBLASLT_STATUS_CHECK(cublasLtMatmulAlgoCapGetAttribute); - } - - // Retrieve Other Algo Capabilities attributes - int splitk_support, red_mask, swizzling_max, custom_option_max; - status = dyl::cublasLtMatmulAlgoCapGetAttribute( - &algo, - CUBLASLT_ALGO_CAP_SPLITK_SUPPORT, - &splitk_support, - sizeof(splitk_support), - &attr_size); - status = dyl::cublasLtMatmulAlgoCapGetAttribute( - &algo, - CUBLASLT_ALGO_CAP_REDUCTION_SCHEME_MASK, - &red_mask, - sizeof(red_mask), - &attr_size); - status = dyl::cublasLtMatmulAlgoCapGetAttribute( - &algo, - CUBLASLT_ALGO_CAP_CTA_SWIZZLING_SUPPORT, - &swizzling_max, - sizeof(swizzling_max), - &attr_size); - status = dyl::cublasLtMatmulAlgoCapGetAttribute( - &algo, - CUBLASLT_ALGO_CAP_CUSTOM_OPTION_MAX, - &custom_option_max, - sizeof(custom_option_max), - &attr_size); - PADDLE_CUBLASLT_STATUS_CHECK(cublasLtMatmulAlgoCapGetAttribute); - - /* Loop over the different tiles */ - for (int tile_id = 0; tile_id < num_tiles && step < limit; tile_id++) { - /* Loop over different stages count */ - for (int stage_id = 0; stage_id < num_stages && step < limit; - stage_id++) { - /* Loop over the different custom option if any */ - for (int custom_option = 0; - custom_option <= custom_option_max && step < limit; - custom_option++) { - /* Loop over the CTAs swizzling support */ - for (int k = 0; k <= swizzling_max && step < limit; k++) { - int splir_k_trial = 0; - if (splitk_support) { - splir_k_trial += - sizeof(split_k_candidates) / sizeof(split_k_candidates[0]); - } - - for (int l = 0; (l < (1 + splir_k_trial)) && (step < limit); - l++) { - status = dyl::cublasLtMatmulAlgoConfigSetAttribute( - &algo, - CUBLASLT_ALGO_CONFIG_TILE_ID, - &tiles[tile_id], - sizeof(tiles[tile_id])); - status = dyl::cublasLtMatmulAlgoConfigSetAttribute( - &algo, - CUBLASLT_ALGO_CONFIG_STAGES_ID, - &stages[stage_id], - sizeof(stages[stage_id])); - status = dyl::cublasLtMatmulAlgoConfigSetAttribute( - &algo, - CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, - &custom_option, - sizeof(custom_option)); - status = dyl::cublasLtMatmulAlgoConfigSetAttribute( - &algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &k, sizeof(k)); - int split_k_val = 1; - int reduction_scheme = CUBLASLT_REDUCTION_SCHEME_NONE; - status = dyl::cublasLtMatmulAlgoConfigSetAttribute( - &algo, - CUBLASLT_ALGO_CONFIG_SPLITK_NUM, - &split_k_val, - sizeof(split_k_val)); - status = dyl::cublasLtMatmulAlgoConfigSetAttribute( - &algo, - CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, - &reduction_scheme, - sizeof(int)); - if (l > 0) { // Split-K case - split_k_val = split_k_candidates[l - 1]; - status = dyl::cublasLtMatmulAlgoConfigSetAttribute( - &algo, - CUBLASLT_ALGO_CONFIG_SPLITK_NUM, - &split_k_candidates[l - 1], - sizeof(split_k_candidates[l - 1])); - for (reduction_scheme = 1; - reduction_scheme < - static_cast(CUBLASLT_REDUCTION_SCHEME_MASK) && - (step < limit); - reduction_scheme = reduction_scheme << 1) { - if (reduction_scheme & red_mask) { - status = dyl::cublasLtMatmulAlgoConfigSetAttribute( - &algo, - CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, - &reduction_scheme, - sizeof(reduction_scheme)); - PADDLE_CUBLASLT_STATUS_CHECK( - cublasLtMatmulAlgoConfigSetAttribute); - - cublasLtMatmulHeuristicResult_t heurResult; - status = dyl::cublasLtMatmulAlgoCheck(handle, - matmul_desc, - 
a_desc, - b_desc, - bias_desc, - c_desc, - &algo, - &heurResult); - if (status == CUBLAS_STATUS_SUCCESS) { - size_t temp_storage_bytes = heurResult.workspaceSize; - auto d_temp_storage = phi::memory_utils::Alloc( - phi::GPUPlace( - phi::backends::gpu::GetCurrentDeviceId()), - temp_storage_bytes); - - CublasLtAlgoSelectorParam algo_select_params; - algo_select_params.algo = algo; - algo_select_params.m = m; - algo_select_params.n = n; - algo_select_params.k = k; - algo_select_params.algo_id = algo_ids[idx]; - algo_select_params.tile = tiles[tile_id]; - algo_select_params.swizzle = k; - algo_select_params.custom_option = custom_option; - algo_select_params.split_k_val = split_k_val; - algo_select_params.reduction_scheme = reduction_scheme; - algo_select_params.stages = stages[stage_id]; - algo_select_params.workspace_size = temp_storage_bytes; - algo_select_params.workspace = d_temp_storage->ptr(); - params.emplace_back(algo_select_params); - step++; - } - } // end if - } - } else { - // Prepare algos - cublasLtMatmulHeuristicResult_t heurResult; - // https://docs.nvidia.com/cuda/cublas/index.html#cublasLtMatmulAlgoCheck - status = dyl::cublasLtMatmulAlgoCheck(handle, - matmul_desc, - a_desc, - b_desc, - bias_desc, - c_desc, - &algo, - &heurResult); - if (status == CUBLAS_STATUS_SUCCESS) { - size_t temp_storage_bytes = heurResult.workspaceSize; - auto d_temp_storage = phi::memory_utils::Alloc( - phi::GPUPlace(backends::gpu::GetCurrentDeviceId()), - temp_storage_bytes); - CublasLtAlgoSelectorParam algo_select_params; - algo_select_params.algo = algo; - algo_select_params.m = m; - algo_select_params.n = n; - algo_select_params.k = k; - algo_select_params.algo_id = algo_ids[idx]; - algo_select_params.tile = tiles[tile_id]; - algo_select_params.swizzle = k; - algo_select_params.custom_option = custom_option; - algo_select_params.split_k_val = split_k_val; - algo_select_params.reduction_scheme = reduction_scheme; - algo_select_params.stages = stages[stage_id]; - algo_select_params.workspace_size = temp_storage_bytes; - algo_select_params.workspace = d_temp_storage->ptr(); - params.emplace_back(algo_select_params); - step++; - } - } - } - } - } - } - } - } - cudaEvent_t start_event; - cudaEvent_t stop_event; - - PADDLE_ENFORCE_GPU_SUCCESS(cudaEventCreate(&start_event)); - PADDLE_ENFORCE_GPU_SUCCESS(cudaEventCreate(&stop_event)); - - if (step == 0) { - VLOG(3) << "No algo can be used"; - return nullptr; - } - - VLOG(3) << "CublasLtAlgoSelect Start testRun " << step << " " - << params.size(); - - for (int i = 0; i < step; i++) { - TestMatmulRun(handle, - matmul_desc, - a_desc, - b_desc, - bias_desc, - c_desc, - alpha, - beta, - a, - b, - bias, - c, - params[i], - start_event, - stop_event, - stream); - } - std::sort(params.begin(), params.end(), compare_algo_time); - - int res_id = 0; - while (params[res_id].time == 0) res_id++; - - if (res_id >= params.size()) { - VLOG(3) << "No algo can be used"; - return nullptr; - } - - VLOG(3) << "algo selected"; - - ret = params[res_id].algo; - std::lock_guard lock(cache_mutex_); - auto& algo_in_map = map_[seed]; - algo_in_map = ret; - return &algo_in_map; - } - - // Serialize map_ to cache file - void serialize_algo_cache_file() { - if (search_times_ > 0) { - int dev; - cudaGetDevice(&dev); - if (dev == 0) { - std::ofstream outfile; - outfile.open(config_filename_, std::ios::out | std::ios::trunc); - outfile << dyl::cublasLtGetCudartVersion() << std::endl; - - for (const auto& p : map_) { - outfile << p.first << " "; - for (int i = 0; i < 8; ++i) { - outfile 
<< p.second.data[i] << " "; - } - outfile << std::endl; - } - outfile.close(); - } - } - } - ~CublasLtAlgoCache() { serialize_algo_cache_file(); } - - private: - explicit CublasLtAlgoCache(int search_times) - : search_times_(search_times), has_config_file_(true) { - // Init map_ from cache file - std::ifstream infile; - infile.open(config_filename_); - if (!infile.is_open()) { - has_config_file_ = false; - VLOG(3) << "No CublasLtAlgoCache file found"; - return; - } - size_t cublaslt_version, real_cublaslt_version; - int64_t seed = 0; - uint64_t algo_data[8]; - infile >> cublaslt_version; - VLOG(1) << "cublaslt_version " << cublaslt_version; - - if (dyl::cublasLtGetCudartVersion() != cublaslt_version) { - LOG(INFO) << config_filename_ - << " is not compatible with current cublaslt_version " - << real_cublaslt_version; - return; - } - - while (!infile.eof()) { - infile >> seed >> algo_data[0] >> algo_data[1] >> algo_data[2] >> - algo_data[3] >> algo_data[4] >> algo_data[5] >> algo_data[6] >> - algo_data[7]; - - for (int i = 0; i < 8; ++i) { - map_[seed].data[i] = algo_data[i]; - } - } - infile.close(); - } - - std::string config_filename_{"./paddle_cublaslt_cache"}; - std::unordered_map map_; - int search_times_; - const int requested_algo_count_ = 100; - std::mutex cache_mutex_; - bool has_config_file_; - - inline int64_t RoundToNextHighPowOfTwo(int64_t n, int64_t min_val) { - n--; - n |= (n >> 1); - n |= (n >> 2); - n |= (n >> 4); - n |= (n >> 8); - n |= (n >> 16); - return std::max(min_val, (n + 1)); - } - - void HashMatmulDesc(cublasLtMatmulDesc_t desc, - int64_t* seed, - const std::hash& hash_fn) { - size_t size_to_write; - int trans_a, trans_b; - uint32_t epilogue; - // int8_t fast_accum; - - PADDLE_ENFORCE_GPU_SUCCESS( - dyl::cublasLtMatmulDescGetAttribute(desc, - CUBLASLT_MATMUL_DESC_TRANSA, - &trans_a, - sizeof(trans_a), - &size_to_write)); - HashValue(seed, hash_fn, static_cast(trans_a)); - - PADDLE_ENFORCE_GPU_SUCCESS( - dyl::cublasLtMatmulDescGetAttribute(desc, - CUBLASLT_MATMUL_DESC_TRANSB, - &trans_b, - sizeof(trans_b), - &size_to_write)); - HashValue(seed, hash_fn, static_cast(trans_b)); - - PADDLE_ENFORCE_GPU_SUCCESS( - dyl::cublasLtMatmulDescGetAttribute(desc, - CUBLASLT_MATMUL_DESC_EPILOGUE, - &epilogue, - sizeof(epilogue), - &size_to_write)); - HashValue(seed, hash_fn, static_cast(epilogue)); - - // PADDLE_ENFORCE_GPU_SUCCESS( - // dyl::cublasLtMatmulDescGetAttribute(desc, - // CUBLASLT_MATMUL_DESC_FAST_ACCUM, - // &fast_accum, - // sizeof(fast_accum), - // &size_to_write)); - // HashValue(seed, hash_fn, static_cast(fast_accum)); - } - - void HashMatrixLayoutDesc(cublasLtMatrixLayout_t desc, - int64_t* seed, - const std::hash& hash_fn) { - size_t size_to_write; - uint32_t dtype; - int32_t batch; - uint64_t row, col; - int64_t ld, batch_offset; - - PADDLE_ENFORCE_GPU_SUCCESS( - dyl::cublasLtMatrixLayoutGetAttribute(desc, - CUBLASLT_MATRIX_LAYOUT_TYPE, - &dtype, - sizeof(dtype), - &size_to_write)); - HashValue(seed, hash_fn, static_cast(dtype)); - - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutGetAttribute( - desc, - CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, - &batch, - sizeof(batch), - &size_to_write)); - HashValue(seed, hash_fn, static_cast(batch)); - - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutGetAttribute( - desc, CUBLASLT_MATRIX_LAYOUT_ROWS, &row, sizeof(row), &size_to_write)); - HashValue(seed, hash_fn, RoundToNextHighPowOfTwo(row, 32)); - - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutGetAttribute( - desc, CUBLASLT_MATRIX_LAYOUT_COLS, &col, 
sizeof(col), &size_to_write)); - HashValue(seed, hash_fn, RoundToNextHighPowOfTwo(col, 32)); - - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutGetAttribute( - desc, CUBLASLT_MATRIX_LAYOUT_LD, &ld, sizeof(ld), &size_to_write)); - HashValue(seed, hash_fn, RoundToNextHighPowOfTwo(ld, 32)); - - // PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutGetAttribute( - // desc, CUBLASLT_MATRIX_LAYOUT_ROWS, &row, sizeof(row), - // &size_to_write)); - // HashValue(seed, hash_fn, row); - - // PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutGetAttribute( - // desc, CUBLASLT_MATRIX_LAYOUT_COLS, &col, sizeof(col), - // &size_to_write)); - // HashValue(seed, hash_fn, col); - - // PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutGetAttribute( - // desc, CUBLASLT_MATRIX_LAYOUT_LD, &ld, sizeof(ld), &size_to_write)); - // HashValue(seed, hash_fn, ld); - - PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutGetAttribute( - desc, - CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, - &batch_offset, - sizeof(batch_offset), - &size_to_write)); - HashValue(seed, hash_fn, static_cast(batch_offset)); - } - - void HashValue(int64_t* seed, - const std::hash& hash_fn, - int64_t value) { - *seed ^= hash_fn(value) + 0x9e3779b9 + (*seed << 6) + (*seed >> 2); - } -}; - template inline cudaDataType_t GetCublasLtDataType() { return CUDA_R_32F; @@ -857,30 +198,31 @@ void CublasLtMatmulFP8(const phi::GPUContext& dev_ctx, PADDLE_CUBLASLT_STATUS_CHECK(cublasLtMatmulDescSetAttribute); } - cublasLtMatmulAlgo_t* algo = CublasLtAlgoCache::Instance().CublasLtAlgoSelect( - dev_ctx.cublaslt_handle(), - m, - n, - k, - batch_count, - mat_b.data(), - mat_a.data(), - bias_ptr, - out->data(), - &alpha_, - &beta_, - matmul_desc_, - B_desc_, - A_desc_, - Bias_desc_, - C_desc_, - CUBLAS_COMPUTE_32F, - CUDA_R_32F, - B_type, - A_type, - Bias_type, - C_type, - dev_ctx.stream()); + cublasLtMatmulAlgo_t* algo = + funcs::cublaslt_internal::CublasLtAlgoCache::Instance() + .CublasLtAlgoSelect(dev_ctx.cublaslt_handle(), + m, + n, + k, + batch_count, + mat_b.data(), + mat_a.data(), + bias_ptr, + out->data(), + &alpha_, + &beta_, + matmul_desc_, + B_desc_, + A_desc_, + Bias_desc_, + C_desc_, + CUBLAS_COMPUTE_32F, + CUDA_R_32F, + B_type, + A_type, + Bias_type, + C_type, + dev_ctx.stream()); if (algo == nullptr) { int returnedResults = 0; From 239bf7b94f0b3dd84ecb995732636873555c7a2a Mon Sep 17 00:00:00 2001 From: co63oc Date: Thu, 4 Jul 2024 15:20:08 +0800 Subject: [PATCH 10/16] Clean some tests (#65663) * Fix * Fix * Fix * Fix * ci --- test/deprecated/legacy_test/test_crop_op.py | 160 ----- test/legacy_test/CMakeLists.txt | 15 - test/legacy_test/test_bicubic_interp_op.py | 518 --------------- test/legacy_test/test_bilinear_interp_op.py | 520 --------------- .../legacy_test/test_generate_proposals_op.py | 452 ------------- .../test_generate_proposals_v2_op.py | 168 ++++- test/legacy_test/test_linear_interp_op.py | 381 ----------- test/legacy_test/test_lookup_table_op.py | 437 ------------- test/legacy_test/test_matmul_op.py | 249 ------- test/legacy_test/test_nearest_interp_op.py | 471 -------------- test/legacy_test/test_trilinear_interp_op.py | 613 ------------------ test/mkldnn/test_bilinear_interp_mkldnn_op.py | 204 ------ test/mkldnn/test_matmul_mkldnn_op.py | 260 -------- test/mkldnn/test_nearest_interp_mkldnn_op.py | 203 ------ test/xpu/CMakeLists.txt | 1 - test/xpu/test_bilinear_interp_op_xpu.py | 508 --------------- test/xpu/test_matmul_op_xpu.py | 387 ----------- test/xpu/test_nearest_interp_op_xpu.py | 441 ------------- 18 files changed, 
167 insertions(+), 5821 deletions(-) delete mode 100644 test/deprecated/legacy_test/test_crop_op.py delete mode 100644 test/legacy_test/test_bicubic_interp_op.py delete mode 100755 test/legacy_test/test_bilinear_interp_op.py delete mode 100644 test/legacy_test/test_generate_proposals_op.py delete mode 100755 test/legacy_test/test_linear_interp_op.py delete mode 100644 test/legacy_test/test_lookup_table_op.py delete mode 100644 test/legacy_test/test_matmul_op.py delete mode 100755 test/legacy_test/test_nearest_interp_op.py delete mode 100755 test/legacy_test/test_trilinear_interp_op.py delete mode 100644 test/mkldnn/test_bilinear_interp_mkldnn_op.py delete mode 100644 test/mkldnn/test_matmul_mkldnn_op.py delete mode 100644 test/mkldnn/test_nearest_interp_mkldnn_op.py delete mode 100755 test/xpu/test_bilinear_interp_op_xpu.py delete mode 100644 test/xpu/test_matmul_op_xpu.py delete mode 100644 test/xpu/test_nearest_interp_op_xpu.py diff --git a/test/deprecated/legacy_test/test_crop_op.py b/test/deprecated/legacy_test/test_crop_op.py deleted file mode 100644 index 858fd89fc7e998..00000000000000 --- a/test/deprecated/legacy_test/test_crop_op.py +++ /dev/null @@ -1,160 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import numpy as np -from op_test import OpTest - -import paddle - - -def crop(data, offsets, crop_shape): - def indexOf(shape, index): - result = [] - for dim in reversed(shape): - result.append(index % dim) - index = index / dim - return result[::-1] - - result = [] - for i, value in enumerate(data.flatten()): - index = indexOf(data.shape, i) - selected = True - if len(index) == len(offsets): - for j, offset in enumerate(offsets): - selected = ( - selected - and index[j] >= offset - and index[j] < crop_shape[j] + offset - ) - if selected: - result.append(value) - return np.array(result).reshape(crop_shape) - - -class TestCropOp(OpTest): - def setUp(self): - self.op_type = "crop" - self.crop_by_input = False - self.offset_by_input = False - self.attrs = {} - self.initTestCase() - if self.crop_by_input: - self.inputs = { - 'X': np.random.random(self.x_shape).astype("float64"), - 'Y': np.random.random(self.crop_shape).astype("float64"), - } - else: - self.attrs['shape'] = self.crop_shape - self.inputs = { - 'X': np.random.random(self.x_shape).astype("float64"), - } - if self.offset_by_input: - self.inputs['Offsets'] = np.array(self.offsets).astype('int32') - else: - self.attrs['offsets'] = self.offsets - if self.offsets is None: - self.offsets = [0] * len(self.crop_shape) - if self.crop_shape is None: - self.crop_shape = self.x_shape - - self.outputs = { - 'Out': crop(self.inputs['X'], self.offsets, self.crop_shape) - } - - def initTestCase(self): - self.x_shape = (10, 10) - self.crop_shape = (2, 2) - self.offsets = [1, 2] - - def test_check_output(self): - self.check_output() - - def test_check_grad_normal(self): - self.check_grad(['X'], 'Out') - - -class TestCase1(TestCropOp): - def initTestCase(self): - self.x_shape = (16, 8, 32) - self.crop_shape = [2, 2, 3] - self.offsets = [1, 5, 3] - - -class TestCase2(TestCropOp): - def initTestCase(self): - self.x_shape = (15, 8) - self.crop_shape = [15, 8] - self.offsets = [0, 0] - - -class TestCase3(TestCropOp): - def initTestCase(self): - self.x_shape = (4, 8, 16) - self.crop_shape = [2, 2, 3] - self.offsets = [1, 5, 3] - self.crop_by_input = True - - -class TestCase4(TestCropOp): - def initTestCase(self): - self.x_shape = (10, 10) - self.crop_shape = [10, 10] - self.offsets = [0, 0] - self.crop_by_input = True - - -class TestCase5(TestCropOp): - def initTestCase(self): - self.x_shape = (3, 4, 10) - self.crop_shape = [2, 2, 3] - self.offsets = [1, 0, 2] - self.offset_by_input = True - - -class TestCase6(TestCropOp): - def initTestCase(self): - self.x_shape = (10, 9, 14) - self.crop_shape = [3, 3, 5] - self.offsets = [3, 5, 4] - self.crop_by_input = True - self.offset_by_input = True - - -class TestCropNoneOffset(unittest.TestCase): - def test_crop_none_offset(self): - x = paddle.static.data(name="input1", shape=[3, 6, 6], dtype="float32") - crop_shape = [2, 2, 2] - crop = paddle.crop(x, crop_shape, None) - self.assertEqual(crop.shape, (2, 2, 2)) - - -class TestCropNoneShape(unittest.TestCase): - def test_crop_none_shape(self): - x = paddle.static.data(name="input1", shape=[3, 6, 6], dtype="float32") - crop = paddle.crop(x) - self.assertEqual(crop.shape, (3, 6, 6)) - - -class TestCropError(unittest.TestCase): - def test_neg_offset_error(self): - with self.assertRaises(ValueError): - x = paddle.static.data(name='input2', shape=[1], dtype="float32") - out = paddle.crop(x, offsets=[-1]) - - -if __name__ == '__main__': - paddle.enable_static() - unittest.main() diff --git a/test/legacy_test/CMakeLists.txt 
b/test/legacy_test/CMakeLists.txt index e12f367f355218..475099f3b02e75 100644 --- a/test/legacy_test/CMakeLists.txt +++ b/test/legacy_test/CMakeLists.txt @@ -439,8 +439,6 @@ list(REMOVE_ITEM TEST_OPS test_basic_lstm_unit_op) list(REMOVE_ITEM TEST_OPS test_fuse_bn_add_act_pass) list(REMOVE_ITEM TEST_OPS test_conv3d_transpose_op) list(REMOVE_ITEM TEST_OPS test_layers) -list(REMOVE_ITEM TEST_OPS test_bilinear_interp_op) -list(REMOVE_ITEM TEST_OPS test_nearest_interp_op) # disable this unittest temporarily list(REMOVE_ITEM TEST_OPS test_imperative_data_loader_exception) @@ -493,7 +491,6 @@ set(TEST_OPS_WITH_GC test_scatter_op test_concat_op test_elementwise_add_op - test_lookup_table_op test_elementwise_sub_op test_gather_op test_mean_op @@ -579,11 +576,6 @@ if((WITH_GPU) AND (WITH_CUDNN_FRONTEND)) test_fused_dot_product_attention_op_static) endif() -py_test_modules(test_bilinear_interp_op MODULES test_bilinear_interp_op ENVS - ${GC_ENVS}) -py_test_modules(test_nearest_interp_op MODULES test_nearest_interp_op ENVS - ${GC_ENVS}) - set_tests_properties(test_conv2d_op PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") set_tests_properties(test_norm_nn_grad PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") set_tests_properties(test_nn_grad PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") @@ -1043,7 +1035,6 @@ set(STATIC_BUILD_TESTS test_nce test_layer_norm_op test_eigh_op - test_matmul_op test_matmul_v2_op test_paddle_save_load_binary test_assign_pos_op @@ -1153,8 +1144,6 @@ set_tests_properties(test_radam_op PROPERTIES TIMEOUT 100) set_tests_properties(test_nan_inf PROPERTIES TIMEOUT 120) set_tests_properties(test_linalg_cholesky_inverse PROPERTIES TIMEOUT 100) set_tests_properties(test_sparse_mask_as_op PROPERTIES TIMEOUT 120) -set_tests_properties(test_bicubic_interp_op PROPERTIES TIMEOUT 120) -set_tests_properties(test_bilinear_interp_op PROPERTIES TIMEOUT 120) set_tests_properties(test_conv2d_op_depthwise_conv PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") set_tests_properties(test_conv2d_op_depthwise_conv PROPERTIES TIMEOUT 120) @@ -1173,9 +1162,7 @@ set_tests_properties(test_imperative_star_gan_with_gradient_penalty set_tests_properties(test_index_add_op PROPERTIES TIMEOUT 120) set_tests_properties(test_index_select_op PROPERTIES TIMEOUT 120) set_tests_properties(test_lstm_op PROPERTIES TIMEOUT 120) -set_tests_properties(test_matmul_op PROPERTIES TIMEOUT 120) set_tests_properties(test_matmul_v2_op PROPERTIES TIMEOUT 120) -set_tests_properties(test_nearest_interp_op PROPERTIES TIMEOUT 120) set_tests_properties(test_nearest_interp_v2_op PROPERTIES TIMEOUT 120) set_tests_properties(test_pad3d_op PROPERTIES TIMEOUT 120) set_tests_properties(test_paddle_save_load_binary PROPERTIES TIMEOUT 120) @@ -1190,9 +1177,7 @@ else() endif() set_tests_properties(test_svd_op PROPERTIES TIMEOUT 80) set_tests_properties(test_static_save_load PROPERTIES TIMEOUT 250) -set_tests_properties(test_trilinear_interp_op PROPERTIES TIMEOUT 120) set_tests_properties(test_trilinear_interp_v2_op PROPERTIES TIMEOUT 120) -set_tests_properties(test_matmul_op_static_build PROPERTIES TIMEOUT 120) set_tests_properties(test_matmul_v2_op_static_build PROPERTIES TIMEOUT 120) set_tests_properties(test_paddle_save_load_binary_static_build PROPERTIES TIMEOUT 120) diff --git a/test/legacy_test/test_bicubic_interp_op.py b/test/legacy_test/test_bicubic_interp_op.py deleted file mode 100644 index d9c68bd4c09bf4..00000000000000 --- a/test/legacy_test/test_bicubic_interp_op.py +++ /dev/null @@ -1,518 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -from op_test import OpTest - -import paddle -from paddle import base -from paddle.nn.functional import interpolate -from paddle.pir_utils import test_with_pir_api - - -def cubic_1(x, a): - return ((a + 2) * x - (a + 3)) * x * x + 1 - - -def cubic_2(x, a): - return ((a * x - 5 * a) * x + 8 * a) * x - 4 * a - - -def cubic_interp1d(x0, x1, x2, x3, t): - param = [0, 0, 0, 0] - a = -0.75 - x_1 = t - x_2 = 1.0 - t - param[0] = cubic_2(x_1 + 1.0, a) - param[1] = cubic_1(x_1, a) - param[2] = cubic_1(x_2, a) - param[3] = cubic_2(x_2 + 1.0, a) - return x0 * param[0] + x1 * param[1] + x2 * param[2] + x3 * param[3] - - -def value_bound(input, w, h, x, y): - access_x = int(max(min(x, w - 1), 0)) - access_y = int(max(min(y, h - 1), 0)) - return input[:, :, access_y, access_x] - - -def bicubic_interp_np( - input, - out_h, - out_w, - out_size=None, - actual_shape=None, - align_corners=True, - data_layout='kNCHW', -): - """trilinear interpolation implement in shape [N, C, H, W]""" - if data_layout == "NHWC": - input = np.transpose(input, (0, 3, 1, 2)) # NHWC => NCHW - if out_size is not None: - out_h = out_size[0] - out_w = out_size[1] - if actual_shape is not None: - out_h = actual_shape[0] - out_w = actual_shape[1] - batch_size, channel, in_h, in_w = input.shape - - ratio_h = ratio_w = 0.0 - if out_h > 1: - if align_corners: - ratio_h = (in_h - 1.0) / (out_h - 1.0) - else: - ratio_h = 1.0 * in_h / out_h - - if out_w > 1: - if align_corners: - ratio_w = (in_w - 1.0) / (out_w - 1.0) - else: - ratio_w = 1.0 * in_w / out_w - - out = np.zeros((batch_size, channel, out_h, out_w)) - - for k in range(out_h): - if align_corners: - h = ratio_h * k - else: - h = ratio_h * (k + 0.5) - 0.5 - input_y = np.floor(h) - y_t = h - input_y - for l in range(out_w): - if align_corners: - w = ratio_w * l - else: - w = ratio_w * (l + 0.5) - 0.5 - input_x = np.floor(w) - x_t = w - input_x - for i in range(batch_size): - for j in range(channel): - coefficients = [0, 0, 0, 0] - for ii in range(4): - access_x_0 = int(max(min(input_x - 1, in_w - 1), 0)) - access_x_1 = int(max(min(input_x + 0, in_w - 1), 0)) - access_x_2 = int(max(min(input_x + 1, in_w - 1), 0)) - access_x_3 = int(max(min(input_x + 2, in_w - 1), 0)) - access_y = int(max(min(input_y - 1 + ii, in_h - 1), 0)) - - coefficients[ii] = cubic_interp1d( - input[i, j, access_y, access_x_0], - input[i, j, access_y, access_x_1], - input[i, j, access_y, access_x_2], - input[i, j, access_y, access_x_3], - x_t, - ) - out[i, j, k, l] = cubic_interp1d( - coefficients[0], - coefficients[1], - coefficients[2], - coefficients[3], - y_t, - ) - if data_layout == "NHWC": - out = np.transpose(out, (0, 2, 3, 1)) # NCHW => NHWC - return out.astype(input.dtype) - - -class TestBicubicInterpOp(OpTest): - def setUp(self): - self.out_size = None - self.actual_shape = None - self.data_layout = 'NCHW' - self.init_test_case() - self.op_type = "bicubic_interp" - # NOTE(dev): some AsDispensible input is not used under 
imperative mode. - input_np = np.random.random(self.input_shape).astype("float64") - - if self.data_layout == "NCHW": - in_h = self.input_shape[2] - in_w = self.input_shape[3] - else: - in_h = self.input_shape[1] - in_w = self.input_shape[2] - - if self.scale > 0: - out_h = int(in_h * self.scale) - out_w = int(in_w * self.scale) - else: - out_h = self.out_h - out_w = self.out_w - - output_np = bicubic_interp_np( - input_np, - out_h, - out_w, - self.out_size, - self.actual_shape, - self.align_corners, - self.data_layout, - ) - self.inputs = {'X': input_np} - if self.out_size is not None: - self.inputs['OutSize'] = self.out_size - if self.actual_shape is not None: - self.inputs['OutSize'] = self.actual_shape - - self.attrs = { - 'out_h': self.out_h, - 'out_w': self.out_w, - 'scale': self.scale, - 'interp_method': self.interp_method, - 'align_corners': self.align_corners, - 'data_layout': self.data_layout, - } - self.outputs = {'Out': output_np} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['X'], 'Out', in_place=True) - - def init_test_case(self): - self.interp_method = 'bicubic' - self.input_shape = [2, 3, 5, 5] - self.out_h = 2 - self.out_w = 2 - self.scale = 0.0 - self.out_size = np.array([3, 3]).astype("int32") - self.align_corners = True - - -class TestBicubicInterpCase1(TestBicubicInterpOp): - def init_test_case(self): - self.interp_method = 'bicubic' - self.input_shape = [4, 1, 7, 8] - self.out_h = 1 - self.out_w = 1 - self.scale = 0.0 - self.align_corners = True - - -class TestBicubicInterpCase2(TestBicubicInterpOp): - def init_test_case(self): - self.interp_method = 'bicubic' - self.input_shape = [3, 3, 9, 6] - self.out_h = 10 - self.out_w = 8 - self.scale = 0.0 - self.align_corners = True - - -class TestBicubicInterpCase3(TestBicubicInterpOp): - def init_test_case(self): - self.interp_method = 'bicubic' - self.input_shape = [1, 1, 32, 64] - self.out_h = 64 - self.out_w = 32 - self.scale = 0.0 - self.align_corners = False - - -class TestBicubicInterpCase4(TestBicubicInterpOp): - def init_test_case(self): - self.interp_method = 'bicubic' - self.input_shape = [4, 1, 7, 8] - self.out_h = 1 - self.out_w = 1 - self.scale = 0.0 - self.out_size = np.array([2, 2]).astype("int32") - self.align_corners = True - - -class TestBicubicInterpCase5(TestBicubicInterpOp): - def init_test_case(self): - self.interp_method = 'bicubic' - self.input_shape = [3, 3, 9, 6] - self.out_h = 11 - self.out_w = 11 - self.scale = 0.0 - self.out_size = np.array([6, 4]).astype("int32") - self.align_corners = False - - -class TestBicubicInterpCase6(TestBicubicInterpOp): - def init_test_case(self): - self.interp_method = 'bicubic' - self.input_shape = [1, 1, 32, 64] - self.out_h = 64 - self.out_w = 32 - self.scale = 0 - self.out_size = np.array([64, 32]).astype("int32") - self.align_corners = False - - -class TestBicubicInterpSame(TestBicubicInterpOp): - def init_test_case(self): - self.interp_method = 'bicubic' - self.input_shape = [2, 3, 32, 64] - self.out_h = 32 - self.out_w = 64 - self.scale = 0.0 - self.align_corners = True - - -class TestBicubicInterpDataLayout(TestBicubicInterpOp): - def init_test_case(self): - self.interp_method = 'bicubic' - self.input_shape = [2, 5, 5, 3] - self.out_h = 2 - self.out_w = 2 - self.scale = 0.0 - self.out_size = np.array([3, 3]).astype("int32") - self.align_corners = True - self.data_layout = "NHWC" - - -class TestBicubicInterpOpAPI(unittest.TestCase): - def test_case(self): - np.random.seed(200) - x_data = 
np.random.random((2, 3, 6, 6)).astype("float32") - dim_data = np.array([12]).astype("int32") - shape_data = np.array([12, 12]).astype("int32") - actual_size_data = np.array([12, 12]).astype("int32") - scale_data = np.array([2.0]).astype("float32") - - prog = base.Program() - startup_prog = base.Program() - place = ( - base.CUDAPlace(0) - if base.core.is_compiled_with_cuda() - else base.CPUPlace() - ) - - with base.program_guard(prog, startup_prog): - x = paddle.static.data( - name="x", shape=[2, 3, 6, 6], dtype="float32" - ) - - dim = paddle.static.data(name="dim", shape=[1], dtype="int32") - shape_tensor = paddle.static.data( - name="shape_tensor", shape=[2], dtype="int32" - ) - actual_size = paddle.static.data( - name="actual_size", shape=[2], dtype="int32" - ) - scale_tensor = paddle.static.data( - name="scale_tensor", shape=[1], dtype="float32" - ) - - out1 = interpolate( - x, size=[12, 12], mode='bicubic', align_corners=False - ) - out2 = interpolate( - x, size=[12, dim], mode='bicubic', align_corners=False - ) - out3 = interpolate( - x, size=shape_tensor, mode='bicubic', align_corners=False - ) - out4 = interpolate( - x, size=[12, 12], mode='bicubic', align_corners=False - ) - out5 = interpolate( - x, - scale_factor=scale_tensor, - mode='bicubic', - align_corners=False, - ) - - exe = base.Executor(place) - exe.run(base.default_startup_program()) - results = exe.run( - base.default_main_program(), - feed={ - "x": x_data, - "dim": dim_data, - "shape_tensor": shape_data, - "actual_size": actual_size_data, - "scale_tensor": scale_data, - }, - fetch_list=[out1, out2, out3, out4, out5], - return_numpy=True, - ) - - expect_res = bicubic_interp_np( - x_data, out_h=12, out_w=12, align_corners=False - ) - for res in results: - np.testing.assert_allclose(res, expect_res, rtol=1e-05) - - with base.dygraph.guard(): - x = paddle.to_tensor(x_data) - interp = interpolate( - x, size=[12, 12], mode='bicubic', align_corners=False - ) - dy_result = interp.numpy() - expect = bicubic_interp_np( - x_data, out_h=12, out_w=12, align_corners=False - ) - np.testing.assert_allclose(dy_result, expect, rtol=1e-05) - - -class TestBicubicOpError(unittest.TestCase): - @test_with_pir_api - def test_errors(self): - with paddle.static.program_guard( - paddle.static.Program(), paddle.static.Program() - ): - # the input of interpoalte must be Variable. 
- x1 = base.create_lod_tensor( - np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], base.CPUPlace() - ) - self.assertRaises(TypeError, interpolate, x1) - - def test_mode_type(): - # mode must be "BILINEAR" "TRILINEAR" "NEAREST" "BICUBIC" - x = paddle.static.data( - name="x", shape=[2, 3, 6, 6], dtype="float32" - ) - - out = interpolate( - x, size=[12, 12], mode='UNKONWN', align_corners=False - ) - - def test_input_shape(): - x = paddle.static.data(name="x", shape=[2], dtype="float32") - out = interpolate( - x, size=[12, 12], mode='BICUBIC', align_corners=False - ) - - def test_size_shape(): - x = paddle.static.data( - name="x", shape=[2, 3, 6, 6], dtype="float32" - ) - out = interpolate( - x, size=[12], mode='BICUBIC', align_corners=False - ) - - def test_align_corcers(): - x = paddle.static.data( - name="x", shape=[2, 3, 6, 6], dtype="float32" - ) - interpolate(x, size=[12, 12], mode='BICUBIC', align_corners=3) - - def test_out_shape(): - x = paddle.static.data( - name="x", shape=[2, 3, 6, 6], dtype="float32" - ) - out = interpolate( - x, size=[12], mode='bicubic', align_corners=False - ) - - def test_attr_data_format(): - # for 5-D input, data_format only can be NCDHW or NDHWC - input = paddle.static.data( - name="input", shape=[2, 3, 6, 9, 4], dtype="float32" - ) - out = interpolate( - input, - size=[4, 8, 4, 5], - mode='trilinear', - data_format='NHWC', - ) - - def test_actual_shape(): - # the actual_shape must be Variable. - x = base.create_lod_tensor( - np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], base.CPUPlace() - ) - out = interpolate( - x, size=[12, 12], mode='BICUBIC', align_corners=False - ) - - def test_scale_value(): - # the scale must be greater than zero. - x = paddle.static.data( - name="x", shape=[2, 3, 6, 6], dtype="float32" - ) - out = interpolate( - x, - size=None, - mode='BICUBIC', - align_corners=False, - scale_factor=-2.0, - ) - - def test_attr_5D_input(): - # for 5-D input, data_format only can be NCDHW or NDHWC - input = paddle.static.data( - name="input", shape=[2, 3, 6, 9, 4], dtype="float32" - ) - out = interpolate( - input, - size=[4, 8, 4, 5], - mode='trilinear', - data_format='NDHWC', - ) - - def test_scale_type(): - # the scale must be greater than zero. 
- x = paddle.static.data( - name="x", shape=[2, 3, 6, 6], dtype="float32" - ) - scale = base.create_lod_tensor( - np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], base.CPUPlace() - ) - out = interpolate( - x, - size=None, - mode='bicubic', - align_corners=False, - scale_factor=scale, - ) - - def test_align_mode(): - x = paddle.static.data( - name="x", shape=[2, 3, 6, 6], dtype="float32" - ) - out = interpolate( - x, - size=None, - mode='nearest', - align_corners=False, - align_mode=2, - scale_factor=1.0, - ) - - def test_outshape_and_scale(): - x = paddle.static.data( - name="x", shape=[2, 3, 6, 6], dtype="float32" - ) - out = interpolate( - x, - size=None, - mode='bicubic', - align_corners=False, - scale_factor=None, - ) - - self.assertRaises(ValueError, test_mode_type) - self.assertRaises(ValueError, test_input_shape) - self.assertRaises(ValueError, test_size_shape) - self.assertRaises(TypeError, test_align_corcers) - self.assertRaises(ValueError, test_attr_data_format) - self.assertRaises(TypeError, test_actual_shape) - self.assertRaises(ValueError, test_scale_value) - self.assertRaises(ValueError, test_out_shape) - self.assertRaises(ValueError, test_attr_5D_input) - self.assertRaises(TypeError, test_scale_type) - self.assertRaises(ValueError, test_align_mode) - self.assertRaises(ValueError, test_outshape_and_scale) - - -if __name__ == "__main__": - paddle.enable_static() - unittest.main() diff --git a/test/legacy_test/test_bilinear_interp_op.py b/test/legacy_test/test_bilinear_interp_op.py deleted file mode 100755 index 9409762d881c00..00000000000000 --- a/test/legacy_test/test_bilinear_interp_op.py +++ /dev/null @@ -1,520 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import numpy as np -from op_test import OpTest - -import paddle -from paddle.base import core - -paddle.enable_static() - - -def bilinear_interp_np( - input, - out_h, - out_w, - out_size=None, - actual_shape=None, - align_corners=True, - align_mode=0, - data_layout='NCHW', -): - """bilinear interpolation implement in shape [N, C, H, W]""" - if data_layout == "NHWC": - input = np.transpose(input, (0, 3, 1, 2)) # NHWC => NCHW - if out_size is not None: - out_h = out_size[0] - out_w = out_size[1] - if actual_shape is not None: - out_h = actual_shape[0] - out_w = actual_shape[1] - batch_size, channel, in_h, in_w = input.shape - - ratio_h = ratio_w = 0.0 - if out_h > 1: - if align_corners: - ratio_h = (in_h - 1.0) / (out_h - 1.0) - else: - ratio_h = 1.0 * in_h / out_h - if out_w > 1: - if align_corners: - ratio_w = (in_w - 1.0) / (out_w - 1.0) - else: - ratio_w = 1.0 * in_w / out_w - - out = np.zeros((batch_size, channel, out_h, out_w)) - - for i in range(out_h): - if align_mode == 0 and not align_corners: - h = int(ratio_h * (i + 0.5) - 0.5) - else: - h = int(ratio_h * i) - - h = max(0, h) - hid = 1 if h < in_h - 1 else 0 - if align_mode == 0 and not align_corners: - idx_src_h = max(ratio_h * (i + 0.5) - 0.5, 0) - h1lambda = idx_src_h - h - else: - h1lambda = ratio_h * i - h - h2lambda = 1.0 - h1lambda - for j in range(out_w): - if align_mode == 0 and not align_corners: - w = int(ratio_w * (j + 0.5) - 0.5) - else: - w = int(ratio_w * j) - w = max(0, w) - wid = 1 if w < in_w - 1 else 0 - if align_mode == 0 and not align_corners: - idx_src_w = max(ratio_w * (j + 0.5) - 0.5, 0) - w1lambda = idx_src_w - w - else: - w1lambda = ratio_w * j - w - w2lambda = 1.0 - w1lambda - - out[:, :, i, j] = h2lambda * ( - w2lambda * input[:, :, h, w] - + w1lambda * input[:, :, h, w + wid] - ) + h1lambda * ( - w2lambda * input[:, :, h + hid, w] - + w1lambda * input[:, :, h + hid, w + wid] - ) - - if data_layout == "NHWC": - out = np.transpose(out, (0, 2, 3, 1)) # NCHW => NHWC - - return out.astype(input.dtype) - - -class TestBilinearInterpOp(OpTest): - def setUp(self): - self.out_size = None - self.actual_shape = None - self.data_layout = 'NCHW' - self.init_test_case() - self.op_type = "bilinear_interp" - # NOTE(dev): some AsDispensible input is not used under imperative mode. - # Skip check_dygraph while found them in Inputs. 
- input_np = np.random.random(self.input_shape).astype("float64") - - if self.data_layout == "NCHW": - in_h = self.input_shape[2] - in_w = self.input_shape[3] - else: - in_h = self.input_shape[1] - in_w = self.input_shape[2] - - if self.scale > 0: - out_h = int(in_h * self.scale) - out_w = int(in_w * self.scale) - else: - out_h = self.out_h - out_w = self.out_w - - output_np = bilinear_interp_np( - input_np, - out_h, - out_w, - self.out_size, - self.actual_shape, - self.align_corners, - self.align_mode, - self.data_layout, - ) - self.inputs = {'X': input_np} - if self.out_size is not None: - self.inputs['OutSize'] = self.out_size - if self.actual_shape is not None: - self.inputs['OutSize'] = self.actual_shape - - self.attrs = { - 'out_h': self.out_h, - 'out_w': self.out_w, - 'scale': self.scale, - 'interp_method': self.interp_method, - 'align_corners': self.align_corners, - 'align_mode': self.align_mode, - 'data_layout': self.data_layout, - } - self.outputs = {'Out': output_np} - - def test_check_output(self): - self.check_output(check_dygraph=False) - - def test_check_grad(self): - self.check_grad(['X'], 'Out', in_place=True, check_dygraph=False) - - def init_test_case(self): - self.interp_method = 'bilinear' - self.input_shape = [2, 3, 5, 5] - self.out_h = 2 - self.out_w = 2 - self.scale = 0.0 - self.out_size = np.array([3, 3]).astype("int32") - self.align_corners = True - self.align_mode = 1 - - -class TestBilinearInterpCase1(TestBilinearInterpOp): - def init_test_case(self): - self.interp_method = 'bilinear' - self.input_shape = [4, 1, 7, 8] - self.out_h = 1 - self.out_w = 1 - self.scale = 0.0 - self.align_corners = True - self.align_mode = 1 - - -class TestBilinearInterpCase2(TestBilinearInterpOp): - def init_test_case(self): - self.interp_method = 'bilinear' - self.input_shape = [3, 3, 9, 6] - self.out_h = 12 - self.out_w = 12 - self.scale = 0.0 - self.align_corners = True - self.align_mode = 1 - - -class TestBilinearInterpCase3(TestBilinearInterpOp): - def init_test_case(self): - self.interp_method = 'bilinear' - self.input_shape = [1, 1, 32, 64] - self.out_h = 64 - self.out_w = 32 - self.scale = 0.0 - self.align_corners = True - self.align_mode = 1 - - -class TestBilinearInterpCase4(TestBilinearInterpOp): - def init_test_case(self): - self.interp_method = 'bilinear' - self.input_shape = [4, 1, 7, 8] - self.out_h = 1 - self.out_w = 1 - self.scale = 0.0 - self.out_size = np.array([2, 2]).astype("int32") - self.align_corners = True - self.align_mode = 1 - - -class TestBilinearInterpCase5(TestBilinearInterpOp): - def init_test_case(self): - self.interp_method = 'bilinear' - self.input_shape = [3, 3, 9, 6] - self.out_h = 12 - self.out_w = 12 - self.scale = 0.0 - self.out_size = np.array([11, 11]).astype("int32") - self.align_corners = True - self.align_mode = 1 - - -class TestBilinearInterpCase6(TestBilinearInterpOp): - def init_test_case(self): - self.interp_method = 'bilinear' - self.input_shape = [1, 1, 32, 64] - self.out_h = 64 - self.out_w = 32 - self.scale = 0.0 - self.out_size = np.array([65, 33]).astype("int32") - self.align_corners = True - self.align_mode = 1 - - -class TestBilinearInterpSame(TestBilinearInterpOp): - def init_test_case(self): - self.interp_method = 'bilinear' - self.input_shape = [2, 3, 32, 64] - self.out_h = 32 - self.out_w = 64 - self.scale = 0.0 - self.align_corners = True - self.align_mode = 1 - - -class TestBilinearInterpActualShape(TestBilinearInterpOp): - def init_test_case(self): - self.interp_method = 'bilinear' - self.input_shape = [3, 2, 32, 16] - 
self.out_h = 64 - self.out_w = 32 - self.scale = 0.0 - self.out_size = np.array([66, 40]).astype("int32") - self.align_corners = True - self.align_mode = 1 - - -class TestBilinearInterpDataLayout(TestBilinearInterpOp): - def init_test_case(self): - self.interp_method = 'bilinear' - self.input_shape = [2, 5, 5, 3] - self.out_h = 2 - self.out_w = 2 - self.scale = 0.0 - self.out_size = np.array([3, 3]).astype("int32") - self.align_corners = True - self.align_mode = 1 - self.data_layout = "NHWC" - - -class TestBilinearInterpOpUint8(OpTest): - def setUp(self): - self.out_size = None - self.actual_shape = None - self.init_test_case() - self.op_type = "bilinear_interp" - input_np = np.random.randint( - low=0, high=256, size=self.input_shape - ).astype("uint8") - - if self.scale > 0: - out_h = int(self.input_shape[2] * self.scale) - out_w = int(self.input_shape[3] * self.scale) - else: - out_h = self.out_h - out_w = self.out_w - - output_np = bilinear_interp_np( - input_np, - out_h, - out_w, - self.out_size, - self.actual_shape, - self.align_corners, - self.align_mode, - ) - self.inputs = {'X': input_np} - if self.out_size is not None: - self.inputs['OutSize'] = self.out_size - - self.attrs = { - 'out_h': self.out_h, - 'out_w': self.out_w, - 'scale': self.scale, - 'interp_method': self.interp_method, - 'align_corners': self.align_corners, - 'align_mode': self.align_mode, - } - self.outputs = {'Out': output_np} - - def test_check_output(self): - self.check_output_with_place( - place=core.CPUPlace(), atol=1, check_dygraph=False - ) - - def init_test_case(self): - self.interp_method = 'bilinear' - self.input_shape = [1, 3, 9, 6] - self.out_h = 10 - self.out_w = 9 - self.scale = 0.0 - self.align_corners = True - self.align_mode = 1 - - -class TestBilinearInterpCase1Uint8(TestBilinearInterpOpUint8): - def init_test_case(self): - self.interp_method = 'bilinear' - self.input_shape = [2, 3, 32, 64] - self.out_h = 64 - self.out_w = 32 - self.scale = 0.0 - self.align_corners = True - self.align_mode = 1 - - -class TestBilinearInterpCase2Uint8(TestBilinearInterpOpUint8): - def init_test_case(self): - self.interp_method = 'bilinear' - self.input_shape = [4, 1, 7, 8] - self.out_h = 5 - self.out_w = 13 - self.scale = 0.0 - self.out_size = np.array([6, 15]).astype("int32") - self.align_corners = True - self.align_mode = 1 - - -class TestBilinearInterpOtherMethod1(TestBilinearInterpOp): - def set_align_mode(self): - self.align_corners = False - self.align_mode = 1 - - -class TestBilinearInterpWithMethod2(TestBilinearInterpOp): - def set_align_mode(self): - self.align_corners = False - self.align_mode = 0 - - -class TestBilinearInterpWithMethod3(TestBilinearInterpOp): - def set_align_mode(self): - self.align_corners = True - self.align_mode = 0 - - -class TestBilinearInterpScale1(TestBilinearInterpOp): - def init_test_case(self): - self.interp_method = 'bilinear' - self.input_shape = [2, 3, 5, 7] - self.out_h = 60 - self.out_w = 25 - self.scale = 2.0 - self.align_corners = True - self.align_mode = 1 - - -class TestBilinearInterpScale2(TestBilinearInterpOp): - def init_test_case(self): - self.interp_method = 'bilinear' - self.input_shape = [2, 3, 5, 7] - self.out_h = 60 - self.out_w = 25 - self.scale = 1.0 - self.align_corners = True - self.align_mode = 1 - - -class TestBilinearInterpScale3(TestBilinearInterpOp): - def init_test_case(self): - self.interp_method = 'bilinear' - self.input_shape = [2, 3, 5, 7] - self.out_h = 60 - self.out_w = 25 - self.scale = 1.5 - self.align_corners = True - self.align_mode = 1 - - 
-class TestBilinearInterpZero(TestBilinearInterpOp): - def init_test_case(self): - self.interp_method = 'bilinear' - self.input_shape = [2, 3, 5, 7] - self.out_h = 60 - self.out_w = 25 - self.scale = 0.2 - self.align_corners = False - self.align_mode = 0 - - -class TestBilinearInterpOp_attr_tensor(OpTest): - def setUp(self): - self.out_size = None - self.actual_shape = None - self.init_test_case() - self.op_type = "bilinear_interp" - self.shape_by_1Dtensor = False - self.scale_by_1Dtensor = False - self.attrs = { - 'interp_method': self.interp_method, - 'align_corners': self.align_corners, - } - - input_np = np.random.random(self.input_shape).astype("float64") - self.inputs = {'X': input_np} - - if self.scale_by_1Dtensor: - self.inputs['Scale'] = np.array([self.scale]).astype("float32") - elif self.scale > 0: - out_h = int(self.input_shape[2] * self.scale) - out_w = int(self.input_shape[3] * self.scale) - self.attrs['scale'] = self.scale - else: - out_h = self.out_h - out_w = self.out_w - - if self.shape_by_1Dtensor: - self.inputs['OutSize'] = self.out_size - elif self.out_size is not None: - size_tensor = [] - for index, ele in enumerate(self.out_size): - size_tensor.append( - ("x" + str(index), np.ones(1).astype('int32') * ele) - ) - self.inputs['SizeTensor'] = size_tensor - - self.attrs['out_h'] = self.out_h - self.attrs['out_w'] = self.out_w - output_np = bilinear_interp_np( - input_np, - out_h, - out_w, - self.out_size, - self.actual_shape, - self.align_corners, - ) - self.outputs = {'Out': output_np} - - def test_check_output(self): - self.check_output(check_dygraph=False) - - def test_check_grad(self): - self.check_grad(['X'], 'Out', in_place=True, check_dygraph=False) - - def init_test_case(self): - self.interp_method = 'bilinear' - self.input_shape = [2, 3, 5, 5] - self.out_h = 3 - self.out_w = 3 - self.scale = 0.0 - self.out_size = [3, 3] - self.align_corners = True - - -# out_size is a 1-D tensor -class TestBilinearInterp_attr_tensor_Case1(TestBilinearInterpOp_attr_tensor): - def init_test_case(self): - self.interp_method = 'bilinear' - self.input_shape = [3, 3, 9, 6] - self.out_h = 12 - self.out_w = 12 - self.scale = 0.0 - self.out_size = [8, 12] - self.align_corners = True - - -# scale is a 1-D tensor -class TestBilinearInterp_attr_tensor_Case2(TestBilinearInterpOp_attr_tensor): - def init_test_case(self): - self.interp_method = 'bilinear' - self.input_shape = [3, 2, 32, 16] - self.out_h = 64 - self.out_w = 32 - self.scale = 0.0 - self.out_size = np.array([66, 40]).astype("int32") - self.align_corners = True - self.shape_by_1Dtensor = True - - -# scale is a 1-D tensor -class TestBilinearInterp_attr_tensor_Case3(TestBilinearInterpOp_attr_tensor): - def init_test_case(self): - self.interp_method = 'bilinear' - self.input_shape = [3, 2, 32, 16] - self.out_h = 64 - self.out_w = 32 - self.scale = 2.0 - self.out_size = None - self.align_corners = True - self.scale_by_1Dtensor = True - - -if __name__ == "__main__": - unittest.main() diff --git a/test/legacy_test/test_generate_proposals_op.py b/test/legacy_test/test_generate_proposals_op.py deleted file mode 100644 index 901d009effc5bc..00000000000000 --- a/test/legacy_test/test_generate_proposals_op.py +++ /dev/null @@ -1,452 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy -import math -import unittest - -import numpy as np -from op_test import OpTest -from test_anchor_generator_op import anchor_generator_in_python - -import paddle - - -def generate_proposals_in_python( - scores, - bbox_deltas, - im_info, - anchors, - variances, - pre_nms_topN, - post_nms_topN, - nms_thresh, - min_size, - eta, -): - all_anchors = anchors.reshape(-1, 4) - rois = np.empty((0, 5), dtype=np.float32) - roi_probs = np.empty((0, 1), dtype=np.float32) - - rpn_rois = [] - rpn_roi_probs = [] - rois_num = [] - num_images = scores.shape[0] - for img_idx in range(num_images): - img_i_boxes, img_i_probs = proposal_for_one_image( - im_info[img_idx, :], - all_anchors, - variances, - bbox_deltas[img_idx, :, :, :], - scores[img_idx, :, :, :], - pre_nms_topN, - post_nms_topN, - nms_thresh, - min_size, - eta, - ) - rois_num.append(img_i_probs.shape[0]) - rpn_rois.append(img_i_boxes) - rpn_roi_probs.append(img_i_probs) - - return rpn_rois, rpn_roi_probs, rois_num - - -def proposal_for_one_image( - im_info, - all_anchors, - variances, - bbox_deltas, - scores, - pre_nms_topN, - post_nms_topN, - nms_thresh, - min_size, - eta, -): - # Transpose and reshape predicted bbox transformations to get them - # into the same order as the anchors: - # - bbox deltas will be (4 * A, H, W) format from conv output - # - transpose to (H, W, 4 * A) - # - reshape to (H * W * A, 4) where rows are ordered by (H, W, A) - # in slowest to fastest order to match the enumerated anchors - bbox_deltas = bbox_deltas.transpose((1, 2, 0)).reshape(-1, 4) - all_anchors = all_anchors.reshape(-1, 4) - variances = variances.reshape(-1, 4) - # Same story for the scores: - # - scores are (A, H, W) format from conv output - # - transpose to (H, W, A) - # - reshape to (H * W * A, 1) where rows are ordered by (H, W, A) - # to match the order of anchors and bbox_deltas - scores = scores.transpose((1, 2, 0)).reshape(-1, 1) - - # sort all (proposal, score) pairs by score from highest to lowest - # take top pre_nms_topN (e.g. 6000) - if pre_nms_topN <= 0 or pre_nms_topN >= len(scores): - order = np.argsort(-scores.squeeze()) - else: - # Avoid sorting possibly large arrays; - # First partition to get top K unsorted - # and then sort just those - inds = np.argpartition(-scores.squeeze(), pre_nms_topN)[:pre_nms_topN] - order = np.argsort(-scores[inds].squeeze()) - order = inds[order] - scores = scores[order, :] - bbox_deltas = bbox_deltas[order, :] - all_anchors = all_anchors[order, :] - proposals = box_coder(all_anchors, bbox_deltas, variances) - # clip proposals to image (may result in proposals with zero area - # that will be removed in the next step) - proposals = clip_tiled_boxes(proposals, im_info[:2]) - # remove predicted boxes with height or width < min_size - keep = filter_boxes(proposals, min_size, im_info) - if len(keep) == 0: - proposals = np.zeros((1, 4)).astype('float32') - scores = np.zeros((1, 1)).astype('float32') - return proposals, scores - proposals = proposals[keep, :] - scores = scores[keep, :] - - # apply loose nms (e.g. threshold = 0.7) - # take post_nms_topN (e.g. 
1000) - # return the top proposals - if nms_thresh > 0: - keep = nms( - boxes=proposals, scores=scores, nms_threshold=nms_thresh, eta=eta - ) - if post_nms_topN > 0 and post_nms_topN < len(keep): - keep = keep[:post_nms_topN] - proposals = proposals[keep, :] - scores = scores[keep, :] - - return proposals, scores - - -def box_coder(all_anchors, bbox_deltas, variances, pixel_offset=True): - """ - Decode proposals by anchors and bbox_deltas from RPN - """ - offset = 1 if pixel_offset else 0 - # proposals: xmin, ymin, xmax, ymax - proposals = np.zeros_like(bbox_deltas, dtype=np.float32) - - # anchor_loc: width, height, center_x, center_y - anchor_loc = np.zeros_like(bbox_deltas, dtype=np.float32) - - anchor_loc[:, 0] = all_anchors[:, 2] - all_anchors[:, 0] + offset - anchor_loc[:, 1] = all_anchors[:, 3] - all_anchors[:, 1] + offset - anchor_loc[:, 2] = all_anchors[:, 0] + 0.5 * anchor_loc[:, 0] - anchor_loc[:, 3] = all_anchors[:, 1] + 0.5 * anchor_loc[:, 1] - - # predicted bbox: bbox_center_x, bbox_center_y, bbox_width, bbox_height - pred_bbox = np.zeros_like(bbox_deltas, dtype=np.float32) - if variances is not None: - for i in range(bbox_deltas.shape[0]): - pred_bbox[i, 0] = ( - variances[i, 0] * bbox_deltas[i, 0] * anchor_loc[i, 0] - + anchor_loc[i, 2] - ) - pred_bbox[i, 1] = ( - variances[i, 1] * bbox_deltas[i, 1] * anchor_loc[i, 1] - + anchor_loc[i, 3] - ) - pred_bbox[i, 2] = ( - math.exp( - min( - variances[i, 2] * bbox_deltas[i, 2], - math.log(1000 / 16.0), - ) - ) - * anchor_loc[i, 0] - ) - pred_bbox[i, 3] = ( - math.exp( - min( - variances[i, 3] * bbox_deltas[i, 3], - math.log(1000 / 16.0), - ) - ) - * anchor_loc[i, 1] - ) - else: - for i in range(bbox_deltas.shape[0]): - pred_bbox[i, 0] = ( - bbox_deltas[i, 0] * anchor_loc[i, 0] + anchor_loc[i, 2] - ) - pred_bbox[i, 1] = ( - bbox_deltas[i, 1] * anchor_loc[i, 1] + anchor_loc[i, 3] - ) - pred_bbox[i, 2] = ( - math.exp(min(bbox_deltas[i, 2], math.log(1000 / 16.0))) - * anchor_loc[i, 0] - ) - pred_bbox[i, 3] = ( - math.exp(min(bbox_deltas[i, 3], math.log(1000 / 16.0))) - * anchor_loc[i, 1] - ) - proposals[:, 0] = pred_bbox[:, 0] - pred_bbox[:, 2] / 2 - proposals[:, 1] = pred_bbox[:, 1] - pred_bbox[:, 3] / 2 - proposals[:, 2] = pred_bbox[:, 0] + pred_bbox[:, 2] / 2 - offset - proposals[:, 3] = pred_bbox[:, 1] + pred_bbox[:, 3] / 2 - offset - - return proposals - - -def clip_tiled_boxes(boxes, im_shape, pixel_offset=True): - """Clip boxes to image boundaries. im_shape is [height, width] and boxes - has shape (N, 4 * num_tiled_boxes).""" - assert ( - boxes.shape[1] % 4 == 0 - ), f'boxes.shape[1] is {boxes.shape[1]:d}, but must be divisible by 4.' 
- offset = 1 if pixel_offset else 0 - # x1 >= 0 - boxes[:, 0::4] = np.maximum( - np.minimum(boxes[:, 0::4], im_shape[1] - offset), 0 - ) - # y1 >= 0 - boxes[:, 1::4] = np.maximum( - np.minimum(boxes[:, 1::4], im_shape[0] - offset), 0 - ) - # x2 < im_shape[1] - boxes[:, 2::4] = np.maximum( - np.minimum(boxes[:, 2::4], im_shape[1] - offset), 0 - ) - # y2 < im_shape[0] - boxes[:, 3::4] = np.maximum( - np.minimum(boxes[:, 3::4], im_shape[0] - offset), 0 - ) - return boxes - - -def filter_boxes(boxes, min_size, im_info, pixel_offset=True): - """Only keep boxes with both sides >= min_size and center within the image.""" - # Scale min_size to match image scale - im_scale = im_info[2] - min_size = max(min_size, 1.0) - offset = 1 if pixel_offset else 0 - ws = boxes[:, 2] - boxes[:, 0] + offset - hs = boxes[:, 3] - boxes[:, 1] + offset - if pixel_offset: - ws_orig_scale = (boxes[:, 2] - boxes[:, 0]) / im_scale + 1 - hs_orig_scale = (boxes[:, 3] - boxes[:, 1]) / im_scale + 1 - x_ctr = boxes[:, 0] + ws / 2.0 - y_ctr = boxes[:, 1] + hs / 2.0 - keep = np.where( - (ws_orig_scale >= min_size) - & (hs_orig_scale >= min_size) - & (x_ctr < im_info[1]) - & (y_ctr < im_info[0]) - )[0] - else: - keep = np.where((ws >= min_size) & (hs >= min_size))[0] - return keep - - -def iou(box_a, box_b, pixel_offset=True): - """ - Apply intersection-over-union overlap between box_a and box_b - """ - xmin_a = min(box_a[0], box_a[2]) - ymin_a = min(box_a[1], box_a[3]) - xmax_a = max(box_a[0], box_a[2]) - ymax_a = max(box_a[1], box_a[3]) - - xmin_b = min(box_b[0], box_b[2]) - ymin_b = min(box_b[1], box_b[3]) - xmax_b = max(box_b[0], box_b[2]) - ymax_b = max(box_b[1], box_b[3]) - offset = 1 if pixel_offset else 0 - area_a = (ymax_a - ymin_a + offset) * (xmax_a - xmin_a + offset) - area_b = (ymax_b - ymin_b + offset) * (xmax_b - xmin_b + offset) - if area_a <= 0 and area_b <= 0: - return 0.0 - - xa = max(xmin_a, xmin_b) - ya = max(ymin_a, ymin_b) - xb = min(xmax_a, xmax_b) - yb = min(ymax_a, ymax_b) - - inter_area = max(xb - xa + offset, 0.0) * max(yb - ya + offset, 0.0) - - iou_ratio = inter_area / (area_a + area_b - inter_area) - - return iou_ratio - - -def nms(boxes, scores, nms_threshold, eta=1.0, pixel_offset=True): - """Apply non-maximum suppression at test time to avoid detecting too many - overlapping bounding boxes for a given object. - Args: - boxes: (tensor) The location preds for the img, Shape: [num_priors,4]. - scores: (tensor) The class predscores for the img, Shape:[num_priors]. - nms_threshold: (float) The overlap thresh for suppressing unnecessary - boxes. - eta: (float) The parameter for adaptive NMS. - Return: - The indices of the kept boxes with respect to num_priors. 
- """ - all_scores = copy.deepcopy(scores) - all_scores = all_scores.flatten() - - sorted_indices = np.argsort(-all_scores, axis=0, kind='mergesort') - sorted_scores = all_scores[sorted_indices] - selected_indices = [] - adaptive_threshold = nms_threshold - for i in range(sorted_scores.shape[0]): - idx = sorted_indices[i] - keep = True - for k in range(len(selected_indices)): - if keep: - kept_idx = selected_indices[k] - overlap = iou( - boxes[idx], boxes[kept_idx], pixel_offset=pixel_offset - ) - keep = True if overlap <= adaptive_threshold else False - else: - break - if keep: - selected_indices.append(idx) - if keep and eta < 1 and adaptive_threshold > 0.5: - adaptive_threshold *= eta - return selected_indices - - -class TestGenerateProposalsOp(OpTest): - def set_data(self): - self.init_test_params() - self.init_test_input() - self.init_test_output() - self.inputs = { - 'Scores': self.scores, - 'BboxDeltas': self.bbox_deltas, - 'ImInfo': self.im_info.astype(np.float32), - 'Anchors': self.anchors, - 'Variances': self.variances, - } - - self.attrs = { - 'pre_nms_topN': self.pre_nms_topN, - 'post_nms_topN': self.post_nms_topN, - 'nms_thresh': self.nms_thresh, - 'min_size': self.min_size, - 'eta': self.eta, - } - - self.outputs = { - 'RpnRois': (self.rpn_rois[0], [self.rois_num]), - 'RpnRoiProbs': (self.rpn_roi_probs[0], [self.rois_num]), - } - - def test_check_output(self): - # NODE(yjjiang11): This op will be deprecated. - self.check_output(check_dygraph=False) - - def setUp(self): - self.op_type = "generate_proposals" - self.set_data() - - def init_test_params(self): - self.pre_nms_topN = 12000 # train 12000, test 2000 - self.post_nms_topN = 5000 # train 6000, test 1000 - self.nms_thresh = 0.7 - self.min_size = 3.0 - self.eta = 1.0 - - def init_test_input(self): - batch_size = 1 - input_channels = 20 - layer_h = 16 - layer_w = 16 - input_feat = np.random.random( - (batch_size, input_channels, layer_h, layer_w) - ).astype('float32') - self.anchors, self.variances = anchor_generator_in_python( - input_feat=input_feat, - anchor_sizes=[16.0, 32.0], - aspect_ratios=[0.5, 1.0], - variances=[1.0, 1.0, 1.0, 1.0], - stride=[16.0, 16.0], - offset=0.5, - ) - self.im_info = np.array( - [[64.0, 64.0, 8.0]] - ) # im_height, im_width, scale - num_anchors = self.anchors.shape[2] - self.scores = np.random.random( - (batch_size, num_anchors, layer_h, layer_w) - ).astype('float32') - self.bbox_deltas = np.random.random( - (batch_size, num_anchors * 4, layer_h, layer_w) - ).astype('float32') - - def init_test_output(self): - ( - self.rpn_rois, - self.rpn_roi_probs, - self.rois_num, - ) = generate_proposals_in_python( - self.scores, - self.bbox_deltas, - self.im_info, - self.anchors, - self.variances, - self.pre_nms_topN, - self.post_nms_topN, - self.nms_thresh, - self.min_size, - self.eta, - ) - - -class TestGenerateProposalsOutLodOp(TestGenerateProposalsOp): - def set_data(self): - self.init_test_params() - self.init_test_input() - self.init_test_output() - self.inputs = { - 'Scores': self.scores, - 'BboxDeltas': self.bbox_deltas, - 'ImInfo': self.im_info.astype(np.float32), - 'Anchors': self.anchors, - 'Variances': self.variances, - } - - self.attrs = { - 'pre_nms_topN': self.pre_nms_topN, - 'post_nms_topN': self.post_nms_topN, - 'nms_thresh': self.nms_thresh, - 'min_size': self.min_size, - 'eta': self.eta, - 'return_rois_num': True, - } - - self.outputs = { - 'RpnRois': (self.rpn_rois[0], [self.rois_num]), - 'RpnRoiProbs': (self.rpn_roi_probs[0], [self.rois_num]), - 'RpnRoisNum': 
(np.asarray(self.rois_num, dtype=np.int32)), - } - - -class TestGenerateProposalsOpNoBoxLeft(TestGenerateProposalsOp): - def init_test_params(self): - self.pre_nms_topN = 12000 # train 12000, test 2000 - self.post_nms_topN = 5000 # train 6000, test 1000 - self.nms_thresh = 0.7 - self.min_size = 1000.0 - self.eta = 1.0 - - -if __name__ == '__main__': - paddle.enable_static() - unittest.main() diff --git a/test/legacy_test/test_generate_proposals_v2_op.py b/test/legacy_test/test_generate_proposals_v2_op.py index 87e9e6c60fe7d6..b0eaf05ea6a753 100644 --- a/test/legacy_test/test_generate_proposals_v2_op.py +++ b/test/legacy_test/test_generate_proposals_v2_op.py @@ -12,16 +12,182 @@ # See the License for the specific language governing permissions and # limitations under the License. +import copy +import math import unittest import numpy as np from op_test import OpTest from test_anchor_generator_op import anchor_generator_in_python -from test_generate_proposals_op import box_coder, clip_tiled_boxes, nms import paddle +def box_coder(all_anchors, bbox_deltas, variances, pixel_offset=True): + """ + Decode proposals by anchors and bbox_deltas from RPN + """ + offset = 1 if pixel_offset else 0 + # proposals: xmin, ymin, xmax, ymax + proposals = np.zeros_like(bbox_deltas, dtype=np.float32) + + # anchor_loc: width, height, center_x, center_y + anchor_loc = np.zeros_like(bbox_deltas, dtype=np.float32) + + anchor_loc[:, 0] = all_anchors[:, 2] - all_anchors[:, 0] + offset + anchor_loc[:, 1] = all_anchors[:, 3] - all_anchors[:, 1] + offset + anchor_loc[:, 2] = all_anchors[:, 0] + 0.5 * anchor_loc[:, 0] + anchor_loc[:, 3] = all_anchors[:, 1] + 0.5 * anchor_loc[:, 1] + + # predicted bbox: bbox_center_x, bbox_center_y, bbox_width, bbox_height + pred_bbox = np.zeros_like(bbox_deltas, dtype=np.float32) + if variances is not None: + for i in range(bbox_deltas.shape[0]): + pred_bbox[i, 0] = ( + variances[i, 0] * bbox_deltas[i, 0] * anchor_loc[i, 0] + + anchor_loc[i, 2] + ) + pred_bbox[i, 1] = ( + variances[i, 1] * bbox_deltas[i, 1] * anchor_loc[i, 1] + + anchor_loc[i, 3] + ) + pred_bbox[i, 2] = ( + math.exp( + min( + variances[i, 2] * bbox_deltas[i, 2], + math.log(1000 / 16.0), + ) + ) + * anchor_loc[i, 0] + ) + pred_bbox[i, 3] = ( + math.exp( + min( + variances[i, 3] * bbox_deltas[i, 3], + math.log(1000 / 16.0), + ) + ) + * anchor_loc[i, 1] + ) + else: + for i in range(bbox_deltas.shape[0]): + pred_bbox[i, 0] = ( + bbox_deltas[i, 0] * anchor_loc[i, 0] + anchor_loc[i, 2] + ) + pred_bbox[i, 1] = ( + bbox_deltas[i, 1] * anchor_loc[i, 1] + anchor_loc[i, 3] + ) + pred_bbox[i, 2] = ( + math.exp(min(bbox_deltas[i, 2], math.log(1000 / 16.0))) + * anchor_loc[i, 0] + ) + pred_bbox[i, 3] = ( + math.exp(min(bbox_deltas[i, 3], math.log(1000 / 16.0))) + * anchor_loc[i, 1] + ) + proposals[:, 0] = pred_bbox[:, 0] - pred_bbox[:, 2] / 2 + proposals[:, 1] = pred_bbox[:, 1] - pred_bbox[:, 3] / 2 + proposals[:, 2] = pred_bbox[:, 0] + pred_bbox[:, 2] / 2 - offset + proposals[:, 3] = pred_bbox[:, 1] + pred_bbox[:, 3] / 2 - offset + + return proposals + + +def clip_tiled_boxes(boxes, im_shape, pixel_offset=True): + """Clip boxes to image boundaries. im_shape is [height, width] and boxes + has shape (N, 4 * num_tiled_boxes).""" + assert ( + boxes.shape[1] % 4 == 0 + ), f'boxes.shape[1] is {boxes.shape[1]:d}, but must be divisible by 4.' 
+ offset = 1 if pixel_offset else 0 + # x1 >= 0 + boxes[:, 0::4] = np.maximum( + np.minimum(boxes[:, 0::4], im_shape[1] - offset), 0 + ) + # y1 >= 0 + boxes[:, 1::4] = np.maximum( + np.minimum(boxes[:, 1::4], im_shape[0] - offset), 0 + ) + # x2 < im_shape[1] + boxes[:, 2::4] = np.maximum( + np.minimum(boxes[:, 2::4], im_shape[1] - offset), 0 + ) + # y2 < im_shape[0] + boxes[:, 3::4] = np.maximum( + np.minimum(boxes[:, 3::4], im_shape[0] - offset), 0 + ) + return boxes + + +def iou(box_a, box_b, pixel_offset=True): + """ + Apply intersection-over-union overlap between box_a and box_b + """ + xmin_a = min(box_a[0], box_a[2]) + ymin_a = min(box_a[1], box_a[3]) + xmax_a = max(box_a[0], box_a[2]) + ymax_a = max(box_a[1], box_a[3]) + + xmin_b = min(box_b[0], box_b[2]) + ymin_b = min(box_b[1], box_b[3]) + xmax_b = max(box_b[0], box_b[2]) + ymax_b = max(box_b[1], box_b[3]) + offset = 1 if pixel_offset else 0 + area_a = (ymax_a - ymin_a + offset) * (xmax_a - xmin_a + offset) + area_b = (ymax_b - ymin_b + offset) * (xmax_b - xmin_b + offset) + if area_a <= 0 and area_b <= 0: + return 0.0 + + xa = max(xmin_a, xmin_b) + ya = max(ymin_a, ymin_b) + xb = min(xmax_a, xmax_b) + yb = min(ymax_a, ymax_b) + + inter_area = max(xb - xa + offset, 0.0) * max(yb - ya + offset, 0.0) + + iou_ratio = inter_area / (area_a + area_b - inter_area) + + return iou_ratio + + +def nms(boxes, scores, nms_threshold, eta=1.0, pixel_offset=True): + """Apply non-maximum suppression at test time to avoid detecting too many + overlapping bounding boxes for a given object. + Args: + boxes: (tensor) The location preds for the img, Shape: [num_priors,4]. + scores: (tensor) The class predscores for the img, Shape:[num_priors]. + nms_threshold: (float) The overlap thresh for suppressing unnecessary + boxes. + eta: (float) The parameter for adaptive NMS. + Return: + The indices of the kept boxes with respect to num_priors. + """ + all_scores = copy.deepcopy(scores) + all_scores = all_scores.flatten() + + sorted_indices = np.argsort(-all_scores, axis=0, kind='mergesort') + sorted_scores = all_scores[sorted_indices] + selected_indices = [] + adaptive_threshold = nms_threshold + for i in range(sorted_scores.shape[0]): + idx = sorted_indices[i] + keep = True + for k in range(len(selected_indices)): + if keep: + kept_idx = selected_indices[k] + overlap = iou( + boxes[idx], boxes[kept_idx], pixel_offset=pixel_offset + ) + keep = True if overlap <= adaptive_threshold else False + else: + break + if keep: + selected_indices.append(idx) + if keep and eta < 1 and adaptive_threshold > 0.5: + adaptive_threshold *= eta + return selected_indices + + def python_generate_proposals_v2( scores, bbox_deltas, diff --git a/test/legacy_test/test_linear_interp_op.py b/test/legacy_test/test_linear_interp_op.py deleted file mode 100755 index f5bd1e7e103d10..00000000000000 --- a/test/legacy_test/test_linear_interp_op.py +++ /dev/null @@ -1,381 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import platform -import unittest - -import numpy as np -from op_test import OpTest - -import paddle -from paddle import base -from paddle.base import core -from paddle.pir_utils import test_with_pir_api - - -def linear_interp_np( - input, - out_w, - out_size=None, - actual_shape=None, - align_corners=True, - align_mode=0, - data_layout='NCHW', -): - if data_layout == "NHWC": - input = np.transpose(input, (0, 2, 1)) # NHWC => NCHW - if out_size is not None: - out_w = out_size[0] - if actual_shape is not None: - out_w = actual_shape[0] - batch_size, channel, in_w = input.shape - - ratio_w = 0.0 - if out_w > 1: - if align_corners: - ratio_w = (in_w - 1.0) / (out_w - 1.0) - else: - ratio_w = 1.0 * in_w / out_w - - out = np.zeros((batch_size, channel, out_w)) - - for j in range(out_w): - if align_mode == 0 and not align_corners: - w = int(ratio_w * (j + 0.5) - 0.5) - else: - w = int(ratio_w * j) - w = max(0, w) - wid = 1 if w < in_w - 1 else 0 - - if align_mode == 0 and not align_corners: - idx_src_w = max(ratio_w * (j + 0.5) - 0.5, 0) - w1lambda = idx_src_w - w - else: - w1lambda = ratio_w * j - w - w2lambda = 1.0 - w1lambda - - out[:, :, j] = ( - w2lambda * input[:, :, w] + w1lambda * input[:, :, w + wid] - ) - - if data_layout == "NHWC": - out = np.transpose(out, (0, 2, 1)) # NCHW => NHWC - - return out.astype(input.dtype) - - -class TestLinearInterpOp(OpTest): - def setUp(self): - self.out_size = None - self.actual_shape = None - self.data_layout = 'NCHW' - self.init_test_case() - self.op_type = "linear_interp" - input_np = np.random.random(self.input_shape).astype("float64") - - if self.data_layout == "NCHW": - in_w = self.input_shape[2] - else: - in_w = self.input_shape[1] - - if self.scale > 0: - out_w = int(in_w * self.scale) - else: - out_w = self.out_w - - output_np = linear_interp_np( - input_np, - out_w, - self.out_size, - self.actual_shape, - self.align_corners, - self.align_mode, - self.data_layout, - ) - self.inputs = {'X': input_np} - if self.out_size is not None: - self.inputs['OutSize'] = self.out_size - if self.actual_shape is not None: - self.inputs['OutSize'] = self.actual_shape - - self.attrs = { - 'out_w': self.out_w, - 'scale': self.scale, - 'interp_method': self.interp_method, - 'align_corners': self.align_corners, - 'align_mode': self.align_mode, - 'data_layout': self.data_layout, - } - self.outputs = {'Out': output_np} - - def test_check_output(self): - if platform.system() == "Linux": - self.check_output(atol=1e-7, check_dygraph=False) - else: - self.check_output(atol=1e-5, check_dygraph=False) - - def test_check_grad(self): - self.check_grad(['X'], 'Out', in_place=True, check_dygraph=False) - - def init_test_case(self): - self.interp_method = 'linear' - self.input_shape = [1, 3, 100] - self.out_w = 50 - self.scale = 0.0 - self.out_size = np.array( - [ - 50, - ] - ).astype("int32") - self.align_corners = False - self.align_mode = 1 - - -class TestLinearInterpOpDataLayout(TestLinearInterpOp): - def init_test_case(self): - self.interp_method = 'linear' - self.input_shape = [1, 3, 100] - self.out_w = 50 - self.scale = 0.0 - self.out_size = np.array( - [ - 50, - ] - ).astype("int32") - self.align_corners = False - self.align_mode = 1 - self.data_layout = 'NHWC' - - -class TestLinearInterpOpAlignMode(TestLinearInterpOp): - def init_test_case(self): - self.interp_method = 'linear' - self.input_shape = [1, 3, 100] - self.out_w = 50 - self.scale = 0.0 - self.out_size = np.array( - [ - 50, - ] - ).astype("int32") - self.align_corners = False - self.align_mode = 0 - - 
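The deleted 1-D cases above all reduce to the linear_interp_np reference defined at the top of this file; align_corners only changes how the ratio between input and output widths is computed. A small sanity check reusing that helper (a sketch for illustration, assuming the helper as defined in the removed file; not part of the patch):

    import numpy as np

    # align_corners=True maps input endpoints onto output endpoints, so
    # ratio_w = (in_w - 1) / (out_w - 1); otherwise ratio_w = in_w / out_w.
    x = np.arange(4, dtype="float64").reshape(1, 1, 4)  # NCW layout, in_w = 4
    out = linear_interp_np(x, out_w=7, align_corners=True)
    assert out.shape == (1, 1, 7)
    # Endpoints of the input are preserved under align_corners=True.
    assert out[0, 0, 0] == x[0, 0, 0] and out[0, 0, -1] == x[0, 0, -1]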
-class TestLinearInterpOpScale(TestLinearInterpOp): - def init_test_case(self): - self.interp_method = 'linear' - self.input_shape = [1, 3, 100] - self.out_w = 50 - self.scale = 0.5 - self.out_size = np.array( - [ - 50, - ] - ).astype("int32") - self.align_corners = False - self.align_mode = 0 - - -class TestLinearInterpOpSizeTensor(TestLinearInterpOp): - def setUp(self): - self.out_size = None - self.actual_shape = None - self.data_layout = 'NCHW' - self.init_test_case() - self.op_type = "linear_interp" - input_np = np.random.random(self.input_shape).astype("float64") - self.shape_by_1Dtensor = False - self.scale_by_1Dtensor = False - - if self.data_layout == "NCHW": - in_w = self.input_shape[2] - else: - in_w = self.input_shape[1] - - if self.scale > 0: - out_w = int(in_w * self.scale) - else: - out_w = self.out_w - - output_np = linear_interp_np( - input_np, - out_w, - self.out_size, - self.actual_shape, - self.align_corners, - self.align_mode, - self.data_layout, - ) - - self.inputs = {'X': input_np} - if self.out_size is not None and self.shape_by_1Dtensor: - self.inputs['OutSize'] = self.out_size - elif self.actual_shape is not None and self.shape_by_1Dtensor: - self.inputs['OutSize'] = self.actual_shape - else: - size_tensor = [] - for index, ele in enumerate(self.out_size): - size_tensor.append( - ("x" + str(index), np.ones(1).astype('int32') * ele) - ) - self.inputs['SizeTensor'] = size_tensor - - self.attrs = { - 'out_w': self.out_w, - 'scale': self.scale, - 'interp_method': self.interp_method, - 'align_corners': self.align_corners, - 'align_mode': self.align_mode, - 'data_layout': self.data_layout, - } - self.outputs = {'Out': output_np} - - -class TestLinearInterpOpAPI2_0(unittest.TestCase): - def test_case(self): - # dygraph - x_data = np.random.random((1, 3, 128)).astype("float32") - us_1 = paddle.nn.Upsample( - size=[ - 64, - ], - mode='linear', - align_mode=1, - align_corners=False, - data_format='NCW', - ) - with base.dygraph.guard(): - x = paddle.to_tensor(x_data) - interp = us_1(x) - - expect = linear_interp_np( - x_data, out_w=64, align_mode=1, align_corners=False - ) - - np.testing.assert_allclose(interp.numpy(), expect, rtol=1e-05) - - -class TestResizeLinearOpUint8(OpTest): - def setUp(self): - self.out_size = None - self.actual_shape = None - self.init_test_case() - self.op_type = "linear_interp" - input_np = np.random.random(self.input_shape).astype("uint8") - - if self.scale > 0: - out_w = int(self.input_shape[3] * self.scale) - else: - out_w = self.out_w - - output_np = linear_interp_np( - input_np, - out_w, - self.out_size, - self.actual_shape, - self.align_corners, - self.align_mode, - ) - self.inputs = {'X': input_np} - if self.out_size is not None: - self.inputs['OutSize'] = self.out_size - - self.attrs = { - 'out_w': self.out_w, - 'scale': self.scale, - 'interp_method': self.interp_method, - 'align_corners': self.align_corners, - 'align_mode': self.align_mode, - } - self.outputs = {'Out': output_np} - - def test_check_output(self): - if platform.system() == "Linux": - self.check_output_with_place( - place=core.CPUPlace(), atol=1e-7, check_dygraph=False - ) - else: - self.check_output_with_place( - place=core.CPUPlace(), atol=1e-5, check_dygraph=False - ) - - def init_test_case(self): - self.interp_method = 'linear' - self.input_shape = [2, 3, 100] - self.out_w = 50 - self.scale = 0.0 - self.out_size = np.array( - [ - 50, - ] - ).astype("int32") - self.align_corners = True - self.align_mode = 1 - - -class TestLinearInterpOpError(unittest.TestCase): - 
@test_with_pir_api - def test_error(self): - paddle.enable_static() - with paddle.static.program_guard( - paddle.static.Program(), paddle.static.Program() - ): - - def input_shape_error(): - x1 = paddle.static.data(name="x1", shape=[1], dtype="float32") - out1 = paddle.nn.Upsample( - size=[ - 256, - ], - data_format='NCW', - mode='linear', - ) - out1_res = out1(x1) - - def data_format_error(): - x2 = paddle.static.data( - name="x2", shape=[1, 3, 128], dtype="float32" - ) - out2 = paddle.nn.Upsample( - size=[ - 256, - ], - data_format='NHWCD', - mode='linear', - ) - out2_res = out2(x2) - - def out_shape_error(): - x3 = paddle.static.data( - name="x3", shape=[1, 3, 128], dtype="float32" - ) - out3 = paddle.nn.Upsample( - size=[ - 256, - 256, - ], - data_format='NHWC', - mode='linear', - ) - out3_res = out3(x3) - - self.assertRaises(ValueError, input_shape_error) - self.assertRaises(ValueError, data_format_error) - self.assertRaises(ValueError, out_shape_error) - paddle.disable_static() - - -if __name__ == "__main__": - unittest.main() diff --git a/test/legacy_test/test_lookup_table_op.py b/test/legacy_test/test_lookup_table_op.py deleted file mode 100644 index 42c9844ddd9853..00000000000000 --- a/test/legacy_test/test_lookup_table_op.py +++ /dev/null @@ -1,437 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -from op import Operator -from op_test import ( - OpTest, - check_out_dtype, - skip_check_grad_ci, -) - -import paddle.nn.functional as F -from paddle.base import core - - -class TestLookupTableOp(OpTest): - def setUp(self): - self.op_type = "lookup_table" - table = np.random.random((17, 31)).astype("float64") - ids = np.random.randint(0, 17, 4).astype("int64") - ids_expand = np.expand_dims(ids, axis=1) - self.inputs = {'W': table, 'Ids': ids_expand} - self.outputs = {'Out': table[ids]} - - def test_check_output(self): - self.check_output(check_cinn=True) - - def test_check_grad(self): - self.check_grad(['W'], 'Out', no_grad_set=set('Ids'), check_cinn=True) - - -class TestLookupTableOpWithTensorIds(OpTest): - def setUp(self): - self.op_type = "lookup_table" - table = np.random.random((17, 31)).astype("float64") - ids = np.random.randint(low=0, high=17, size=(2, 4, 5, 1)).astype( - "int64" - ) - self.inputs = {'W': table, 'Ids': ids} - self.outputs = {'Out': table[ids.flatten()].reshape((2, 4, 5, 31))} - - def test_check_output(self): - self.check_output(check_cinn=True) - - def test_check_grad(self): - self.check_grad(['W'], 'Out', no_grad_set=set('Ids'), check_cinn=True) - - -@skip_check_grad_ci( - reason="Since paddings are not trainable and fixed in forward," - "the gradient of paddings makes no sense and we don't " - "test the gradient here." 
-) -class TestLookupTableOpWithPadding(TestLookupTableOp): - def test_check_output(self): - ids = np.squeeze(self.inputs['Ids']) - padding_idx = np.random.choice(ids, 1)[0] - self.outputs['Out'][ids == padding_idx] = np.zeros(31) - self.attrs = {'padding_idx': int(padding_idx)} - self.check_output(check_cinn=True) - - -@skip_check_grad_ci( - reason="Since paddings are not trainable and fixed in forward," - "the gradient of paddings makes no sense and we don't " - "test the gradient here." -) -class TestLookupTableOpWithTensorIdsAndPadding(TestLookupTableOpWithTensorIds): - def test_check_output(self): - ids = self.inputs['Ids'] - flatten_idx = ids.flatten() - padding_idx = np.random.choice(flatten_idx, 1)[0] - self.outputs['Out'][np.squeeze(ids == padding_idx)] = np.zeros(31) - self.attrs = {'padding_idx': padding_idx} - self.check_output(check_cinn=True) - - -class TestLookupTableWIsSelectedRows(unittest.TestCase): - def prepare_ids(self, scope, place): - ids_tensor = scope.var('Ids').get_tensor() - ids_array = np.array([[0], [4], [3], [5]]).astype("int64") - ids_tensor.set(ids_array, place) - return ids_array - - def prepare_w(self, scope, place): - rows = [0, 1, 2, 3, 4, 5, 6] - row_numel = 12 - - w_selected_rows = scope.var('W').get_selected_rows() - w_selected_rows.set_height(len(rows)) - w_selected_rows.set_rows(rows) - w_array = np.ones((len(rows), row_numel)).astype("float32") - for i in range(len(rows)): - w_array[i] *= i - w_tensor = w_selected_rows.get_tensor() - w_tensor.set(w_array, place) - - def create_out_tensor(self, scope, place): - return scope.var('Out').get_tensor() - - def check_result(self, ids_array, result_array): - # all(): return True if all elements of the iterable are true (or if the iterable is empty) - for idx, row in enumerate(ids_array): - assert (row[0] == result_array[idx]).all() - - def check_with_place(self, place): - scope = core.Scope() - - ids_array = self.prepare_ids(scope, place) - - self.prepare_w(scope, place) - - out_tensor = self.create_out_tensor(scope, place) - - # create and run lookup_table operator - lookup_table = Operator("lookup_table", W='W', Ids='Ids', Out='Out') - lookup_table.run(scope, place) - - # get result from Out - result_array = np.array(out_tensor) - - self.check_result(ids_array, result_array) - - def test_w_is_selected_rows(self): - places = [core.CPUPlace()] - # currently only support CPU - for place in places: - self.check_with_place(place) - - -class TestLookupTableWithTensorIdsWIsSelectedRows( - TestLookupTableWIsSelectedRows -): - def prepare_ids(self, scope, place): - ids_tensor = scope.var('Ids').get_tensor() - ids_array = np.random.randint(low=0, high=6, size=(2, 4, 3, 1)).astype( - "int64" - ) - ids_tensor.set(ids_array, place) - return ids_array - - def check_result(self, ids_array, result_array): - for idx, row in np.ndenumerate(ids_array): - assert (row == result_array[idx]).all() - - -class TestLookupTableOpInt8(OpTest): - def setUp(self): - self.op_type = "lookup_table" - table = np.random.randint(low=-128, high=127, size=(17, 31)).astype( - "int8" - ) - ids = np.random.randint(0, 17, 4).astype("int64") - ids_expand = np.expand_dims(ids, axis=1) - self.inputs = {'W': table, 'Ids': ids_expand} - self.outputs = {'Out': table[ids]} - - def test_check_output(self): - self.check_output(check_cinn=True) - - def test_check_grad(self): - # since int8 type only be used in test and inference, there is - # no gradient implement, so we don't need to test it - pass - - -class TestLookupTableOpWithTensorIdsInt8(OpTest): - 
def setUp(self): - self.op_type = "lookup_table" - table = np.random.randint(low=-128, high=127, size=(17, 31)).astype( - "int8" - ) - ids = np.random.randint(low=0, high=17, size=(2, 4, 5, 1)).astype( - "int64" - ) - self.inputs = {'W': table, 'Ids': ids} - self.outputs = {'Out': table[ids.flatten()].reshape((2, 4, 5, 31))} - - def test_check_output(self): - self.check_output(check_cinn=True) - - def test_check_grad(self): - # since int8 type only be used in test and inference, there is - # no gradient implement, so we don't need to test it - pass - - -class TestLookupTableOpWithPaddingInt8(TestLookupTableOpInt8): - def test_check_output(self): - ids = np.squeeze(self.inputs['Ids']) - padding_idx = np.random.choice(ids, 1)[0] - self.outputs['Out'][ids == padding_idx] = np.zeros(31) - self.attrs = {'padding_idx': int(padding_idx)} - self.check_output(check_cinn=True) - - def test_check_grad(self): - # Since paddings are not trainable and fixed in forward, the gradient of - # paddings makes no sense and we don't test the gradient here. - pass - - -class TestLookupTableOpWithTensorIdsAndPaddingInt8( - TestLookupTableOpWithTensorIdsInt8 -): - def test_check_output(self): - ids = self.inputs['Ids'] - flatten_idx = ids.flatten() - padding_idx = np.random.choice(flatten_idx, 1)[0] - self.outputs['Out'][np.squeeze(ids == padding_idx)] = np.zeros(31) - self.attrs = {'padding_idx': padding_idx} - self.check_output(check_cinn=True) - - def test_check_grad(self): - # Since paddings are not trainable and fixed in forward, the gradient of - # paddings makes no sense and we don't test the gradient here. - pass - - -class TestLookupTableWIsSelectedRowsInt8(unittest.TestCase): - def prepare_ids(self, scope, place): - ids_tensor = scope.var('Ids').get_tensor() - ids_array = np.array([[0], [4], [3], [5]]).astype("int64") - ids_tensor.set(ids_array, place) - return ids_array - - def prepare_w(self, scope, place): - rows = [0, 1, 2, 3, 4, 5, 6] - row_numel = 12 - - w_selected_rows = scope.var('W').get_selected_rows() - w_selected_rows.set_height(len(rows)) - w_selected_rows.set_rows(rows) - w_array = np.ones((len(rows), row_numel)).astype("int8") - for i in range(len(rows)): - w_array[i] *= i - w_tensor = w_selected_rows.get_tensor() - w_tensor.set(w_array, place) - - def create_out_tensor(self, scope, place): - return scope.var('Out').get_tensor() - - def check_result(self, ids_array, result_array): - # all(): return True if all elements of the iterable are true (or if the iterable is empty) - for idx, row in enumerate(ids_array): - assert (row[0] == result_array[idx]).all() - - def check_with_place(self, place): - scope = core.Scope() - - ids_array = self.prepare_ids(scope, place) - - self.prepare_w(scope, place) - - out_tensor = self.create_out_tensor(scope, place) - - # create and run lookup_table operator - lookup_table = Operator("lookup_table", W='W', Ids='Ids', Out='Out') - lookup_table.run(scope, place) - - # get result from Out - result_array = np.array(out_tensor) - - self.check_result(ids_array, result_array) - - def test_w_is_selected_rows(self): - places = [core.CPUPlace()] - # currently only support CPU - for place in places: - self.check_with_place(place) - - -class TestLookupTableWithTensorIdsWIsSelectedRowsInt8( - TestLookupTableWIsSelectedRowsInt8 -): - def prepare_ids(self, scope, place): - ids_tensor = scope.var('Ids').get_tensor() - ids_array = np.random.randint(low=0, high=6, size=(2, 4, 3, 1)).astype( - "int64" - ) - ids_tensor.set(ids_array, place) - return ids_array - - def 
check_result(self, ids_array, result_array): - for idx, row in np.ndenumerate(ids_array): - assert (row == result_array[idx]).all() - - -@skip_check_grad_ci(reason="Int16 type only be used in test and inference.") -class TestLookupTableOpInt16(OpTest): - def setUp(self): - self.op_type = "lookup_table" - table = np.random.randint(low=-128, high=127, size=(17, 31)).astype( - "int16" - ) - ids = np.random.randint(0, 17, 4).astype("int64") - ids_expand = np.expand_dims(ids, axis=1) - self.inputs = {'W': table, 'Ids': ids_expand} - self.outputs = {'Out': table[ids]} - - def test_check_output(self): - self.check_output(check_cinn=True) - - -@skip_check_grad_ci(reason="Int16 type only be used in test and inference.") -class TestLookupTableOpWithTensorIdsInt16(OpTest): - def setUp(self): - self.op_type = "lookup_table" - table = np.random.randint(low=-128, high=127, size=(17, 31)).astype( - "int16" - ) - ids = np.random.randint(low=0, high=17, size=(2, 4, 5, 1)).astype( - "int64" - ) - self.inputs = {'W': table, 'Ids': ids} - self.outputs = {'Out': table[ids.flatten()].reshape((2, 4, 5, 31))} - - def test_check_output(self): - self.check_output(check_cinn=True) - - -@skip_check_grad_ci(reason="Int16 type only be used in test and inference.") -class TestLookupTableOpWithPaddingInt16(TestLookupTableOpInt16): - def test_check_output(self): - ids = np.squeeze(self.inputs['Ids']) - padding_idx = np.random.choice(ids, 1)[0] - self.outputs['Out'][ids == padding_idx] = np.zeros(31) - self.attrs = {'padding_idx': int(padding_idx)} - self.check_output(check_cinn=True) - - -@skip_check_grad_ci(reason="Int16 type only be used in test and inference.") -class TestLookupTableOpWithTensorIdsAndPaddingInt16( - TestLookupTableOpWithTensorIdsInt16 -): - def test_check_output(self): - ids = self.inputs['Ids'] - flatten_idx = ids.flatten() - padding_idx = np.random.choice(flatten_idx, 1)[0] - self.outputs['Out'][np.squeeze(ids == padding_idx)] = np.zeros(31) - self.attrs = {'padding_idx': padding_idx} - self.check_output(check_cinn=True) - - -class TestLookupTableWIsSelectedRowsInt16(unittest.TestCase): - def prepare_ids(self, scope, place): - ids_tensor = scope.var('Ids').get_tensor() - ids_array = np.array([[0], [4], [3], [5]]).astype("int64") - ids_tensor.set(ids_array, place) - return ids_array - - def prepare_w(self, scope, place): - rows = [0, 1, 2, 3, 4, 5, 6] - row_numel = 12 - - w_selected_rows = scope.var('W').get_selected_rows() - w_selected_rows.set_height(len(rows)) - w_selected_rows.set_rows(rows) - w_array = np.ones((len(rows), row_numel)).astype("int16") - for i in range(len(rows)): - w_array[i] *= i - w_tensor = w_selected_rows.get_tensor() - w_tensor.set(w_array, place) - - def create_out_tensor(self, scope, place): - return scope.var('Out').get_tensor() - - def check_result(self, ids_array, result_array): - for idx, row in enumerate(ids_array): - assert (row[0] == result_array[idx]).all() - - def check_with_place(self, place): - scope = core.Scope() - - ids_array = self.prepare_ids(scope, place) - - self.prepare_w(scope, place) - - out_tensor = self.create_out_tensor(scope, place) - - # create and run lookup_table operator - lookup_table = Operator("lookup_table", W='W', Ids='Ids', Out='Out') - lookup_table.run(scope, place) - - # get result from Out - result_array = np.array(out_tensor) - - self.check_result(ids_array, result_array) - - def test_w_is_selected_rows(self): - places = [core.CPUPlace()] - # currently only support CPU - for place in places: - self.check_with_place(place) - - -class 
TestLookupTableWithTensorIdsWIsSelectedRowsInt16( - TestLookupTableWIsSelectedRowsInt16 -): - def prepare_ids(self, scope, place): - ids_tensor = scope.var('Ids').get_tensor() - ids_array = np.random.randint(low=0, high=6, size=(2, 4, 3, 1)).astype( - "int64" - ) - ids_tensor.set(ids_array, place) - return ids_array - - def check_result(self, ids_array, result_array): - for idx, row in np.ndenumerate(ids_array): - assert (row == result_array[idx]).all() - - -class TestOutDtype(unittest.TestCase): - def test_dtype(self): - api_fn = F.embedding - check_out_dtype( - api_fn, - in_specs=[([10, 16], 'int64'), ([100, 64],)], - expect_dtypes=['float32', 'float64'], - target_index=1, - ) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/legacy_test/test_matmul_op.py b/test/legacy_test/test_matmul_op.py deleted file mode 100644 index 2d264bff97c308..00000000000000 --- a/test/legacy_test/test_matmul_op.py +++ /dev/null @@ -1,249 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -from op_test import OpTest, paddle_static_guard - -import paddle -from paddle import base -from paddle.pir_utils import test_with_pir_api - - -def generate_compatible_shapes(dim_X, dim_Y, transpose_X, transpose_Y): - BATCH_SIZE = 2 - M = 3 - N = 4 - K = 5 - if (dim_X == 1 and transpose_X) or (dim_Y == 1 and transpose_Y): - K = 1 - if dim_X == 1: - if transpose_X: - shape_X = [M] - else: - shape_X = [K] - if dim_Y == 1: - if transpose_Y: - shape_Y = [N] - else: - shape_Y = [K] - if dim_X >= 2: - if transpose_X: - shape_X = [K, M] - else: - shape_X = [M, K] - if dim_X == 3: - shape_X = [BATCH_SIZE] + shape_X - if dim_Y >= 2: - if transpose_Y: - shape_Y = [N, K] - else: - shape_Y = [K, N] - if dim_Y == 3: - shape_Y = [BATCH_SIZE] + shape_Y - return shape_X, shape_Y - - -def reference_matmul(X, Y, transpose_X=False, transpose_Y=False): - """Reference forward implementation using np.matmul.""" - # np.matmul does not support the transpose flags, so we manually - # transpose X and Y appropriately. 
- if transpose_X: - if X.ndim == 1: - X = X.reshape((X.size, 1)) - elif X.ndim == 2: - X = X.T - else: - dim = list(range(len(X.shape))) - dim[-1], dim[len(X.shape) - 2] = dim[len(X.shape) - 2], dim[-1] - X = np.transpose(X, tuple(dim)) - if transpose_Y: - if Y.ndim == 1: - Y = Y.reshape((1, Y.size)) - else: - dim = list(range(len(Y.shape))) - dim[-1], dim[len(Y.shape) - 2] = dim[len(Y.shape) - 2], dim[-1] - Y = np.transpose(Y, tuple(dim)) - - Out = np.matmul(X, Y) - return Out - - -class Generator: - def setUp(self): - self.op_type = "matmul" - X = np.random.random(self.shape_X).astype("float32") - Y = np.random.random(self.shape_Y).astype("float32") - Out = reference_matmul(X, Y, self.transpose_X, self.transpose_Y) - self.inputs = {'X': X, 'Y': Y} - self.attrs = { - 'transpose_X': self.transpose_X, - 'transpose_Y': self.transpose_Y, - } - self.outputs = {'Out': Out} - - def test_check_output(self): - self.check_output(check_cinn=True) - - def test_check_grad_normal(self): - self.check_grad( - ['X', 'Y'], - 'Out', - max_relative_error=1e-3, - check_cinn=True, - ) - - def test_check_grad_ignore_x(self): - self.check_grad( - ['Y'], - 'Out', - max_relative_error=1e-3, - no_grad_set=set("X"), - check_cinn=True, - ) - - def test_check_grad_ignore_y(self): - self.check_grad( - ['X'], - 'Out', - max_relative_error=1e-3, - no_grad_set=set('Y'), - check_cinn=True, - ) - - -# Test case n-dim -def generate_compatible_shapes_ndim(dim, transpose_X, transpose_Y): - M = 2 - N = 4 - K = 3 - shape_X = [2 for _ in range(dim - 2)] - shape_Y = [2 for _ in range(dim - 2)] - - if transpose_X: - shape_X += [K, M] - else: - shape_X += [M, K] - - if transpose_Y: - shape_Y += [N, K] - else: - shape_Y += [K, N] - - return shape_X, shape_Y - - -# # Test case n-dim -for dim in [4]: - for transpose_X in [False, True]: - for transpose_Y in [False, True]: - test_name = f'TestMatMulOp_dimX_{dim}_dim_Y_{dim}_transX_{transpose_X}_transY_{transpose_Y}' - shape_X, shape_Y = generate_compatible_shapes_ndim( - dim, transpose_X, transpose_Y - ) - globals()[test_name] = type( - test_name, - (Generator, OpTest), - { - 'shape_X': shape_X, - 'shape_Y': shape_Y, - 'transpose_X': transpose_X, - 'transpose_Y': transpose_Y, - }, - ) - - -class API_TestMm(unittest.TestCase): - @test_with_pir_api - def test_out(self): - with paddle_static_guard(): - with paddle.base.program_guard(paddle.base.Program()): - x = paddle.static.data(name="x", shape=[2], dtype="float64") - y = paddle.static.data(name='y', shape=[2], dtype='float64') - result = paddle.mm(x, y) - exe = base.Executor(base.CPUPlace()) - data1 = np.random.rand(2) - data2 = np.random.rand(2) - np_res = exe.run( - feed={'x': data1, 'y': data2}, fetch_list=[result] - ) - expected_result = np.matmul(data1, data2) - - np.testing.assert_allclose( - np_res, - expected_result, - rtol=1e-05, - atol=1e-05, - err_msg=f'two value is {np_res}\n{expected_result}, check diff!', - ) - - def test_dygraph_without_out(self): - device = base.CPUPlace() - with base.dygraph.guard(device): - input_array1 = np.random.rand(3, 4).astype("float64") - input_array2 = np.random.rand(4, 3).astype("float64") - data1 = paddle.to_tensor(input_array1) - data2 = paddle.to_tensor(input_array2) - out = paddle.mm(data1, data2) - expected_result = np.matmul(input_array1, input_array2) - np.testing.assert_allclose(expected_result, out.numpy(), rtol=1e-05) - - -class Test_API_Matmul(unittest.TestCase): - def test_dygraph_without_out(self): - device = base.CPUPlace() - with base.dygraph.guard(device): - input_array1 = 
np.random.rand(3, 4).astype("float64") - input_array2 = np.random.rand(4, 3).astype("float64") - data1 = paddle.to_tensor(input_array1) - data2 = paddle.to_tensor(input_array2) - out = paddle.matmul(data1, data2) - expected_result = np.matmul(input_array1, input_array2) - np.testing.assert_allclose(expected_result, out.numpy(), rtol=1e-05) - - -class API_TestMmError(unittest.TestCase): - @test_with_pir_api - def test_errors(self): - paddle.enable_static() - with paddle.static.program_guard( - paddle.static.Program(), paddle.static.Program() - ): - - def test_error1(): - data1 = paddle.static.data( - name="data1", shape=[10, 2], dtype="float32" - ) - data2 = paddle.static.data( - name="data2", shape=[3, 10], dtype="float32" - ) - paddle.mm(data1, data2) - - self.assertRaises(ValueError, test_error1) - - def test_error2(): - data3 = paddle.static.data( - name="data3", shape=[10, 10, 2], dtype="float32" - ) - data4 = paddle.static.data( - name="data4", shape=[3, 2, 10], dtype="float32" - ) - paddle.mm(data3, data4) - - self.assertRaises(ValueError, test_error2) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/legacy_test/test_nearest_interp_op.py b/test/legacy_test/test_nearest_interp_op.py deleted file mode 100755 index 3b09cab3eacee0..00000000000000 --- a/test/legacy_test/test_nearest_interp_op.py +++ /dev/null @@ -1,471 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import numpy as np -from op_test import OpTest - -from paddle.base import core - - -def nearest_neighbor_interp_np( - X, - out_h, - out_w, - out_size=None, - actual_shape=None, - align_corners=True, - data_layout='NCHW', -): - """nearest neighbor interpolation implement in shape [N, C, H, W]""" - if data_layout == "NHWC": - X = np.transpose(X, (0, 3, 1, 2)) # NHWC => NCHW - if out_size is not None: - out_h = out_size[0] - out_w = out_size[1] - if actual_shape is not None: - out_h = actual_shape[0] - out_w = actual_shape[1] - n, c, in_h, in_w = X.shape - - ratio_h = ratio_w = 0.0 - if out_h > 1: - if align_corners: - ratio_h = (in_h - 1.0) / (out_h - 1.0) - else: - ratio_h = 1.0 * in_h / out_h - if out_w > 1: - if align_corners: - ratio_w = (in_w - 1.0) / (out_w - 1.0) - else: - ratio_w = 1.0 * in_w / out_w - - out = np.zeros((n, c, out_h, out_w)) - - if align_corners: - for i in range(out_h): - in_i = int(ratio_h * i + 0.5) - for j in range(out_w): - in_j = int(ratio_w * j + 0.5) - out[:, :, i, j] = X[:, :, in_i, in_j] - else: - for i in range(out_h): - in_i = int(ratio_h * i) - for j in range(out_w): - in_j = int(ratio_w * j) - out[:, :, i, j] = X[:, :, in_i, in_j] - - if data_layout == "NHWC": - out = np.transpose(out, (0, 2, 3, 1)) # NCHW => NHWC - - return out.astype(X.dtype) - - -class TestNearestInterpOp(OpTest): - def setUp(self): - self.out_size = None - self.actual_shape = None - self.data_layout = 'NCHW' - self.init_test_case() - self.op_type = "nearest_interp" - input_np = np.random.random(self.input_shape).astype("float64") - - if self.data_layout == "NCHW": - in_h = self.input_shape[2] - in_w = self.input_shape[3] - else: - in_h = self.input_shape[1] - in_w = self.input_shape[2] - - if self.scale > 0: - out_h = int(in_h * self.scale) - out_w = int(in_w * self.scale) - else: - out_h = self.out_h - out_w = self.out_w - - output_np = nearest_neighbor_interp_np( - input_np, - out_h, - out_w, - self.out_size, - self.actual_shape, - self.align_corners, - self.data_layout, - ) - self.inputs = {'X': input_np} - if self.out_size is not None: - self.inputs['OutSize'] = self.out_size - if self.actual_shape is not None: - self.inputs['OutSize'] = self.actual_shape - self.attrs = { - 'out_h': self.out_h, - 'out_w': self.out_w, - 'scale': self.scale, - 'interp_method': self.interp_method, - 'align_corners': self.align_corners, - 'data_layout': self.data_layout, - } - self.outputs = {'Out': output_np} - - def test_check_output(self): - self.check_output(check_dygraph=False) - - def test_check_grad(self): - self.check_grad(['X'], 'Out', in_place=True, check_dygraph=False) - - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [2, 3, 4, 5] - self.out_h = 2 - self.out_w = 2 - self.scale = 0.0 - self.out_size = np.array([3, 3]).astype("int32") - self.align_corners = True - - -class TestNearestNeighborInterpCase1(TestNearestInterpOp): - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [4, 1, 7, 8] - self.out_h = 1 - self.out_w = 1 - self.scale = 0.0 - self.align_corners = True - - -class TestNearestNeighborInterpCase2(TestNearestInterpOp): - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [3, 3, 9, 6] - self.out_h = 12 - self.out_w = 12 - self.scale = 0.0 - self.align_corners = True - - -class TestNearestNeighborInterpCase3(TestNearestInterpOp): - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [1, 1, 32, 64] - self.out_h = 64 - self.out_w = 32 - 
self.scale = 0.0 - self.align_corners = True - - -class TestNearestNeighborInterpCase4(TestNearestInterpOp): - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [4, 1, 7, 8] - self.out_h = 1 - self.out_w = 1 - self.scale = 0.0 - self.out_size = np.array([2, 2]).astype("int32") - self.align_corners = True - - -class TestNearestNeighborInterpCase5(TestNearestInterpOp): - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [3, 3, 9, 6] - self.out_h = 12 - self.out_w = 12 - self.scale = 0.0 - self.out_size = np.array([11, 11]).astype("int32") - self.align_corners = True - - -class TestNearestNeighborInterpCase6(TestNearestInterpOp): - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [1, 1, 32, 64] - self.out_h = 64 - self.out_w = 32 - self.scale = 0.0 - self.out_size = np.array([65, 129]).astype("int32") - self.align_corners = True - - -class TestNearestNeighborInterpSame(TestNearestInterpOp): - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [2, 3, 32, 64] - self.out_h = 32 - self.out_w = 64 - self.scale = 0.0 - self.align_corners = True - - -class TestNearestNeighborInterpActualShape(TestNearestInterpOp): - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [3, 2, 32, 16] - self.out_h = 64 - self.out_w = 32 - self.scale = 0.0 - self.out_size = np.array([66, 40]).astype("int32") - self.align_corners = True - - -class TestNearestNeighborInterpDataLayout(TestNearestInterpOp): - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [2, 4, 4, 5] - self.out_h = 2 - self.out_w = 2 - self.scale = 0.0 - self.out_size = np.array([3, 8]).astype("int32") - self.align_corners = True - self.data_layout = "NHWC" - - -class TestNearestInterpOpUint8(OpTest): - def setUp(self): - self.out_size = None - self.actual_shape = None - self.init_test_case() - self.op_type = "nearest_interp" - input_np = np.random.randint( - low=0, high=256, size=self.input_shape - ).astype("uint8") - - if self.scale > 0: - out_h = int(self.input_shape[2] * self.scale) - out_w = int(self.input_shape[3] * self.scale) - else: - out_h = self.out_h - out_w = self.out_w - - output_np = nearest_neighbor_interp_np( - input_np, - out_h, - out_w, - self.out_size, - self.actual_shape, - self.align_corners, - ) - self.inputs = {'X': input_np} - if self.out_size is not None: - self.inputs['OutSize'] = self.out_size - self.attrs = { - 'out_h': self.out_h, - 'out_w': self.out_w, - 'scale': self.scale, - 'interp_method': self.interp_method, - 'align_corners': self.align_corners, - } - self.outputs = {'Out': output_np} - - def test_check_output(self): - self.check_output_with_place( - place=core.CPUPlace(), atol=1, check_dygraph=False - ) - - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [1, 3, 9, 6] - self.out_h = 10 - self.out_w = 9 - self.scale = 0.0 - self.align_corners = True - - -class TestNearestNeighborInterpCase1Uint8(TestNearestInterpOpUint8): - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [2, 3, 32, 64] - self.out_h = 80 - self.out_w = 40 - self.scale = 0.0 - self.align_corners = True - - -class TestNearestNeighborInterpCase2Uint8(TestNearestInterpOpUint8): - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [4, 1, 7, 8] - self.out_h = 5 - self.out_w = 13 - self.scale = 0.0 - self.out_size = np.array([6, 15]).astype("int32") - self.align_corners = True - - -class 
TestNearestInterpWithoutCorners(TestNearestInterpOp): - def set_align_corners(self): - self.align_corners = False - - -class TestNearestNeighborInterpScale1(TestNearestInterpOp): - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [3, 2, 7, 5] - self.out_h = 64 - self.out_w = 32 - self.scale = 2.0 - self.out_size = np.array([66, 40]).astype("int32") - self.align_corners = True - - -class TestNearestNeighborInterpScale2(TestNearestInterpOp): - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [3, 2, 5, 7] - self.out_h = 64 - self.out_w = 32 - self.scale = 1.5 - self.out_size = np.array([66, 40]).astype("int32") - self.align_corners = True - - -class TestNearestNeighborInterpScale3(TestNearestInterpOp): - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [3, 2, 7, 5] - self.out_h = 64 - self.out_w = 32 - self.scale = 1.0 - self.out_size = np.array([66, 40]).astype("int32") - self.align_corners = True - - -class TestNearestInterpOp_attr_tensor(OpTest): - def setUp(self): - self.out_size = None - self.actual_shape = None - self.shape_by_1Dtensor = False - self.scale_by_1Dtensor = False - self.scale_by_2Dtensor = False - self.init_test_case() - self.op_type = "nearest_interp_v2" - self.attrs = { - 'interp_method': self.interp_method, - 'align_corners': self.align_corners, - } - - input_np = np.random.random(self.input_shape).astype("float64") - self.inputs = {'X': input_np} - - if self.scale_by_1Dtensor: - self.inputs['Scale'] = np.array([self.scale]).astype("float32") - out_h = int(self.input_shape[2] * self.scale) - out_w = int(self.input_shape[3] * self.scale) - elif self.scale_by_2Dtensor: - self.inputs['Scale'] = np.array(self.scale).astype("float32") - out_h = int(self.input_shape[2] * self.scale[0]) - out_w = int(self.input_shape[3] * self.scale[1]) - elif self.scale > 0: - out_h = int(self.input_shape[2] * self.scale) - out_w = int(self.input_shape[3] * self.scale) - self.attrs['scale'] = self.scale - else: - out_h = self.out_h - out_w = self.out_w - - if self.shape_by_1Dtensor: - self.inputs['OutSize'] = self.out_size - elif self.out_size is not None: - size_tensor = [] - for index, ele in enumerate(self.out_size): - size_tensor.append( - ("x" + str(index), np.ones(1).astype('int32') * ele) - ) - self.inputs['SizeTensor'] = size_tensor - - self.attrs['out_h'] = self.out_h - self.attrs['out_w'] = self.out_w - output_np = nearest_neighbor_interp_np( - input_np, - out_h, - out_w, - self.out_size, - self.actual_shape, - self.align_corners, - ) - self.outputs = {'Out': output_np} - - def test_check_output(self): - self.check_output(check_dygraph=False) - - def test_check_grad(self): - self.check_grad(['X'], 'Out', in_place=True, check_dygraph=False) - - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [2, 5, 4, 4] - self.out_h = 3 - self.out_w = 3 - self.scale = 0.0 - self.out_size = [3, 3] - self.align_corners = True - - -# out_size is a tensor list -class TestNearestInterp_attr_tensor_Case1(TestNearestInterpOp_attr_tensor): - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [3, 3, 9, 6] - self.out_h = 12 - self.out_w = 12 - self.scale = 0.0 - self.out_size = [8, 12] - self.align_corners = True - - -# out_size is a 1-D tensor -class TestNearestInterp_attr_tensor_Case2(TestNearestInterpOp_attr_tensor): - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [3, 2, 32, 16] - self.out_h = 64 - self.out_w = 32 - 
self.scale = 0.0 - self.out_size = np.array([66, 40]).astype("int32") - self.align_corners = True - self.shape_by_1Dtensor = True - - -# scale is a 1-D tensor -class TestNearestInterp_attr_tensor_Case3(TestNearestInterpOp_attr_tensor): - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [3, 2, 32, 16] - self.out_h = 64 - self.out_w = 32 - self.scale = 2.0 - self.out_size = None - self.align_corners = True - self.scale_by_1Dtensor = True - - -# scale is a 2-D tensor -class TestNearestInterp_attr_tensor_Case4(TestNearestInterpOp_attr_tensor): - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [3, 2, 32, 16] - self.out_h = 64 - self.out_w = 32 - self.scale = [2.0, 2.0] - self.out_size = None - self.align_corners = True - self.scale_by_2Dtensor = True - - -if __name__ == "__main__": - import paddle - - paddle.enable_static() - unittest.main() diff --git a/test/legacy_test/test_trilinear_interp_op.py b/test/legacy_test/test_trilinear_interp_op.py deleted file mode 100755 index d83c1987e700c6..00000000000000 --- a/test/legacy_test/test_trilinear_interp_op.py +++ /dev/null @@ -1,613 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import numpy as np -from op_test import OpTest - -from paddle.base import core - - -def trilinear_interp_np( - input, - out_d, - out_h, - out_w, - out_size=None, - actual_shape=None, - align_corners=True, - align_mode=0, - data_layout='NCDHW', -): - """trilinear interpolation implement in shape [N, C, D, H, W]""" - if data_layout == "NDHWC": - input = np.transpose(input, (0, 4, 1, 2, 3)) # NDHWC => NCDHW - if out_size is not None: - out_d = out_size[0] - out_h = out_size[1] - out_w = out_size[2] - if actual_shape is not None: - out_d = actual_shape[0] - out_h = actual_shape[1] - out_w = actual_shape[2] - batch_size, channel, in_d, in_h, in_w = input.shape - - ratio_d = ratio_h = ratio_w = 0.0 - if out_d > 1: - if align_corners: - ratio_d = (in_d - 1.0) / (out_d - 1.0) - else: - ratio_d = 1.0 * in_d / out_d - if out_h > 1: - if align_corners: - ratio_h = (in_h - 1.0) / (out_h - 1.0) - else: - ratio_h = 1.0 * in_h / out_h - if out_w > 1: - if align_corners: - ratio_w = (in_w - 1.0) / (out_w - 1.0) - else: - ratio_w = 1.0 * in_w / out_w - - out = np.zeros((batch_size, channel, out_d, out_h, out_w)) - - for i in range(out_d): - if align_mode == 0 and not align_corners: - d = int(ratio_d * (i + 0.5) - 0.5) - else: - d = int(ratio_d * i) - - d = max(0, d) - did = 1 if d < in_d - 1 else 0 - if align_mode == 0 and not align_corners: - idx_src_d = max(ratio_d * (i + 0.5) - 0.5, 0) - d1lambda = idx_src_d - d - else: - d1lambda = ratio_d * i - d - d2lambda = 1.0 - d1lambda - - for j in range(out_h): - if align_mode == 0 and not align_corners: - h = int(ratio_h * (j + 0.5) - 0.5) - else: - h = int(ratio_h * j) - - h = max(0, h) - hid = 1 if h < in_h - 1 else 0 - if align_mode == 0 and not align_corners: - idx_src_h = max(ratio_h * (j + 0.5) - 0.5, 0) - h1lambda = idx_src_h - h - else: - h1lambda = ratio_h * j - h - h2lambda = 1.0 - h1lambda - - for k in range(out_w): - if align_mode == 0 and not align_corners: - w = int(ratio_w * (k + 0.5) - 0.5) - else: - w = int(ratio_w * k) - w = max(0, w) - wid = 1 if w < in_w - 1 else 0 - if align_mode == 0 and not align_corners: - idx_src_w = max(ratio_w * (k + 0.5) - 0.5, 0) - w1lambda = idx_src_w - w - else: - w1lambda = ratio_w * k - w - w2lambda = 1.0 - w1lambda - - out[:, :, i, j, k] = d2lambda * ( - h2lambda - * ( - w2lambda * input[:, :, d, h, w] - + w1lambda * input[:, :, d, h, w + wid] - ) - + h1lambda - * ( - w2lambda * input[:, :, d, h + hid, w] - + w1lambda * input[:, :, d, h + hid, w + wid] - ) - ) + d1lambda * ( - h2lambda - * ( - w2lambda * input[:, :, d + did, h, w] - + w1lambda * input[:, :, d + did, h, w + wid] - ) - + h1lambda - * ( - w2lambda * input[:, :, d + did, h + hid, w] - + w1lambda * input[:, :, d + did, h + hid, w + wid] - ) - ) - if data_layout == "NDHWC": - out = np.transpose(out, (0, 2, 3, 4, 1)) # NCDHW => NDHWC - - return out.astype(input.dtype) - - -class TestTrilinearInterpOp(OpTest): - def setUp(self): - self.out_size = None - self.actual_shape = None - self.data_layout = 'NCDHW' - self.init_test_case() - self.op_type = "trilinear_interp" - # NOTE(dev): some AsDispensible input is not used under imperative mode. 
- input_np = np.random.random(self.input_shape).astype("float32") - - if self.data_layout == "NCDHW": - in_d = self.input_shape[2] - in_h = self.input_shape[3] - in_w = self.input_shape[4] - else: - in_d = self.input_shape[1] - in_h = self.input_shape[2] - in_w = self.input_shape[3] - - if self.scale > 0: - out_d = int(in_d * self.scale) - out_h = int(in_h * self.scale) - out_w = int(in_w * self.scale) - else: - out_d = self.out_d - out_h = self.out_h - out_w = self.out_w - - output_np = trilinear_interp_np( - input_np, - out_d, - out_h, - out_w, - self.out_size, - self.actual_shape, - self.align_corners, - self.align_mode, - self.data_layout, - ) - self.inputs = {'X': input_np} - if self.out_size is not None: - self.inputs['OutSize'] = self.out_size - if self.actual_shape is not None: - self.inputs['OutSize'] = self.actual_shape - # c++ end treat NCDHW the same way as NCHW - if self.data_layout == 'NCDHW': - data_layout = 'NCHW' - else: - data_layout = 'NHWC' - self.attrs = { - 'out_d': self.out_d, - 'out_h': self.out_h, - 'out_w': self.out_w, - 'scale': self.scale, - 'interp_method': self.interp_method, - 'align_corners': self.align_corners, - 'align_mode': self.align_mode, - 'data_layout': data_layout, - } - self.outputs = {'Out': output_np} - - def test_check_output(self): - # NODE(yjjiang11): This op will be deprecated. - self.check_output(check_dygraph=False) - - def test_check_grad(self): - self.check_grad(['X'], 'Out', in_place=True, check_dygraph=False) - - def init_test_case(self): - self.interp_method = 'trilinear' - self.input_shape = [2, 3, 4, 4, 4] - self.out_d = 2 - self.out_h = 2 - self.out_w = 2 - self.scale = 0.0 - self.out_size = np.array([3, 3, 3]).astype("int32") - self.align_corners = True - self.align_mode = 1 - - -class TestTrilinearInterpCase1(TestTrilinearInterpOp): - def init_test_case(self): - self.interp_method = 'trilinear' - self.input_shape = [2, 1, 7, 8, 9] - self.out_d = 1 - self.out_h = 1 - self.out_w = 1 - self.scale = 0.0 - self.align_corners = True - self.align_mode = 1 - - -class TestTrilinearInterpCase2(TestTrilinearInterpOp): - def init_test_case(self): - self.interp_method = 'trilinear' - self.input_shape = [2, 3, 9, 6, 8] - self.out_d = 12 - self.out_h = 12 - self.out_w = 12 - self.scale = 0.0 - self.align_corners = True - self.align_mode = 1 - - -class TestTrilinearInterpCase3(TestTrilinearInterpOp): - def init_test_case(self): - self.interp_method = 'trilinear' - self.input_shape = [3, 2, 16, 8, 4] - self.out_d = 32 - self.out_h = 16 - self.out_w = 8 - self.scale = 0.0 - self.align_corners = True - self.align_mode = 1 - - -class TestTrilinearInterpCase4(TestTrilinearInterpOp): - def init_test_case(self): - self.interp_method = 'trilinear' - self.input_shape = [4, 1, 7, 8, 9] - self.out_d = 1 - self.out_h = 1 - self.out_w = 1 - self.scale = 0.0 - self.out_size = np.array([2, 2, 2]).astype("int32") - self.align_corners = True - self.align_mode = 1 - - -class TestTrilinearInterpCase5(TestTrilinearInterpOp): - def init_test_case(self): - self.interp_method = 'trilinear' - self.input_shape = [3, 3, 9, 6, 8] - self.out_d = 12 - self.out_h = 12 - self.out_w = 12 - self.scale = 0.0 - self.out_size = np.array([11, 11, 11]).astype("int32") - self.align_corners = True - self.align_mode = 1 - - -class TestTrilinearInterpCase6(TestTrilinearInterpOp): - def init_test_case(self): - self.interp_method = 'trilinear' - self.input_shape = [1, 1, 16, 8, 4] - self.out_d = 8 - self.out_h = 32 - self.out_w = 16 - self.scale = 0.0 - self.out_size = np.array([17, 9, 
5]).astype("int32") - self.align_corners = True - self.align_mode = 1 - - -class TestTrilinearInterpSame(TestTrilinearInterpOp): - def init_test_case(self): - self.interp_method = 'trilinear' - self.input_shape = [1, 1, 16, 8, 4] - self.out_d = 16 - self.out_h = 8 - self.out_w = 4 - self.scale = 0.0 - self.align_corners = True - self.align_mode = 1 - - -class TestTrilinearInterpSameHW(TestTrilinearInterpOp): - def init_test_case(self): - self.interp_method = 'trilinear' - self.input_shape = [1, 1, 16, 8, 4] - self.out_d = 8 - self.out_h = 8 - self.out_w = 4 - self.scale = 0.0 - self.align_corners = True - self.align_mode = 1 - - -class TestTrilinearInterpActualShape(TestTrilinearInterpOp): - def init_test_case(self): - self.interp_method = 'trilinear' - self.input_shape = [3, 2, 16, 8, 4] - self.out_d = 64 - self.out_h = 32 - self.out_w = 16 - self.scale = 0.0 - self.out_size = np.array([33, 19, 7]).astype("int32") - self.align_corners = True - self.align_mode = 1 - - -class TestTrilinearInterpDatalayout(TestTrilinearInterpOp): - def init_test_case(self): - self.interp_method = 'trilinear' - self.input_shape = [2, 4, 4, 4, 3] - self.out_d = 2 - self.out_h = 2 - self.out_w = 2 - self.scale = 0.0 - self.out_size = np.array([3, 3, 3]).astype("int32") - self.align_corners = True - self.align_mode = 1 - self.data_layout = "NDHWC" - - -class TestTrilinearInterpOpUint8(OpTest): - def setUp(self): - self.out_size = None - self.actual_shape = None - self.init_test_case() - self.op_type = "trilinear_interp" - input_np = np.random.randint( - low=0, high=256, size=self.input_shape - ).astype("uint8") - - if self.scale > 0: - out_d = int(self.input_shape[2] * self.scale) - out_h = int(self.input_shape[3] * self.scale) - out_w = int(self.input_shape[4] * self.scale) - else: - out_d = self.out_d - out_h = self.out_h - out_w = self.out_w - - output_np = trilinear_interp_np( - input_np, - out_d, - out_h, - out_w, - self.out_size, - self.actual_shape, - self.align_corners, - self.align_mode, - ) - self.inputs = {'X': input_np} - if self.out_size is not None: - self.inputs['OutSize'] = self.out_size - - self.attrs = { - 'out_d': self.out_d, - 'out_h': self.out_h, - 'out_w': self.out_w, - 'scale': self.scale, - 'interp_method': self.interp_method, - 'align_corners': self.align_corners, - 'align_mode': self.align_mode, - } - self.outputs = {'Out': output_np} - - def test_check_output(self): - self.check_output_with_place( - place=core.CPUPlace(), atol=1, check_dygraph=False - ) - - def init_test_case(self): - self.interp_method = 'trilinear' - self.input_shape = [1, 3, 9, 6, 8] - self.out_d = 13 - self.out_h = 10 - self.out_w = 9 - self.scale = 0.0 - self.align_corners = True - self.align_mode = 1 - - -class TestTrilinearInterpCase1Uint8(TestTrilinearInterpOpUint8): - def init_test_case(self): - self.interp_method = 'trilinear' - self.input_shape = [2, 3, 16, 8, 4] - self.out_d = 13 - self.out_h = 7 - self.out_w = 2 - self.scale = 0.0 - self.align_corners = True - self.align_mode = 1 - - -class TestTrilinearInterpCase2Uint8(TestTrilinearInterpOpUint8): - def init_test_case(self): - self.interp_method = 'trilinear' - self.input_shape = [4, 1, 7, 8, 9] - self.out_d = 3 - self.out_h = 5 - self.out_w = 13 - self.scale = 0.0 - self.out_size = np.array([6, 15, 21]).astype("int32") - self.align_corners = True - self.align_mode = 1 - - -class TestTrilinearInterpOtherMethod1(TestTrilinearInterpOp): - def set_align_mode(self): - self.align_corners = False - self.align_mode = 1 - - -class 
TestTrilinearInterpWithMethod2(TestTrilinearInterpOp): - def set_align_mode(self): - self.align_corners = False - self.align_mode = 0 - - -class TestTrilinearInterpWithMethod3(TestTrilinearInterpOp): - def set_align_mode(self): - self.align_corners = True - self.align_mode = 0 - - -class TestTrilinearInterpScale1(TestTrilinearInterpOp): - def init_test_case(self): - self.interp_method = 'trilinear' - self.input_shape = [2, 3, 5, 7, 9] - self.out_d = 82 - self.out_h = 60 - self.out_w = 25 - self.scale = 2.0 - self.align_corners = True - self.align_mode = 1 - - -class TestTrilinearInterpScale2(TestTrilinearInterpOp): - def init_test_case(self): - self.interp_method = 'trilinear' - self.input_shape = [2, 3, 5, 7, 9] - self.out_d = 60 - self.out_h = 40 - self.out_w = 25 - self.scale = 1.0 - self.align_corners = True - self.align_mode = 1 - - -class TestTrilinearInterpScale3(TestTrilinearInterpOp): - def init_test_case(self): - self.interp_method = 'trilinear' - self.input_shape = [2, 3, 5, 7, 9] - self.out_d = 60 - self.out_h = 40 - self.out_w = 25 - self.scale = 1.5 - self.align_corners = True - self.align_mode = 1 - - -class TestTrilinearInterpZero(TestTrilinearInterpOp): - def init_test_case(self): - self.interp_method = 'trilinear' - self.input_shape = [2, 3, 5, 7, 11] - self.out_d = 60 - self.out_h = 40 - self.out_w = 25 - self.scale = 0.2 - self.align_corners = False - self.align_mode = 0 - - -class TestTrilinearInterpOp_attr_tensor(OpTest): - def setUp(self): - self.out_size = None - self.actual_shape = None - self.init_test_case() - self.op_type = "trilinear_interp" - self.shape_by_1Dtensor = False - self.scale_by_1Dtensor = False - self.attrs = { - 'interp_method': self.interp_method, - 'align_corners': self.align_corners, - 'align_mode': self.align_mode, - } - - input_np = np.random.random(self.input_shape).astype("float32") - self.inputs = {'X': input_np} - - if self.scale_by_1Dtensor: - self.inputs['Scale'] = np.array([self.scale]).astype("float32") - elif self.scale > 0: - out_d = int(self.input_shape[2] * self.scale) - out_h = int(self.input_shape[3] * self.scale) - out_w = int(self.input_shape[4] * self.scale) - self.attrs['scale'] = self.scale - else: - out_d = self.out_d - out_h = self.out_h - out_w = self.out_w - - if self.shape_by_1Dtensor: - self.inputs['OutSize'] = self.out_size - elif self.out_size is not None: - size_tensor = [] - for index, ele in enumerate(self.out_size): - size_tensor.append( - ("x" + str(index), np.ones(1).astype('int32') * ele) - ) - self.inputs['SizeTensor'] = size_tensor - - self.attrs['out_d'] = self.out_d - self.attrs['out_h'] = self.out_h - self.attrs['out_w'] = self.out_w - output_np = trilinear_interp_np( - input_np, - out_d, - out_h, - out_w, - self.out_size, - self.actual_shape, - self.align_corners, - self.align_mode, - ) - self.outputs = {'Out': output_np} - - def test_check_output(self): - self.check_output(check_dygraph=False) - - def test_check_grad(self): - self.check_grad(['X'], 'Out', in_place=True, check_dygraph=False) - - def init_test_case(self): - self.interp_method = 'trilinear' - self.input_shape = [2, 3, 4, 4, 4] - self.out_d = 2 - self.out_h = 3 - self.out_w = 3 - self.scale = 0.0 - self.out_size = [2, 3, 3] - self.align_corners = True - self.align_mode = 1 - - -# out_size is a 1-D tensor -class TestTrilinearInterp_attr_tensor_Case1(TestTrilinearInterpOp_attr_tensor): - def init_test_case(self): - self.interp_method = 'trilinear' - self.input_shape = [3, 2, 9, 6, 8] - self.out_d = 32 - self.out_h = 16 - self.out_w = 8 - 
self.scale = 0.3 - self.out_size = [12, 4, 4] - self.align_corners = True - self.align_mode = 1 - - -# scale is a 1-D tensor -class TestTrilinearInterp_attr_tensor_Case2(TestTrilinearInterpOp_attr_tensor): - def init_test_case(self): - self.interp_method = 'trilinear' - self.input_shape = [2, 3, 8, 8, 4] - self.out_d = 16 - self.out_h = 12 - self.out_w = 4 - self.scale = 0.0 - self.out_size = [16, 4, 10] - self.align_corners = True - self.align_mode = 1 - self.shape_by_1Dtensor = True - - -# scale is a 1-D tensor -class TestTrilinearInterp_attr_tensor_Case3(TestTrilinearInterpOp_attr_tensor): - def init_test_case(self): - self.interp_method = 'trilinear' - self.input_shape = [2, 3, 8, 8, 4] - self.out_d = 16 - self.out_h = 16 - self.out_w = 8 - self.scale = 2.0 - self.out_size = None - self.align_corners = True - self.align_mode = 1 - self.scale_by_1Dtensor = True - - -if __name__ == "__main__": - unittest.main() diff --git a/test/mkldnn/test_bilinear_interp_mkldnn_op.py b/test/mkldnn/test_bilinear_interp_mkldnn_op.py deleted file mode 100644 index 023b07d9ef4679..00000000000000 --- a/test/mkldnn/test_bilinear_interp_mkldnn_op.py +++ /dev/null @@ -1,204 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import math -import unittest - -import numpy as np -from op_test import OpTest, skip_check_grad_ci - - -def bilinear_interp_mkldnn_np( - input, out_h, out_w, out_size=None, actual_shape=None, data_layout='NCHW' -): - """bilinear interpolation implement in shape [N, C, H, W]""" - if data_layout == "NHWC": - input = np.transpose(input, (0, 3, 1, 2)) # NHWC => NCHW - if out_size is not None: - out_h = out_size[0] - out_w = out_size[1] - if actual_shape is not None: - out_h = actual_shape[0] - out_w = actual_shape[1] - batch_size, channel, in_h, in_w = input.shape - - out = np.zeros((batch_size, channel, out_h, out_w)) - - for oh in range(out_h): - h0 = int(math.floor((oh + 0.5) * in_h / out_h - 0.5)) - h1 = int(math.ceil((oh + 0.5) * in_h / out_h - 0.5)) - h0 = max(h0, 0) - h1 = min(h1, in_h - 1) - Wh = (oh + 0.5) * in_h / out_h - 0.5 - h0 - for ow in range(out_w): - w0 = int(math.floor((ow + 0.5) * in_w / out_w - 0.5)) - w1 = int(math.ceil((ow + 0.5) * in_w / out_w - 0.5)) - w0 = max(w0, 0) - w1 = min(w1, in_w - 1) - Ww = (ow + 0.5) * in_w / out_w - 0.5 - w0 - input_h0_w0 = input[:, :, h0, w0] - input_h1_w0 = input[:, :, h1, w0] - input_h0_w1 = input[:, :, h0, w1] - input_h1_w1 = input[:, :, h1, w1] - out[:, :, oh, ow] = ( - input_h0_w0 * (1 - Wh) * (1 - Ww) - + input_h1_w0 * Wh * (1 - Ww) - + input_h0_w1 * (1 - Wh) * Ww - + input_h1_w1 * Wh * Ww - ) - - if data_layout == "NHWC": - out = np.transpose(out, (0, 2, 3, 1)) # NCHW => NHWC - - return out.astype(input.dtype) - - -@skip_check_grad_ci(reason="Haven not implement interpolate grad kernel.") -class TestBilinearInterpMKLDNNOp(OpTest): - def init_test_case(self): - pass - - def setUp(self): - self.op_type = "bilinear_interp" - self.interp_method = 'bilinear' - self._cpu_only = True - self.use_mkldnn = True - self.input_shape = [1, 1, 2, 2] - self.data_layout = 'NCHW' - # priority: actual_shape > out_size > scale > out_h & out_w - self.out_h = 1 - self.out_w = 1 - self.scale = 2.0 - self.out_size = None - self.actual_shape = None - - self.init_test_case() - - input_np = np.random.random(self.input_shape).astype("float32") - if self.data_layout == "NCHW": - in_h = self.input_shape[2] - in_w = self.input_shape[3] - else: - in_h = self.input_shape[1] - in_w = self.input_shape[2] - - if self.scale > 0: - out_h = int(in_h * self.scale) - out_w = int(in_w * self.scale) - else: - out_h = self.out_h - out_w = self.out_w - - output_np = bilinear_interp_mkldnn_np( - input_np, - out_h, - out_w, - self.out_size, - self.actual_shape, - self.data_layout, - ) - - self.inputs = {'X': input_np} - if self.out_size is not None: - self.inputs['OutSize'] = self.out_size - if self.actual_shape is not None: - self.inputs['OutSize'] = self.actual_shape - self.attrs = { - 'interp_method': self.interp_method, - 'out_h': self.out_h, - 'out_w': self.out_w, - 'scale': self.scale, - 'data_layout': self.data_layout, - 'use_mkldnn': self.use_mkldnn, - } - self.outputs = {'Out': output_np} - - def test_check_output(self): - self.check_output(check_dygraph=False) - - -class TestBilinearInterpOpMKLDNNNHWC(TestBilinearInterpMKLDNNOp): - def init_test_case(self): - self.input_shape = [3, 2, 32, 16] - self.out_h = 27 - self.out_w = 49 - self.scale = 2.0 - self.data_layout = 'NHWC' - - -class TestBilinearNeighborInterpMKLDNNCase2(TestBilinearInterpMKLDNNOp): - def init_test_case(self): - self.input_shape = [3, 3, 9, 6] - self.out_h = 12 - self.out_w = 12 - self.scale = 1.0 - - -class TestBilinearNeighborInterpDataLayout(TestBilinearInterpMKLDNNOp): - def 
init_test_case(self): - self.input_shape = [2, 4, 4, 5] - self.out_h = 6 - self.out_w = 7 - self.scale = 0.0 - self.data_layout = "NHWC" - - -class TestBilinearNeighborInterpCase3(TestBilinearInterpMKLDNNOp): - def init_test_case(self): - self.input_shape = [1, 1, 32, 64] - self.out_h = 64 - self.out_w = 128 - self.scale = 0.0 - - -class TestBilinearNeighborInterpCase4(TestBilinearInterpMKLDNNOp): - def init_test_case(self): - self.input_shape = [4, 1, 7, 8] - self.out_h = 1 - self.out_w = 1 - self.scale = 0.0 - self.out_size = np.array([2, 2]).astype("int32") - - -class TestBilinearNeighborInterpCase5(TestBilinearInterpMKLDNNOp): - def init_test_case(self): - self.input_shape = [1, 1, 9, 6] - self.out_h = 12 - self.out_w = 12 - self.scale = 0.0 - self.out_size = np.array([13, 13]).astype("int32") - - -class TestBilinearNeighborInterpCase6(TestBilinearInterpMKLDNNOp): - def init_test_case(self): - self.input_shape = [1, 1, 32, 64] - self.out_h = 64 - self.out_w = 32 - self.scale = 0.0 - self.out_size = np.array([65, 129]).astype("int32") - - -class TestBilinearNeighborInterpSame(TestBilinearInterpMKLDNNOp): - def init_test_case(self): - self.input_shape = [2, 3, 32, 64] - self.out_h = 32 - self.out_w = 64 - self.scale = 0.0 - - -if __name__ == "__main__": - from paddle import enable_static - - enable_static() - unittest.main() diff --git a/test/mkldnn/test_matmul_mkldnn_op.py b/test/mkldnn/test_matmul_mkldnn_op.py deleted file mode 100644 index 85a6d79de97592..00000000000000 --- a/test/mkldnn/test_matmul_mkldnn_op.py +++ /dev/null @@ -1,260 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import unittest - -import numpy as np -from op_test import OpTest - - -class TestDnnlMatMulOp(OpTest): - def generate_data(self): - self.x = np.random.random((25, 2, 2)).astype("float32") - self.y = np.random.random((25, 2, 2)).astype("float32") - self.alpha = 1.0 - self.out = self.alpha * np.matmul(self.x, self.y) - - def set_attributes(self): - self.alpha = self.alpha if hasattr(self, 'alpha') else 1.0 - self.attrs = {'alpha': self.alpha} - - def setUp(self): - # Set max isa, otherwise fails on SKX and earlier - os.environ["DNNL_MAX_CPU_ISA"] = "AVX" - self.op_type = "matmul" - self._cpu_only = True - self.use_mkldnn = True - self.generate_data() - self.set_attributes() - self.attrs['use_mkldnn'] = True - - self.inputs = {'X': self.x, 'Y': self.y} - self.outputs = {'Out': self.out} - - def test_check_output(self): - self.check_output() - - -class TestDnnlMatMulWithGradOp(TestDnnlMatMulOp): - def test_check_grad(self): - self.check_grad(['X', 'Y'], 'Out', max_relative_error=1e-2) - - -class TestDnnlMatMulOpMixedDims1(TestDnnlMatMulWithGradOp): - def generate_data(self): - self.x = np.random.random((17, 2, 3)).astype("float32") - self.y = np.random.random((3, 4)).astype("float32") - self.out = np.matmul(self.x, self.y) - - -class TestDnnlMatMulOpMixedDimsYWiderTransposeY(TestDnnlMatMulWithGradOp): - def generate_data(self): - self.x = np.random.random((8, 2, 3)).astype("float32") - self.y = np.random.random((4, 3)).astype("float32") - self.out = np.matmul(self.x, np.transpose(self.y)) - - def set_attributes(self): - self.attrs = {'transpose_Y': True} - - -class TestDnnlMatMulOpMixedDimsYWiderTransposeX(TestDnnlMatMulWithGradOp): - def generate_data(self): - self.x = np.random.random((8, 3, 2)).astype("float32") - self.y = np.random.random((3, 4)).astype("float32") - self.out = np.matmul(np.transpose(self.x, (0, 2, 1)), self.y) - - def set_attributes(self): - self.attrs = {'transpose_X': True} - - -class TestDnnlMatMulOpMixedDimsXWiderTransposeXY(TestDnnlMatMulWithGradOp): - def generate_data(self): - self.x = np.random.random((8, 3, 2)).astype("float32") - self.y = np.random.random((4, 3)).astype("float32") - self.out = np.matmul( - np.transpose(self.x, (0, 2, 1)), np.transpose(self.y) - ) - - def set_attributes(self): - self.attrs = {'transpose_X': True, 'transpose_Y': True} - - -class TestDnnlMatMulOpMixedDimsYWiderTransposeXY(TestDnnlMatMulWithGradOp): - def generate_data(self): - self.x = np.random.random((3, 2)).astype("float32") - self.y = np.random.random((8, 4, 3)).astype("float32") - self.out = np.matmul( - np.transpose(self.x), np.transpose(self.y, (0, 2, 1)) - ) - - def set_attributes(self): - self.attrs = {'transpose_X': True, 'transpose_Y': True} - - -class TestDnnlMatMulOpMixedDimsXWiderTransposeX(TestDnnlMatMulWithGradOp): - def generate_data(self): - self.x = np.random.random((5, 4)).astype("float32") - self.y = np.random.random((8, 5, 4)).astype("float32") - self.out = np.matmul(np.transpose(self.x), self.y) - - def set_attributes(self): - self.attrs = {'transpose_X': True} - - -class TestDnnlMatMulOpVectorMultiply(TestDnnlMatMulWithGradOp): - def generate_data(self): - self.x = np.random.random(5).astype("float32") - self.y = np.random.random(5).astype("float32") - self.out = np.matmul(self.x, self.y) - - -class TestDnnlMatMulOpVectorMultiplyTranspose(TestDnnlMatMulWithGradOp): - def generate_data(self): - self.x = np.random.random(5).astype("float32") - x_resized = np.copy(self.x) - x_resized = np.expand_dims(x_resized, 1) - self.y = 
np.random.random(6).astype("float32") - y_resized = np.copy(self.y) - y_resized = np.expand_dims(y_resized, 0) - self.out = np.matmul(x_resized, y_resized) - - def set_attributes(self): - self.attrs = {'transpose_Y': True, 'transpose_X': True} - - -class TestDnnlMatMulOpMixedDims2(TestDnnlMatMulWithGradOp): - def generate_data(self): - self.x = np.random.random((2, 3)).astype("float32") - self.y = np.random.random((17, 3, 4)).astype("float32") - self.out = np.matmul(self.x, self.y) - - -class TestDnnlMatMulOpAlpha(TestDnnlMatMulWithGradOp): - def generate_data(self): - self.x = np.random.random((17, 2, 3)).astype("float32") - self.y = np.random.random((17, 3, 2)).astype("float32") - self.alpha = 2.0 - self.out = self.alpha * np.matmul(self.x, self.y) - - -class TestDnnlMatMulOp2D(TestDnnlMatMulWithGradOp): - def generate_data(self): - self.x = np.random.random((12, 9)).astype("float32") - self.y = np.random.random((9, 12)).astype("float32") - self.out = np.matmul(self.x, self.y) - - -class TestDnnlMatMulOpTransposeX(TestDnnlMatMulWithGradOp): - def generate_data(self): - self.x = np.random.random((12, 9)).astype("float32") - self.y = np.random.random((12, 9)).astype("float32") - self.out = np.matmul(np.transpose(self.x), self.y) - - def set_attributes(self): - self.attrs = {'transpose_X': True} - - -class TestDnnlMatMulOpTransposeY(TestDnnlMatMulWithGradOp): - def generate_data(self): - self.x = np.random.random((12, 9)).astype("float32") - self.y = np.random.random((12, 9)).astype("float32") - self.out = np.matmul(self.x, np.transpose(self.y)) - - def set_attributes(self): - self.attrs = {'transpose_Y': True} - - -class TestDnnlMatMulOpTransposeY3D(TestDnnlMatMulWithGradOp): - def generate_data(self): - self.x = np.random.random((17, 3, 2)).astype("float32") - self.y = np.random.random((17, 3, 2)).astype("float32") - self.out = np.matmul(self.x, np.transpose(self.y, (0, 2, 1))) - - def set_attributes(self): - self.attrs = {'transpose_Y': True} - - -class TestDnnlMatMulOpInt8NoScales(TestDnnlMatMulOp): - def generate_data(self): - self.x = np.random.random((12, 9)).astype("int8") - self.y = np.random.random((9, 12)).astype("int8") - self.out = np.matmul(self.x, self.y) - - -class TestDnnlMatMulOpInt8(TestDnnlMatMulOp): - # Due to limitation in int8 matmul implementation - # on older platforms (BDW, SKX) we needed to reduce - # range from [-127, 127] to [-63, 63] - def quantize(self, tensor): - scale = 63.0 / np.abs(np.amax(tensor)) - quantized = np.round(scale * tensor).astype("int8") - return scale, quantized - - def generate_data(self): - x_float = np.random.random((12, 9)).astype("float32") - self.x_scale, self.x = self.quantize(x_float) - - y_float = np.random.random((9, 12)).astype("float32") - self.y_scale, self.y = self.quantize(y_float) - - out_float = np.matmul(x_float, y_float) - self.out_scale, self.out = self.quantize(out_float) - - def set_attributes(self): - self.attrs = { - 'Scale_x': self.x_scale, - 'Scale_y': self.y_scale, - 'Scale_out': self.out_scale, - } - - def test_check_output(self): - int_atol = 1 - self.check_output(atol=int_atol) - - -class TestDnnlMatMulOpInt8ForceFP32(TestDnnlMatMulOpInt8): - def generate_data(self): - x_float = np.random.random((12, 9)).astype("float32") - self.x_scale, self.x = self.quantize(x_float) - - y_float = np.random.random((9, 12)).astype("float32") - self.y_scale, self.y = self.quantize(y_float) - - out_float = np.matmul(x_float, y_float) - self.out = out_float - - def set_attributes(self): - self.attrs = { - 'Scale_x': self.x_scale, 
- 'Scale_y': self.y_scale, - 'force_fp32_output': True, - } - - -class TestDnnlMatMulOpInt8ForceFP32BasicScales(TestDnnlMatMulOp): - def generate_data(self): - self.x = np.random.randint(0, 3, (12, 9)).astype("int8") - self.y = np.random.randint(0, 3, (9, 12)).astype("int8") - self.out = np.matmul(self.x, self.y).astype("float32") - - def set_attributes(self): - self.attrs = {'force_fp32_output': True} - - -if __name__ == "__main__": - from paddle import enable_static - - enable_static() - unittest.main() diff --git a/test/mkldnn/test_nearest_interp_mkldnn_op.py b/test/mkldnn/test_nearest_interp_mkldnn_op.py deleted file mode 100644 index 1e07a605688247..00000000000000 --- a/test/mkldnn/test_nearest_interp_mkldnn_op.py +++ /dev/null @@ -1,203 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -from op_test import OpTest, skip_check_grad_ci - - -def nearest_neighbor_interp_mkldnn_np( - X, out_h, out_w, out_size=None, actual_shape=None, data_layout='NCHW' -): - """nearest neighbor interpolation implement in shape [N, C, H, W]""" - if data_layout == "NHWC": - X = np.transpose(X, (0, 3, 1, 2)) # NHWC => NCHW - if out_size is not None: - out_h = out_size[0] - out_w = out_size[1] - if actual_shape is not None: - out_h = actual_shape[0] - out_w = actual_shape[1] - - n, c, in_h, in_w = X.shape - - fh = fw = 0.0 - if out_h > 1: - fh = out_h * 1.0 / in_h - if out_w > 1: - fw = out_w * 1.0 / in_w - - out = np.zeros((n, c, out_h, out_w)) - - for oh in range(out_h): - ih = int(round((oh + 0.5) / fh - 0.5)) - for ow in range(out_w): - iw = int(round((ow + 0.5) / fw - 0.5)) - out[:, :, oh, ow] = X[:, :, ih, iw] - - if data_layout == "NHWC": - out = np.transpose(out, (0, 2, 3, 1)) # NCHW => NHWC - - return out.astype(X.dtype) - - -@skip_check_grad_ci(reason="Haven not implement interpolate grad kernel.") -class TestNearestInterpMKLDNNOp(OpTest): - def init_test_case(self): - pass - - def init_data_type(self): - self.dtype = np.float32 - - def setUp(self): - self.op_type = "nearest_interp" - self.interp_method = 'nearest' - self._cpu_only = True - self.use_mkldnn = True - self.input_shape = [1, 1, 2, 2] - self.data_layout = 'NCHW' - # priority: actual_shape > out_size > scale > out_h & out_w - self.out_h = 1 - self.out_w = 1 - self.scale = 2.0 - self.out_size = None - self.actual_shape = None - - self.init_test_case() - self.init_data_type() - - if self.dtype == np.float32: - input_np = np.random.random(self.input_shape).astype(self.dtype) - else: - init_low, init_high = (-5, 5) if self.dtype == np.int8 else (0, 10) - input_np = np.random.randint( - init_low, init_high, self.input_shape - ).astype(self.dtype) - - if self.data_layout == "NCHW": - in_h = self.input_shape[2] - in_w = self.input_shape[3] - else: - in_h = self.input_shape[1] - in_w = self.input_shape[2] - - if self.scale > 0: - out_h = int(in_h * self.scale) - out_w = int(in_w * self.scale) - else: - out_h = self.out_h - out_w = self.out_w 
- - output_np = nearest_neighbor_interp_mkldnn_np( - input_np, - out_h, - out_w, - self.out_size, - self.actual_shape, - self.data_layout, - ) - - self.inputs = {'X': input_np} - if self.out_size is not None: - self.inputs['OutSize'] = self.out_size - if self.actual_shape is not None: - self.inputs['OutSize'] = self.actual_shape - self.attrs = { - 'interp_method': self.interp_method, - 'out_h': self.out_h, - 'out_w': self.out_w, - 'scale': self.scale, - 'data_layout': self.data_layout, - 'use_mkldnn': self.use_mkldnn, - } - self.outputs = {'Out': output_np} - - def test_check_output(self): - self.check_output(check_dygraph=False) - - -class TestNearestInterpOpMKLDNNNHWC(TestNearestInterpMKLDNNOp): - def init_test_case(self): - self.input_shape = [3, 2, 32, 16] - self.out_h = 27 - self.out_w = 49 - self.scale = 2.0 - self.data_layout = 'NHWC' - - -class TestNearestNeighborInterpMKLDNNCase2(TestNearestInterpMKLDNNOp): - def init_test_case(self): - self.input_shape = [3, 3, 9, 6] - self.out_h = 12 - self.out_w = 12 - self.scale = 1.0 - - -class TestNearestNeighborInterpCase3(TestNearestInterpMKLDNNOp): - def init_test_case(self): - self.input_shape = [1, 1, 32, 64] - self.out_h = 64 - self.out_w = 128 - self.scale = 0.0 - - -class TestNearestNeighborInterpCase4(TestNearestInterpMKLDNNOp): - def init_test_case(self): - self.input_shape = [1, 1, 32, 64] - self.out_h = 64 - self.out_w = 32 - self.scale = 0.0 - self.out_size = np.array([65, 129]).astype("int32") - - -class TestNearestNeighborInterpSame(TestNearestInterpMKLDNNOp): - def init_test_case(self): - self.input_shape = [2, 3, 32, 64] - self.out_h = 32 - self.out_w = 64 - self.scale = 0.0 - - -def create_test_class(parent): - ''' - Create tests for int, uint8. By default parent class works on fp32. - ''' - - class TestInt8Case(parent): - def init_data_type(self): - self.dtype = np.int8 - - class TestUint8Case(parent): - def init_data_type(self): - self.dtype = np.uint8 - - TestInt8Case.__name__ = "{}_{}".format(parent.__name__, "INT8") - TestUint8Case.__name__ = "{}_{}".format(parent.__name__, "UINT8") - globals()[TestInt8Case.__name__] = TestInt8Case - globals()[TestUint8Case.__name__] = TestUint8Case - - -create_test_class(TestNearestInterpMKLDNNOp) -create_test_class(TestNearestInterpOpMKLDNNNHWC) -create_test_class(TestNearestNeighborInterpMKLDNNCase2) -create_test_class(TestNearestNeighborInterpCase3) -create_test_class(TestNearestNeighborInterpCase4) -create_test_class(TestNearestNeighborInterpSame) - -if __name__ == "__main__": - from paddle import enable_static - - enable_static() - unittest.main() diff --git a/test/xpu/CMakeLists.txt b/test/xpu/CMakeLists.txt index 4b269e60cfa12d..ad2de316465cf5 100644 --- a/test/xpu/CMakeLists.txt +++ b/test/xpu/CMakeLists.txt @@ -32,7 +32,6 @@ endforeach() set_tests_properties(test_conv2d_op_xpu PROPERTIES TIMEOUT 120) set_tests_properties(test_mul_op_xpu PROPERTIES TIMEOUT 120) set_tests_properties(test_matmul_v2_op_xpu PROPERTIES TIMEOUT 900) -set_tests_properties(test_matmul_op_xpu PROPERTIES TIMEOUT 300) set_tests_properties(test_collective_identity_xpu PROPERTIES LABELS "RUN_TYPE=DIST_KUNLUN") set_tests_properties(test_collective_allgather_xpu diff --git a/test/xpu/test_bilinear_interp_op_xpu.py b/test/xpu/test_bilinear_interp_op_xpu.py deleted file mode 100755 index 6c08731d3b01d9..00000000000000 --- a/test/xpu/test_bilinear_interp_op_xpu.py +++ /dev/null @@ -1,508 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import paddle - -paddle.enable_static() -''' -def bilinear_interp_np(input, - out_h, - out_w, - out_size=None, - actual_shape=None, - align_corners=True, - align_mode=0, - data_layout='NCHW'): - """bilinear interpolation implement in shape [N, C, H, W]""" - if data_layout == "NHWC": - input = np.transpose(input, (0, 3, 1, 2)) # NHWC => NCHW - if out_size is not None: - out_h = out_size[0] - out_w = out_size[1] - if actual_shape is not None: - out_h = actual_shape[0] - out_w = actual_shape[1] - batch_size, channel, in_h, in_w = input.shape - - ratio_h = ratio_w = 0.0 - if out_h > 1: - if (align_corners): - ratio_h = (in_h - 1.0) / (out_h - 1.0) - else: - ratio_h = 1.0 * in_h / out_h - if out_w > 1: - if (align_corners): - ratio_w = (in_w - 1.0) / (out_w - 1.0) - else: - ratio_w = 1.0 * in_w / out_w - - out = np.zeros((batch_size, channel, out_h, out_w)) - - for i in range(out_h): - if (align_mode == 0 and not align_corners): - h = int(ratio_h * (i + 0.5) - 0.5) - else: - h = int(ratio_h * i) - - h = max(0, h) - hid = 1 if h < in_h - 1 else 0 - if (align_mode == 0 and not align_corners): - idx_src_h = max(ratio_h * (i + 0.5) - 0.5, 0) - h1lambda = idx_src_h - h - else: - h1lambda = ratio_h * i - h - h2lambda = 1.0 - h1lambda - for j in range(out_w): - if (align_mode == 0 and not align_corners): - w = int(ratio_w * (j + 0.5) - 0.5) - else: - w = int(ratio_w * j) - w = max(0, w) - wid = 1 if w < in_w - 1 else 0 - if (align_mode == 0 and not align_corners): - idx_src_w = max(ratio_w * (j + 0.5) - 0.5, 0) - w1lambda = idx_src_w - w - else: - w1lambda = ratio_w * j - w - w2lambda = 1.0 - w1lambda - - out[:, :, i, j] = h2lambda*(w2lambda*input[:, :, h, w] + - w1lambda*input[:, :, h, w+wid]) + \ - h1lambda*(w2lambda*input[:, :, h+hid, w] + - w1lambda*input[:, :, h+hid, w+wid]) - - if data_layout == "NHWC": - out = np.transpose(out, (0, 2, 3, 1)) # NCHW => NHWC - - return out.astype(input.dtype) - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestBilinearInterpOp(XPUOpTest): - def setUp(self): - self.use_xpu = True - self.out_size = None - self.actual_shape = None - self.data_layout = 'NCHW' - self.init_test_case() - self.op_type = "bilinear_interp" - input_np = np.random.random(self.input_shape).astype("float32") - - if self.data_layout == "NCHW": - in_h = self.input_shape[2] - in_w = self.input_shape[3] - else: - in_h = self.input_shape[1] - in_w = self.input_shape[2] - - if self.scale > 0: - out_h = int(in_h * self.scale) - out_w = int(in_w * self.scale) - else: - out_h = self.out_h - out_w = self.out_w - - output_np = bilinear_interp_np(input_np, out_h, out_w, self.out_size, - self.actual_shape, self.align_corners, - self.align_mode, self.data_layout) - self.inputs = {'X': input_np} - if self.out_size is not None: - self.inputs['OutSize'] = self.out_size - if self.actual_shape is not None: - self.inputs['OutSize'] = self.actual_shape - - self.attrs = { - 'out_h': self.out_h, - 
'out_w': self.out_w, - 'scale': self.scale, - 'interp_method': self.interp_method, - 'align_corners': self.align_corners, - 'align_mode': self.align_mode, - 'data_layout': self.data_layout - } - self.outputs = {'Out': output_np} - - def test_check_output(self): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def test_check_grad(self): - place = paddle.XPUPlace(0) - self.check_grad_with_place(place, ['X'], 'Out', in_place=True) - - def init_test_case(self): - self.interp_method = 'bilinear' - self.input_shape = [2, 3, 5, 5] - self.out_h = 2 - self.out_w = 2 - self.scale = 0. - self.out_size = np.array([3, 3]).astype("int32") - self.align_corners = True - self.align_mode = 1 - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestBilinearInterpCase1(TestBilinearInterpOp): - def init_test_case(self): - self.interp_method = 'bilinear' - self.input_shape = [4, 1, 7, 8] - self.out_h = 1 - self.out_w = 1 - self.scale = 0. - self.align_corners = True - self.align_mode = 1 - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestBilinearInterpCase2(TestBilinearInterpOp): - def init_test_case(self): - self.interp_method = 'bilinear' - self.input_shape = [3, 3, 9, 6] - self.out_h = 12 - self.out_w = 12 - self.scale = 0. - self.align_corners = True - self.align_mode = 1 - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestBilinearInterpCase3(TestBilinearInterpOp): - def init_test_case(self): - self.interp_method = 'bilinear' - self.input_shape = [1, 1, 32, 64] - self.out_h = 64 - self.out_w = 32 - self.scale = 0. - self.align_corners = True - self.align_mode = 1 - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestBilinearInterpCase4(TestBilinearInterpOp): - def init_test_case(self): - self.interp_method = 'bilinear' - self.input_shape = [4, 1, 7, 8] - self.out_h = 1 - self.out_w = 1 - self.scale = 0. - self.out_size = np.array([2, 2]).astype("int32") - self.align_corners = True - self.align_mode = 1 - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestBilinearInterpCase5(TestBilinearInterpOp): - def init_test_case(self): - self.interp_method = 'bilinear' - self.input_shape = [3, 3, 9, 6] - self.out_h = 12 - self.out_w = 12 - self.scale = 0. - self.out_size = np.array([11, 11]).astype("int32") - self.align_corners = True - self.align_mode = 1 - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestBilinearInterpCase6(TestBilinearInterpOp): - def init_test_case(self): - self.interp_method = 'bilinear' - self.input_shape = [1, 1, 32, 64] - self.out_h = 64 - self.out_w = 32 - self.scale = 0. - self.out_size = np.array([65, 33]).astype("int32") - self.align_corners = True - self.align_mode = 1 - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestBilinearInterpSame(TestBilinearInterpOp): - def init_test_case(self): - self.interp_method = 'bilinear' - self.input_shape = [2, 3, 32, 64] - self.out_h = 32 - self.out_w = 64 - self.scale = 0. 
- self.align_corners = True - self.align_mode = 1 - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestBilinearInterpActualShape(TestBilinearInterpOp): - def init_test_case(self): - self.interp_method = 'bilinear' - self.input_shape = [3, 2, 32, 16] - self.out_h = 64 - self.out_w = 32 - self.scale = 0. - self.out_size = np.array([66, 40]).astype("int32") - self.align_corners = True - self.align_mode = 1 - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestBilinearInterpDataLayout(TestBilinearInterpOp): - def init_test_case(self): - self.interp_method = 'bilinear' - self.input_shape = [2, 5, 5, 3] - self.out_h = 2 - self.out_w = 2 - self.scale = 0. - self.out_size = np.array([3, 3]).astype("int32") - self.align_corners = True - self.align_mode = 1 - self.data_layout = "NHWC" - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestBilinearInterpOtherMethod1(TestBilinearInterpOp): - def set_align_mode(self): - self.align_corners = False - self.align_mode = 1 - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestBilinearInterpWithMethod2(TestBilinearInterpOp): - def set_align_mode(self): - self.align_corners = False - self.align_mode = 0 - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestBilinearInterpWithMethod3(TestBilinearInterpOp): - def set_align_mode(self): - self.align_corners = True - self.align_mode = 0 - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestBilinearInterpScale1(TestBilinearInterpOp): - def init_test_case(self): - self.interp_method = 'bilinear' - self.input_shape = [2, 3, 5, 7] - self.out_h = 60 - self.out_w = 25 - self.scale = 2. - self.align_corners = True - self.align_mode = 1 - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestBilinearInterpScale2(TestBilinearInterpOp): - def init_test_case(self): - self.interp_method = 'bilinear' - self.input_shape = [2, 3, 5, 7] - self.out_h = 60 - self.out_w = 25 - self.scale = 1. 
- self.align_corners = True - self.align_mode = 1 - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestBilinearInterpScale3(TestBilinearInterpOp): - def init_test_case(self): - self.interp_method = 'bilinear' - self.input_shape = [2, 3, 5, 7] - self.out_h = 60 - self.out_w = 25 - self.scale = 1.5 - self.align_corners = True - self.align_mode = 1 - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestBilinearInterpZero(TestBilinearInterpOp): - def init_test_case(self): - self.interp_method = 'bilinear' - self.input_shape = [2, 3, 5, 7] - self.out_h = 60 - self.out_w = 25 - self.scale = 0.2 - self.align_corners = False - self.align_mode = 0 - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestBilinearInterpOp_attr_tensor(XPUOpTest): - def setUp(self): - self.out_size = None - self.actual_shape = None - self.init_test_case() - self.op_type = "bilinear_interp" - self.shape_by_1Dtensor = False - self.scale_by_1Dtensor = False - self.attrs = { - 'interp_method': self.interp_method, - 'align_corners': self.align_corners, - } - - input_np = np.random.random(self.input_shape).astype("float32") - self.inputs = {'X': input_np} - - if self.scale_by_1Dtensor: - self.inputs['Scale'] = np.array([self.scale]).astype("float32") - elif self.scale > 0: - out_h = int(self.input_shape[2] * self.scale) - out_w = int(self.input_shape[3] * self.scale) - self.attrs['scale'] = self.scale - else: - out_h = self.out_h - out_w = self.out_w - - if self.shape_by_1Dtensor: - self.inputs['OutSize'] = self.out_size - elif self.out_size is not None: - size_tensor = [] - for index, ele in enumerate(self.out_size): - size_tensor.append(("x" + str(index), np.ones( - (1)).astype('int32') * ele)) - self.inputs['SizeTensor'] = size_tensor - - self.attrs['out_h'] = self.out_h - self.attrs['out_w'] = self.out_w - output_np = bilinear_interp_np(input_np, out_h, out_w, self.out_size, - self.actual_shape, self.align_corners) - self.outputs = {'Out': output_np} - - def test_check_output(self): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def test_check_grad(self): - place = paddle.XPUPlace(0) - self.check_grad_with_place(place, ['X'], 'Out', in_place=True) - - def init_test_case(self): - self.interp_method = 'bilinear' - self.input_shape = [2, 3, 5, 5] - self.out_h = 3 - self.out_w = 3 - self.scale = 0. - self.out_size = [3, 3] - self.align_corners = True - - -# out_size is a 1-D tensor -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestBilinearInterp_attr_tensor_Case1(TestBilinearInterpOp_attr_tensor): - def init_test_case(self): - self.interp_method = 'bilinear' - self.input_shape = [3, 3, 9, 6] - self.out_h = 12 - self.out_w = 12 - self.scale = 0. - self.out_size = [8, 12] - self.align_corners = True - - -# scale is a 1-D tensor -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestBilinearInterp_attr_tensor_Case2(TestBilinearInterpOp_attr_tensor): - def init_test_case(self): - self.interp_method = 'bilinear' - self.input_shape = [3, 2, 32, 16] - self.out_h = 64 - self.out_w = 32 - self.scale = 0. 
- self.out_size = np.array([66, 40]).astype("int32") - self.align_corners = True - self.shape_by_1Dtensor = True - - -# scale is a 1-D tensor -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestBilinearInterp_attr_tensor_Case3(TestBilinearInterpOp_attr_tensor): - def init_test_case(self): - self.interp_method = 'bilinear' - self.input_shape = [3, 2, 32, 16] - self.out_h = 64 - self.out_w = 32 - self.scale = 2.0 - self.out_size = None - self.align_corners = True - self.scale_by_1Dtensor = True - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestBilinearInterpOpAPI(unittest.TestCase): - def test_case(self): - x = paddle.static.data(name="x", shape=[2, 3, 6, 6], dtype="float32") - - dim = paddle.static.data(name="dim", shape=[1], dtype="int32") - shape_tensor = paddle.static.data(name="shape_tensor", shape=[2], dtype="int32") - actual_size = paddle.static.data(name="actual_size", shape=[2], dtype="int32") - scale_tensor = paddle.static.data( - name="scale_tensor", shape=[1], dtype="float32") - - out1 = base.layers.resize_bilinear(x, out_shape=[12, 12]) - out2 = base.layers.resize_bilinear(x, out_shape=[12, dim]) - out3 = base.layers.resize_bilinear(x, out_shape=shape_tensor) - out4 = base.layers.resize_bilinear( - x, out_shape=[4, 4], actual_shape=actual_size) - out5 = base.layers.resize_bilinear(x, scale=scale_tensor) - - x_data = np.random.random((2, 3, 6, 6)).astype("float32") - dim_data = np.array([12]).astype("int32") - shape_data = np.array([12, 12]).astype("int32") - actual_size_data = np.array([12, 12]).astype("int32") - scale_data = np.array([2.0]).astype("float32") - - place = core.XPUPlace(0) - exe = base.Executor(place) - exe.run(base.default_startup_program()) - results = exe.run(base.default_main_program(), - feed={ - "x": x_data, - "dim": dim_data, - "shape_tensor": shape_data, - "actual_size": actual_size_data, - "scale_tensor": scale_data - }, - fetch_list=[out1, out2, out3, out4, out5], - return_numpy=True) - - expect_res = bilinear_interp_np( - x_data, out_h=12, out_w=12, align_corners=True) - for res in results: - np.testing.assert_allclose(res, expect_res) -''' - -if __name__ == "__main__": - unittest.main() diff --git a/test/xpu/test_matmul_op_xpu.py b/test/xpu/test_matmul_op_xpu.py deleted file mode 100644 index bc944b2608c045..00000000000000 --- a/test/xpu/test_matmul_op_xpu.py +++ /dev/null @@ -1,387 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -from get_test_cover_info import ( - XPUOpTestWrapper, - create_test_class, - get_xpu_op_support_types, -) -from op_test_xpu import XPUOpTest - -import paddle -from paddle import base - - -def reference_matmul(X, Y, transpose_X=False, transpose_Y=False): - """Reference forward implementation using np.matmul.""" - # np.matmul does not support the transpose flags, so we manually - # transpose X and Y appropriately. 
- if transpose_X: - if X.ndim == 1: - X = X.reshape((X.size, 1)) - elif X.ndim == 2: - X = X.T - else: - dim = list(range(len(X.shape))) - dim[-1], dim[len(X.shape) - 2] = dim[len(X.shape) - 2], dim[-1] - X = np.transpose(X, tuple(dim)) - if transpose_Y: - if Y.ndim == 1: - Y = Y.reshape((1, Y.size)) - elif Y.ndim == 2: - Y = Y.T - else: - dim = list(range(len(Y.shape))) - dim[-1], dim[len(Y.shape) - 2] = dim[len(Y.shape) - 2], dim[-1] - Y = np.transpose(Y, tuple(dim)) - - if X.ndim == 3 and Y.ndim == 2: - x_dims = X.shape - X = X.reshape((x_dims[0] * x_dims[1], x_dims[2])) - if Y.ndim == 3 and X.ndim == 2: - y_dims = Y.shape - Y = Y.reshape((y_dims[0] * y_dims[1], y_dims[2])) - Out = np.matmul(X, Y) - return Out - - -def generate_compatible_shapes( - dim_X, dim_Y, transpose_X, transpose_Y, batch_size -): - BATCH_SIZE = 2 - if batch_size is not None: - BATCH_SIZE = batch_size - - M = 3 - N = 4 - K = 5 - if (dim_X == 1 and transpose_X) or (dim_Y == 1 and transpose_Y): - K = 1 - if dim_X == 1: - if transpose_X: - shape_X = [M] - else: - shape_X = [K] - if dim_Y == 1: - if transpose_Y: - shape_Y = [N] - else: - shape_Y = [K] - if dim_X >= 2: - if transpose_X: - shape_X = [K, M] - else: - shape_X = [M, K] - if dim_X == 3: - shape_X = [BATCH_SIZE] + shape_X - if dim_Y >= 2: - if transpose_Y: - shape_Y = [N, K] - else: - shape_Y = [K, N] - if dim_Y == 3: - shape_Y = [BATCH_SIZE] + shape_Y - - if dim_Y == 3 and dim_X == 2: - if not transpose_X: - shape_X[1] = shape_X[1] * BATCH_SIZE - else: - shape_X[0] = shape_X[0] * BATCH_SIZE - - return shape_X, shape_Y - - -def generate_compatible_shapes_2(dim, transpose_X, transpose_Y): - M = 2 - N = 4 - K = 3 - shape_X = [2 for _ in range(dim - 2)] - shape_Y = [2 for _ in range(dim - 2)] - - if transpose_X: - shape_X += [K, M] - else: - shape_X += [M, K] - - if transpose_Y: - shape_Y += [N, K] - else: - shape_Y += [K, N] - - return shape_X, shape_Y - - -class XPUTestMatmulOpErr(XPUOpTestWrapper): - def __init__(self): - self.op_name = "matmul" - self.use_dynamic_create_class = False - - class API_TestMm(unittest.TestCase): - def test_out(self): - with base.program_guard(base.Program()): - x = paddle.static.data(name="x", shape=[2], dtype=self.in_type) - y = paddle.static.data(name='y', shape=[2], dtype=self.in_type) - result = paddle.mm(x, y) - exe = base.Executor(base.XPUPlace(0)) - data1 = np.random.rand(2).astype(self.in_type) - data2 = np.random.rand(2).astype(self.in_type) - np_res = exe.run( - feed={'x': data1, 'y': data2}, fetch_list=[result] - ) - expected_result = np.matmul(data1, data2) - - np.testing.assert_allclose(np_res, expected_result, atol=1e-3) - - def test_dygraph_without_out(self): - device = base.XPUPlace(0) - with base.dygraph.guard(device): - input_array1 = np.random.rand(3, 4).astype(self.in_type) - input_array2 = np.random.rand(4, 3).astype(self.in_type) - data1 = paddle.to_tensor(input_array1) - data2 = paddle.to_tensor(input_array2) - out = paddle.mm(data1, data2) - expected_result = np.matmul(input_array1, input_array2) - np.testing.assert_allclose( - expected_result, out.numpy(), atol=1e-3 - ) - - class Test_API_Matmul(unittest.TestCase): - def test_dygraph_without_out(self): - device = base.XPUPlace(0) - with base.dygraph.guard(device): - input_array1 = np.random.rand(3, 4).astype(self.in_type) - input_array2 = np.random.rand(4, 3).astype(self.in_type) - data1 = paddle.to_tensor(input_array1).astype(self.in_type) - data2 = paddle.to_tensor(input_array2).astype(self.in_type) - out = paddle.matmul(data1, data2) - expected_result 
= np.matmul(input_array1, input_array2) - np.testing.assert_allclose( - expected_result, out.numpy(), atol=1e-3 - ) - - class API_TestMmError(unittest.TestCase): - def test_errors(self): - def test_error1(): - with base.program_guard(base.Program(), base.Program()): - data1 = paddle.static.data( - name="data1", shape=[10, 2], dtype="float32" - ) - data2 = paddle.static.data( - name="data2", shape=[3, 10], dtype="float32" - ) - paddle.mm(data1, data2) - - self.assertRaises(ValueError, test_error1) - - def test_error2(): - with base.program_guard(base.Program(), base.Program()): - data1 = paddle.static.data( - name="data1", shape=[-1, 10, 2], dtype="float32" - ) - data2 = paddle.static.data( - name="data2", shape=[-1, 2, 10], dtype="float32" - ) - paddle.mm(data1, data2) - - test_error2() - - def test_error3(): - with base.program_guard(base.Program(), base.Program()): - data1 = paddle.static.data( - name="data1", shape=[10, 10, 2], dtype="float32" - ) - data2 = paddle.static.data( - name="data2", shape=[3, 2, 10], dtype="float32" - ) - paddle.mm(data1, data2) - - self.assertRaises(ValueError, test_error3) - - -class TestMatmulBaseGenerator(XPUOpTest): - def setUp(self): - self.op_type = "matmul" - self.dtype = ( - np.float32 if not hasattr(self, 'in_type') else self.in_type - ) - - self.__class__.no_need_check_grad = ( - False - if not hasattr(self, 'no_need_check_grad') - else self.no_need_check_grad - ) - - shape_X = [4, 5] if not hasattr(self, 'shape_X') else self.shape_X - shape_Y = [5, 6] if not hasattr(self, 'shape_Y') else self.shape_Y - transpose_X = ( - False if not hasattr(self, 'transpose_X') else self.transpose_X - ) - transpose_Y = ( - False if not hasattr(self, 'transpose_Y') else self.transpose_Y - ) - - X = np.random.random(shape_X).astype(self.dtype) - Y = np.random.random(shape_Y).astype(self.dtype) - Out = reference_matmul(X, Y, transpose_X, transpose_Y).astype( - self.dtype - ) - self.inputs = {'X': X, 'Y': Y} - self.attrs = {'transpose_X': transpose_X, 'transpose_Y': transpose_Y} - self.outputs = {'Out': Out} - - def test_check_output(self): - place = paddle.XPUPlace(0) - self.check_output_with_place(place, atol=1e-3) - - def test_check_grad_normal(self): - if ( - hasattr(self.__class__, "no_need_check_grad") - and self.__class__.no_need_check_grad - ): - return - - place = paddle.XPUPlace(0) - self.check_grad_with_place( - place, ['X', 'Y'], 'Out', max_relative_error=5e-2 - ) - - def test_check_grad_ignore_x(self): - if ( - hasattr(self.__class__, "no_need_check_grad") - and self.__class__.no_need_check_grad - ): - return - - place = paddle.XPUPlace(0) - self.check_grad_with_place( - place, ['Y'], 'Out', max_relative_error=5e-2, no_grad_set=set("X") - ) - - def test_check_grad_ignore_y(self): - if ( - hasattr(self.__class__, "no_need_check_grad") - and self.__class__.no_need_check_grad - ): - return - - place = paddle.XPUPlace(0) - self.check_grad_with_place( - place, ['X'], 'Out', max_relative_error=5e-2, no_grad_set=set('Y') - ) - - -class XPUTestMatmulOp1(XPUOpTestWrapper): - def __init__(self): - self.op_name = "matmul" - self.use_dynamic_create_class = True - - def dynamic_create_class(self): - base_class = TestMatmulBaseGenerator - classes = [] - xpu_support_dims_list = [[1, 1], [2, 2], [3, 3]] - batch_size = [2, 4, 5, 10, 50, 100, 300] - for dims in xpu_support_dims_list: - dim_X = dims[0] - dim_Y = dims[1] - for transpose_x in [True, False]: - for transpose_y in [True, False]: - for batch in batch_size: - no_need_check_grad = False - if batch >= 5: - 
no_need_check_grad = True - class_name = f'TestMatMulOp_dimX_{dim_X}_dim_Y_{dim_Y}_transX_{transpose_x}_transY_{transpose_y}_batch_{batch}' - shape_x, shape_y = generate_compatible_shapes( - dim_X, dim_Y, transpose_x, transpose_y, batch - ) - attr_dict = { - 'shape_X': shape_x, - 'shape_Y': shape_y, - 'transpose_X': transpose_x, - 'transpose_Y': transpose_y, - 'no_need_check_grad': no_need_check_grad, - 'op_type': "matmul", - } - classes.append([class_name, attr_dict]) - - return base_class, classes - - -class XPUTestMatmulOp3(XPUOpTestWrapper): - def __init__(self): - self.op_name = "matmul" - self.use_dynamic_create_class = True - - def dynamic_create_class(self): - base_class = TestMatmulBaseGenerator - classes = [] - for dim in [4]: - for transpose_X in [False, True]: - for transpose_Y in [False, True]: - class_name = f'TestMatMulOp2_dimX_{dim}_dim_Y_{dim}_transX_{transpose_X}_transY_{transpose_Y}' - shape_X, shape_Y = generate_compatible_shapes_2( - dim, transpose_X, transpose_Y - ) - attr_dict = { - 'shape_X': shape_X, - 'shape_Y': shape_Y, - 'transpose_X': transpose_X, - 'transpose_Y': transpose_Y, - 'op_type': "matmul", - } - classes.append([class_name, attr_dict]) - return base_class, classes - - -class XPUTestMatmulOpBF16(XPUOpTestWrapper): - def __init__(self): - self.op_name = "matmul" - self.use_dynamic_create_class = True - - def dynamic_create_class(self): - base_class = TestMatmulBaseGenerator - classes = [] - for dim in [2]: - for transpose_X in [False, True]: - for transpose_Y in [False, True]: - class_name = f'TestMatMulOp2_dimX_{dim}_dim_Y_{dim}_transX_{transpose_X}_transY_{transpose_Y}' - shape_X, shape_Y = generate_compatible_shapes_2( - dim, transpose_X, transpose_Y - ) - attr_dict = { - 'shape_X': shape_X, - 'shape_Y': shape_Y, - 'transpose_X': transpose_X, - 'transpose_Y': transpose_Y, - 'op_type': "matmul", - } - classes.append([class_name, attr_dict]) - return base_class, classes - - -support_types = get_xpu_op_support_types('matmul') -for stype in support_types: - if "bfloat16" in str(stype): - # only support fc_fusion now - create_test_class(globals(), XPUTestMatmulOpBF16, stype) - else: - create_test_class(globals(), XPUTestMatmulOpErr, stype) - create_test_class(globals(), XPUTestMatmulOp1, stype) - create_test_class(globals(), XPUTestMatmulOp3, stype) - -if __name__ == "__main__": - paddle.enable_static() - unittest.main() diff --git a/test/xpu/test_nearest_interp_op_xpu.py b/test/xpu/test_nearest_interp_op_xpu.py deleted file mode 100644 index 1165521339da14..00000000000000 --- a/test/xpu/test_nearest_interp_op_xpu.py +++ /dev/null @@ -1,441 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import paddle - -paddle.enable_static() -''' -def nearest_neighbor_interp_np(X, - out_h, - out_w, - out_size=None, - actual_shape=None, - align_corners=True, - data_layout="NCHW"): - """nearest neighbor interpolation implement in shape [N, C, H, W]""" - if data_layout == "NHWC": - X = np.transpose(X, (0, 3, 1, 2)) # NHWC => NCHW - if out_size is not None: - out_h = out_size[0] - out_w = out_size[1] - if actual_shape is not None: - out_h = actual_shape[0] - out_w = actual_shape[1] - n, c, in_h, in_w = X.shape - - ratio_h = ratio_w = 0.0 - if (out_h > 1): - if (align_corners): - ratio_h = (in_h - 1.0) / (out_h - 1.0) - else: - ratio_h = 1.0 * in_h / out_h - if (out_w > 1): - if (align_corners): - ratio_w = (in_w - 1.0) / (out_w - 1.0) - else: - ratio_w = 1.0 * in_w / out_w - - out = np.zeros((n, c, out_h, out_w)) - - if align_corners: - for i in range(out_h): - in_i = int(ratio_h * i + 0.5) - for j in range(out_w): - in_j = int(ratio_w * j + 0.5) - out[:, :, i, j] = X[:, :, in_i, in_j] - else: - for i in range(out_h): - in_i = int(ratio_h * i) - for j in range(out_w): - in_j = int(ratio_w * j) - out[:, :, i, j] = X[:, :, in_i, in_j] - - if data_layout == "NHWC": - out = np.transpose(out, (0, 2, 3, 1)) # NCHW => NHWC - - return out.astype(X.dtype) - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestNearestInterpOp(XPUOpTest): - def setUp(self): - self.use_xpu = True - self.out_size = None - self.actual_shape = None - self.data_layout = "NCHW" - self.init_test_case() - self.op_type = "nearest_interp" - input_np = np.random.random(self.input_shape).astype("float32") - - if self.data_layout == "NCHW": - in_h = self.input_shape[2] - in_w = self.input_shape[3] - else: - in_h = self.input_shape[1] - in_w = self.input_shape[2] - - if self.scale > 0: - out_h = int(in_h * self.scale) - out_w = int(in_w * self.scale) - else: - out_h = self.out_h - out_w = self.out_w - - output_np = nearest_neighbor_interp_np( - input_np, out_h, out_w, self.out_size, self.actual_shape, - self.align_corners, self.data_layout) - self.inputs = {"X": input_np} - if self.out_size is not None: - self.inputs["OutSize"] = self.out_size - if self.actual_shape is not None: - self.inputs["OutSize"] = self.actual_shape - self.attrs = { - "out_h": self.out_h, - "out_w": self.out_w, - "scale": self.scale, - "interp_method": self.interp_method, - "align_corners": self.align_corners, - "data_layout": self.data_layout - } - self.outputs = {"Out": output_np} - - def test_check_output(self): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def test_check_grad(self): - place = paddle.XPUPlace(0) - self.check_grad_with_place(place, ["X"], "Out", in_place=True) - - def init_test_case(self): - self.interp_method = "nearest" - self.input_shape = [2, 3, 4, 5] - self.out_h = 2 - self.out_w = 2 - self.scale = 0. - self.out_size = np.array([3, 3]).astype("int32") - self.align_corners = True - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestNearestNeighborInterpCase1(TestNearestInterpOp): - def init_test_case(self): - self.interp_method = "nearest" - self.input_shape = [4, 1, 7, 8] - self.out_h = 1 - self.out_w = 1 - self.scale = 0. 
- self.align_corners = True - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestNearestNeighborInterpCase2(TestNearestInterpOp): - def init_test_case(self): - self.interp_method = "nearest" - self.input_shape = [3, 3, 9, 6] - self.out_h = 12 - self.out_w = 12 - self.scale = 0. - self.align_corners = True - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestNearestNeighborInterpCase3(TestNearestInterpOp): - def init_test_case(self): - self.interp_method = "nearest" - self.input_shape = [1, 1, 32, 64] - self.out_h = 64 - self.out_w = 32 - self.scale = 0. - self.align_corners = True - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestNearestNeighborInterpCase4(TestNearestInterpOp): - def init_test_case(self): - self.interp_method = "nearest" - self.input_shape = [4, 1, 7, 8] - self.out_h = 1 - self.out_w = 1 - self.scale = 0. - self.out_size = np.array([2, 2]).astype("int32") - self.align_corners = True - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestNearestNeighborInterpCase5(TestNearestInterpOp): - def init_test_case(self): - self.interp_method = "nearest" - self.input_shape = [3, 3, 9, 6] - self.out_h = 12 - self.out_w = 12 - self.scale = 0. - self.out_size = np.array([11, 11]).astype("int32") - self.align_corners = True - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestNearestNeighborInterpCase6(TestNearestInterpOp): - def init_test_case(self): - self.interp_method = "nearest" - self.input_shape = [1, 1, 32, 64] - self.out_h = 64 - self.out_w = 32 - self.scale = 0. - self.out_size = np.array([65, 129]).astype("int32") - self.align_corners = True - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestNearestNeighborInterpSame(TestNearestInterpOp): - def init_test_case(self): - self.interp_method = "nearest" - self.input_shape = [2, 3, 32, 64] - self.out_h = 32 - self.out_w = 64 - self.scale = 0. - self.align_corners = True - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestNearestNeighborInterpActualShape(TestNearestInterpOp): - def init_test_case(self): - self.interp_method = "nearest" - self.input_shape = [3, 2, 32, 16] - self.out_h = 64 - self.out_w = 32 - self.scale = 0. - self.out_size = np.array([66, 40]).astype("int32") - self.align_corners = True - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestNearestNeighborInterpDataLayout(TestNearestInterpOp): - def init_test_case(self): - self.interp_method = "nearest" - self.input_shape = [2, 4, 4, 5] - self.out_h = 2 - self.out_w = 2 - self.scale = 0. - self.out_size = np.array([3, 8]).astype("int32") - self.align_corners = True - self.data_layout = "NCHW" - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestNearestInterpWithoutCorners(TestNearestInterpOp): - def set_align_corners(self): - self.align_corners = False - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestNearestNeighborInterpScale1(TestNearestInterpOp): - def init_test_case(self): - self.interp_method = "nearest" - self.input_shape = [3, 2, 7, 5] - self.out_h = 64 - self.out_w = 32 - self.scale = 2. 
- self.out_size = np.array([66, 40]).astype("int32") - self.align_corners = True - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestNearestNeighborInterpScale2(TestNearestInterpOp): - def init_test_case(self): - self.interp_method = "nearest" - self.input_shape = [3, 2, 5, 7] - self.out_h = 64 - self.out_w = 32 - self.scale = 1.5 - self.out_size = np.array([66, 40]).astype("int32") - self.align_corners = True - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestNearestNeighborInterpScale3(TestNearestInterpOp): - def init_test_case(self): - self.interp_method = "nearest" - self.input_shape = [3, 2, 7, 5] - self.out_h = 64 - self.out_w = 32 - self.scale = 1. - self.out_size = np.array([66, 40]).astype("int32") - self.align_corners = True - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestNearestInterpOp_attr_tensor(XPUOpTest): - def setUp(self): - self.out_size = None - self.actual_shape = None - self.shape_by_1Dtensor = False - self.scale_by_1Dtensor = False - self.scale_by_2Dtensor = False - self.init_test_case() - self.op_type = "nearest_interp" - self.attrs = { - "interp_method": self.interp_method, - "align_corners": self.align_corners, - } - - input_np = np.random.random(self.input_shape).astype("float32") - self.inputs = {"X": input_np} - - if self.scale_by_1Dtensor: - self.inputs["Scale"] = np.array([self.scale]).astype("float32") - out_h = int(self.input_shape[2] * self.scale) - out_w = int(self.input_shape[3] * self.scale) - elif self.scale_by_2Dtensor: - self.inputs['Scale'] = np.array(self.scale).astype("float32") - out_h = int(self.input_shape[2] * self.scale[0]) - out_w = int(self.input_shape[3] * self.scale[1]) - else: - out_h = self.out_h - out_w = self.out_w - - if self.shape_by_1Dtensor: - self.inputs["OutSize"] = self.out_size - elif self.out_size is not None: - size_tensor = [] - for index, ele in enumerate(self.out_size): - size_tensor.append(("x" + str(index), np.ones( - (1)).astype("int32") * ele)) - self.inputs["SizeTensor"] = size_tensor - - self.attrs["out_h"] = self.out_h - self.attrs["out_w"] = self.out_w - output_np = nearest_neighbor_interp_np(input_np, out_h, out_w, - self.out_size, self.actual_shape, - self.align_corners) - self.outputs = {"Out": output_np} - - def test_check_output(self): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def test_check_grad(self): - place = paddle.XPUPlace(0) - self.check_grad_with_place(place, ["X"], "Out", in_place=True) - - def init_test_case(self): - self.interp_method = "nearest" - self.input_shape = [2, 5, 4, 4] - self.out_h = 3 - self.out_w = 3 - self.scale = 0. - self.out_size = [3, 3] - self.align_corners = True - - -# out_size is a tensor list -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestNearestInterp_attr_tensor_Case1(TestNearestInterpOp_attr_tensor): - def init_test_case(self): - self.interp_method = "nearest" - self.input_shape = [3, 3, 9, 6] - self.out_h = 12 - self.out_w = 12 - self.scale = 0. 
- self.out_size = [8, 12] - self.align_corners = True - - -# out_size is a 1-D tensor -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestNearestInterp_attr_tensor_Case2(TestNearestInterpOp_attr_tensor): - def init_test_case(self): - self.interp_method = "nearest" - self.input_shape = [3, 2, 32, 16] - self.out_h = 64 - self.out_w = 32 - self.scale = 0. - self.out_size = np.array([66, 40]).astype("int32") - self.align_corners = True - self.shape_by_1Dtensor = True - - -# scale is a 1-D tensor -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestNearestInterp_attr_tensor_Case3(TestNearestInterpOp_attr_tensor): - def init_test_case(self): - self.interp_method = "nearest" - self.input_shape = [3, 2, 32, 16] - self.out_h = 64 - self.out_w = 32 - self.scale = 2.0 - self.out_size = None - self.align_corners = True - self.scale_by_1Dtensor = True - - -# scale is a 2-D tensor -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestNearestInterp_attr_tensor_Case4(TestNearestInterpOp_attr_tensor): - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [3, 2, 32, 16] - self.out_h = 64 - self.out_w = 32 - self.scale = [2.0, 2.0] - self.out_size = None - self.align_corners = True - self.scale_by_2Dtensor = True - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestNearestInterpException(unittest.TestCase): - def test_exception(self): - input = paddle.static.data(name="input", shape=[1, 3, 6, 6], dtype="float32") - - def attr_data_format(): - # for 4-D input, data_format can only be NCHW or NHWC - out = base.layers.resize_nearest( - input, out_shape=[4, 8], data_format="NDHWC") - - def attr_scale_type(): - out = base.layers.resize_nearest(input, scale="scale") - - def attr_scale_value(): - out = base.layers.resize_nearest(input, scale=-0.3) - - self.assertRaises(ValueError, attr_data_format) - self.assertRaises(TypeError, attr_scale_type) - self.assertRaises(ValueError, attr_scale_value) -''' - -if __name__ == "__main__": - unittest.main() From 0fc0aa5d92fd9763dc41f40e0c387f12b3218b81 Mon Sep 17 00:00:00 2001 From: AyaseNana <49900969+NKNaN@users.noreply.github.com> Date: Thu, 4 Jul 2024 15:20:52 +0800 Subject: [PATCH 11/16] [Typing][B-28] Add type annotations for `python/paddle/distribution/uniform.py` (#65660) --------- Co-authored-by: Nyakku Shigure --- python/paddle/distribution/uniform.py | 36 ++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/python/paddle/distribution/uniform.py b/python/paddle/distribution/uniform.py index b9b4cf1e334803..cefbeef9c60433 100644 --- a/python/paddle/distribution/uniform.py +++ b/python/paddle/distribution/uniform.py @@ -11,8 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations + +from typing import TYPE_CHECKING, Sequence import numpy as np +import numpy.typing as npt import paddle from paddle import _C_ops @@ -22,6 +26,9 @@ from paddle.framework import in_dynamic_mode from paddle.tensor import random +if TYPE_CHECKING: + from paddle import Tensor + class Uniform(distribution.Distribution): r"""Uniform distribution with `low` and `high` parameters. 
@@ -99,8 +106,25 @@ class Uniform(distribution.Distribution): Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True, [0.50000000]) """ - - def __init__(self, low, high, name=None): + low: Tensor + high: Tensor + + def __init__( + self, + low: ( + float + | Sequence[float] + | npt.NDArray[np.float32 | np.float64] + | Tensor + ), + high: ( + float + | Sequence[float] + | npt.NDArray[np.float32 | np.float64] + | Tensor + ), + name: str | None = None, + ) -> None: if not in_dynamic_mode(): check_type( low, @@ -165,7 +189,7 @@ def __init__(self, low, high, name=None): super().__init__(self.low.shape) - def sample(self, shape, seed=0): + def sample(self, shape: list[int], seed: int = 0) -> Tensor: """Generate samples of the specified shape. Args: @@ -218,7 +242,7 @@ def sample(self, shape, seed=0): else: return output - def log_prob(self, value): + def log_prob(self, value: Tensor) -> Tensor: """Log probability density/mass function. Args: @@ -247,7 +271,7 @@ def log_prob(self, value): paddle.log(lb * ub), paddle.log(self.high - self.low), name=name ) - def probs(self, value): + def probs(self, value: Tensor) -> Tensor: """Probability density/mass function. Args: @@ -272,7 +296,7 @@ def probs(self, value): ub = paddle.cast(ub_bool, dtype=value.dtype) return paddle.divide((lb * ub), (self.high - self.low), name=name) - def entropy(self): + def entropy(self) -> Tensor: r"""Shannon entropy in nats. The entropy is From a72432c1885c048b01d68f453f77ad613838f93f Mon Sep 17 00:00:00 2001 From: ooo oo <106524776+ooooo-create@users.noreply.github.com> Date: Thu, 4 Jul 2024 15:21:32 +0800 Subject: [PATCH 12/16] [Typing][A-100] Add type annotations for `python/paddle/io/dataloader/worker.py` (#65645) --------- Co-authored-by: SigureMo --- python/paddle/io/dataloader/worker.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/paddle/io/dataloader/worker.py b/python/paddle/io/dataloader/worker.py index a559a616bb2963..b1284a646f656c 100644 --- a/python/paddle/io/dataloader/worker.py +++ b/python/paddle/io/dataloader/worker.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations import os import queue @@ -76,7 +77,7 @@ def is_alive(self): _worker_info = None -def get_worker_info(): +def get_worker_info() -> WorkerInfo | None: """ Get DataLoader worker process information function, this function is used to split data copy in worker process for IterableDataset @@ -117,8 +118,8 @@ def get_worker_info(): ... else: ... per_worker = int( ... math.ceil((self.end - self.start) / float( - ... worker_info.num_workers))) - ... worker_id = worker_info.id + ... worker_info.num_workers))) # type: ignore[attr-defined] + ... worker_id = worker_info.id # type: ignore[attr-defined] ... iter_start = self.start + worker_id * per_worker ... iter_end = min(iter_start + per_worker, self.end) ... 
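[Editorial note, not part of the patches above or below: a minimal, hedged sketch of the pattern that PATCH 12 annotates. It shows how the now-`WorkerInfo | None` return value of `paddle.io.get_worker_info()` is typically narrowed before its attributes are read; `RangeDataset`, its bounds, and the `__main__` driver are illustrative assumptions, and the per-line ignores mirror the ones added to the docstring in that patch, presumably needed because `WorkerInfo` assigns its attributes dynamically, so static checkers cannot see them.]

    # Hedged sketch: split an IterableDataset across DataLoader workers,
    # narrowing the `WorkerInfo | None` return value before using it.
    import math

    import paddle
    from paddle.io import DataLoader, IterableDataset, get_worker_info


    class RangeDataset(IterableDataset):  # illustrative dataset name
        def __init__(self, start: int, end: int) -> None:
            self.start = start
            self.end = end

        def __iter__(self):
            info = get_worker_info()
            if info is None:
                # Single-process loading: this iterator sees the whole range.
                iter_start, iter_end = self.start, self.end
            else:
                # Worker process: take only this worker's slice of the range.
                per_worker = int(
                    math.ceil((self.end - self.start) / float(info.num_workers))  # type: ignore[attr-defined]
                )
                iter_start = self.start + info.id * per_worker  # type: ignore[attr-defined]
                iter_end = min(iter_start + per_worker, self.end)
            for i in range(iter_start, iter_end):
                yield paddle.to_tensor([i])


    if __name__ == "__main__":
        for batch in DataLoader(RangeDataset(3, 9), num_workers=2, batch_size=1):
            print(batch)

The explicit `if info is None` branch keeps single-process loading working unchanged, which is why the annotation uses an optional return type rather than assuming a worker context.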
From 1b9663140806092791da3ae333e53211cdf67fdc Mon Sep 17 00:00:00 2001 From: megemini Date: Thu, 4 Jul 2024 15:25:21 +0800 Subject: [PATCH 13/16] =?UTF-8?q?[Typing]=20=E4=BF=AE=E7=90=86=E9=83=A8?= =?UTF-8?q?=E5=88=86=E7=A4=BA=E4=BE=8B=E4=B8=AD=E7=9A=84=E7=B1=BB=E5=9E=8B?= =?UTF-8?q?=E7=BC=BA=E5=A4=B1=20`var-annotated`=20=E4=BB=A5=E5=8F=8A?= =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E5=85=B7=E4=BD=93=20ignore=20=E7=B1=BB?= =?UTF-8?q?=E5=9E=8B=20(#65644)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- python/paddle/amp/debugging.py | 2 +- python/paddle/base/layers/math_op_patch.py | 2 +- python/paddle/distributed/communication/stream/gather.py | 2 +- python/paddle/distributed/parallel.py | 2 +- python/paddle/optimizer/lbfgs.py | 6 +++--- python/paddle/tensor/attribute.py | 2 +- python/paddle/tensor/creation.py | 2 +- python/paddle/vision/transforms/transforms.py | 2 +- 8 files changed, 10 insertions(+), 10 deletions(-) diff --git a/python/paddle/amp/debugging.py b/python/paddle/amp/debugging.py index 1b6e575cdbec98..8b4340a2c49359 100644 --- a/python/paddle/amp/debugging.py +++ b/python/paddle/amp/debugging.py @@ -88,7 +88,7 @@ def check_layer_numerics(func): ... return x @ self._w + self._b ... >>> dtype = 'float32' - >>> x = paddle.rand([10, 2, 2], dtype=dtype) # type: ignore + >>> x = paddle.rand([10, 2, 2], dtype=dtype) # type: ignore[arg-type] >>> model = MyLayer(dtype) >>> x[0] = float(0) >>> loss = model(x) diff --git a/python/paddle/base/layers/math_op_patch.py b/python/paddle/base/layers/math_op_patch.py index 1dd3c19b44d9aa..dfd4c802d89f96 100644 --- a/python/paddle/base/layers/math_op_patch.py +++ b/python/paddle/base/layers/math_op_patch.py @@ -352,7 +352,7 @@ def astype(self, dtype): >>> import paddle >>> import numpy as np - >>> x = np.ones([2, 2], np.float32) + >>> x = np.ones([2, 2], np.float32) # type: ignore[var-annotated] >>> with base.dygraph.guard(): ... original_variable = paddle.to_tensor(x) ... print("original var's dtype is: {}, numpy dtype is {}".format(original_variable.dtype, original_variable.numpy().dtype)) diff --git a/python/paddle/distributed/communication/stream/gather.py b/python/paddle/distributed/communication/stream/gather.py index c0405ec696bc0e..45b86b0215e0f8 100644 --- a/python/paddle/distributed/communication/stream/gather.py +++ b/python/paddle/distributed/communication/stream/gather.py @@ -83,7 +83,7 @@ def gather( >>> import paddle.distributed as dist >>> dist.init_parallel_env() - >>> gather_list = [] + >>> gather_list = [] # type: ignore[var-annotated] >>> if dist.get_rank() == 0: ... data = paddle.to_tensor([1, 2, 3]) ... dist.stream.gather(data, gather_list, dst=0) diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index 0d905b4f5d9856..791f8834c37a62 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -334,7 +334,7 @@ class DataParallel(layers.Layer): ... model = paddle.DataParallel(model) ... opt = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters()) ... for step in range(10): - ... x_data = numpy.random.randn(2, 2).astype(numpy.float32) + ... x_data = numpy.random.randn(2, 2).astype(numpy.float32) # type: ignore[var-annotated] ... x = paddle.to_tensor(x_data) ... x.stop_gradient = False ... 
# step 1 : skip gradient synchronization by 'no_sync' diff --git a/python/paddle/optimizer/lbfgs.py b/python/paddle/optimizer/lbfgs.py index a0198048ecfea0..5a41e119f08bf6 100644 --- a/python/paddle/optimizer/lbfgs.py +++ b/python/paddle/optimizer/lbfgs.py @@ -399,10 +399,10 @@ class LBFGS(Optimizer): >>> paddle.disable_static() >>> np.random.seed(0) - >>> np_w = np.random.rand(1).astype(np.float32) # type: ignore - >>> np_x = np.random.rand(1).astype(np.float32) # type: ignore + >>> np_w = np.random.rand(1).astype(np.float32) # type: ignore[var-annotated] + >>> np_x = np.random.rand(1).astype(np.float32) # type: ignore[var-annotated] - >>> inputs = [np.random.rand(1).astype(np.float32) for i in range(10)] # type: ignore + >>> inputs = [np.random.rand(1).astype(np.float32) for i in range(10)] # type: ignore[var-annotated] >>> # y = 2x >>> targets = [2 * x for x in inputs] diff --git a/python/paddle/tensor/attribute.py b/python/paddle/tensor/attribute.py index 2a0f4f5df2eed1..d4d35bcb1e05a6 100644 --- a/python/paddle/tensor/attribute.py +++ b/python/paddle/tensor/attribute.py @@ -102,7 +102,7 @@ def shape(input: Tensor) -> Tensor: >>> exe = paddle.static.Executor(paddle.CPUPlace()) >>> exe.run(paddle.static.default_startup_program()) - >>> img = np.ones((3, 100, 100)).astype(np.float32) # type: ignore + >>> img = np.ones((3, 100, 100)).astype(np.float32) # type: ignore[var-annotated] >>> res = exe.run(paddle.static.default_main_program(), feed={'x':img}, fetch_list=[output]) >>> print(res) diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index 506525d1e2e49c..8e6635a641f623 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -2483,7 +2483,7 @@ def assign(x: TensorLike, output: paddle.Tensor | None = None) -> paddle.Tensor: [2.5 2.5]] >>> array = np.array([[1, 1], [3, 4], [1, 3]]).astype( ... np.int64 - ... ) # type: ignore + ... ) # type: ignore[var-annotated] >>> result1 = paddle.zeros(shape=[3, 3], dtype='float32') >>> paddle.assign(array, result1) >>> print(result1.numpy()) diff --git a/python/paddle/vision/transforms/transforms.py b/python/paddle/vision/transforms/transforms.py index bf5fc470e87f3b..2e25cbc76e1643 100644 --- a/python/paddle/vision/transforms/transforms.py +++ b/python/paddle/vision/transforms/transforms.py @@ -232,7 +232,7 @@ class BaseTransform(_Transform[_InputT, _RetT]): ... else: ... raise TypeError("Unexpected type {}".format(type(img))) ... - >>> class CustomRandomFlip(BaseTransform): # type: ignore + >>> class CustomRandomFlip(BaseTransform): # type: ignore[type-arg] ... def __init__(self, prob=0.5, keys=None): ... super().__init__(keys) ... 
self.prob = prob From c517ecd766a2ebcf774f572c879b084a810c4f6d Mon Sep 17 00:00:00 2001 From: Qi Li Date: Thu, 4 Jul 2024 15:26:25 +0800 Subject: [PATCH 14/16] [Dockerfile][DCU][XPU] add develop dockerfile for dcu and xpu (#65654) * [DCU][XPU] add develop dockerfile for dcu and xpu * update comments --- tools/dockerfile/Dockerfile.develop.dtk | 108 +++++++++++++++++ tools/dockerfile/Dockerfile.develop.xre | 113 +++++++++++++++++ tools/dockerfile/Dockerfile.rocm | 153 ------------------------ 3 files changed, 221 insertions(+), 153 deletions(-) create mode 100644 tools/dockerfile/Dockerfile.develop.dtk create mode 100644 tools/dockerfile/Dockerfile.develop.xre delete mode 100644 tools/dockerfile/Dockerfile.rocm diff --git a/tools/dockerfile/Dockerfile.develop.dtk b/tools/dockerfile/Dockerfile.develop.dtk new file mode 100644 index 00000000000000..20a7390f38de63 --- /dev/null +++ b/tools/dockerfile/Dockerfile.develop.dtk @@ -0,0 +1,108 @@ +# Docker Image for PaddlePaddle Hygon DCU2 + +FROM sugonhub/kylin:v10-dev +LABEL maintainer="PaddlePaddle Authors " + +RUN yum install -y bzip2-devel openssh-server elfutils-devel diffutils libtool iproute \ + blas-devel lapack-devel make git patch unzip bison hostname yasm libsndfile-devel \ + automake which file net-tools zlib-devel libffi-devel vim tk-devel tkinter rpm-build \ + sqlite-devel xz-devel wget curl-devel initscripts mesa-libGL numactl-devel pcre-devel \ + openssl-devel libjpeg-turbo-devel libpng-devel ninja-build pciutils libzstd-devel \ + gcc gcc-c++ gcc-gfortran + +# workdir +WORKDIR /opt + +# cmake 3.27.7 +RUN wget -q https://cmake.org/files/v3.27/cmake-3.27.7-linux-x86_64.sh && \ + chmod +x cmake-3.27.7-linux-x86_64.sh && mkdir -p /opt/cmake-3.27.7 && \ + ./cmake-3.27.7-linux-x86_64.sh --prefix=/opt/cmake-3.27.7 --skip-license && \ + rm -rf cmake-3.27.7-linux-x86_64.sh && rm -rf /opt/cmake +RUN rm -rf /usr/bin/cmake /usr/bin/cmake3 && \ + ln -s /opt/cmake-3.27.7/bin/cmake /usr/bin/cmake && + ln -s /opt/cmake-3.27.7/bin/cmake /usr/bin/cmake3 +ENV PATH=/opt/cmake-3.27.7/bin:${PATH} + +# Python 3.10.14 +RUN wget -q https://www.python.org/ftp/python/3.10.14/Python-3.10.14.tgz && \ + tar xzf Python-3.10.14.tgz && cd Python-3.10.14 && \ + CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \ + make -j16 > /dev/null && make altinstall > /dev/null && ldconfig && \ + cd ../ && rm -rf Python-3.10.14 && rm -rf Python-3.10.14.tgz +ENV LD_LIBRARY_PATH=/usr/local/lib:${LD_LIBRARY_PATH} +ENV CPLUS_INCLUDE_PATH=/usr/local/include/python3.10:${CPLUS_INCLUDE_PATH} + +# create venv and activate +RUN /usr/local/bin/python3.10 -m venv /opt/py310 +# update env +ENV PATH=/opt/py310/bin:$PATH +RUN echo "source /opt/py310/bin/activate" >> /root/.bashrc +# upgrade pip +RUN pip install --upgrade pip setuptools wheel + +# install pylint and pre-commit +RUN pip install pre-commit==2.17.0 pylint pytest astroid isort coverage qtconsole distro +RUN pip install attrs pyyaml pathlib2 scipy requests psutil Cython clang-format==13.0.0 PyGithub + +# install Paddle requirement +RUN wget https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/python/requirements.txt -O requirements.txt && \ + pip install -r requirements.txt && rm -rf requirements.txt +RUN wget https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/python/unittest_py/requirements.txt -O requirements.txt && \ + pip install -r requirements.txt && rm -rf requirements.txt + +# git credential to skip password typing +RUN git config --global credential.helper store && \ + 
git config --global pull.rebase false + +# Fix locales to en_US.UTF-8 +RUN yum -y install glibc-locale-source glibc-langpack-en +RUN localedef -i en_US -f UTF-8 en_US.UTF-8 + +# patchelf 0.14.5 - https://github.com/NixOS/patchelf/pull/216 +RUN wget -q https://github.com/NixOS/patchelf/archive/refs/tags/0.14.5.tar.gz && \ + tar xzf 0.14.5.tar.gz && cd patchelf-0.14.5 && \ + ./bootstrap.sh > /dev/null && ./configure > /dev/null && \ + make -j16 > /dev/null && make install > /dev/null && \ + cd .. && rm -rf patchelf-0.14.5 && rm -rf 0.14.5.tar.gz + +# ccache 4.6.3 +RUN wget -q https://github.com/ccache/ccache/releases/download/v4.6.3/ccache-4.6.3.tar.gz && \ + tar xf ccache-4.6.3.tar.gz && mkdir /usr/local/ccache-4.6.3 && cd ccache-4.6.3 && \ + mkdir build && cd build && \ + cmake -DCMAKE_BUILD_TYPE=Release -DREDIS_STORAGE_BACKEND=OFF \ + -DCMAKE_INSTALL_PREFIX=/usr/local/ccache-4.6.3 .. > /dev/null && \ + make -j16 > /dev/null && make install > /dev/null && \ + cd ../../ && rm -rf ccache-4.6.3.tar.gz && rm -rf ccache-4.6.3 && \ + ln -s /usr/local/ccache-4.6.3/bin/ccache /usr/local/bin/ccache +ENV CCACHE_MAXSIZE=50G \ + CCACHE_LIMIT_MULTIPLE=0.8 \ + CCACHE_SLOPPINESS=clang_index_store,time_macros,include_file_mtime + +# configure ssh +RUN sed -i "s/^#PermitRootLogin/PermitRootLogin/" /etc/ssh/sshd_config && \ + sed -i "s/^#PubkeyAuthentication/PubkeyAuthentication/" /etc/ssh/sshd_config && \ + sed -i "s/^#RSAAuthentication/RSAAuthentication/" /etc/ssh/sshd_config && \ + sed -i "s/#UseDNS .*/UseDNS no/" /etc/ssh/sshd_config +RUN ssh-keygen -A + +# yum clean +RUN yum clean all && \ + rm -rf /var/cache/yum && \ + rm -rf /var/lib/yum/yumdb && \ + rm -rf /var/lib/yum/history + +# Install DTK +RUN wget -q https://cancon.hpccube.com:65024/file/1/DTK-24.04.1/CentOS7.6/DTK-24.04.1-CentOS7.6-x86_64.tar.gz && \ + tar zxf DTK-24.04.1-CentOS7.6-x86_64.tar.gz && rm -rf DTK-24.04.1-CentOS7.6-x86_64.tar.gz +# Replace if you use other device type, e.g. 
Z100, Z100L, K100 +RUN wget -q https://paddle-device.bj.bcebos.com/dcu/hyhal-K100AI.tar.gz && \ + tar zxf hyhal-K100AI.tar.gz && rm -rf hyhal-K100AI.tar.gz +RUN echo "source /opt/dtk-24.04.1/env.sh" >> /root/.bashrc +# Disable compile warnings +RUN sed -i '74d' /opt/dtk-24.04.1/include/rocrand/rocrand_common.h + +# generate core dump +RUN echo "kernel.core_pattern=core_%e_%p_%t" >> /etc/sysctl.conf && \ + echo "kernel.core_uses_pid=0" >> /etc/sysctl.conf + +EXPOSE 22 diff --git a/tools/dockerfile/Dockerfile.develop.xre b/tools/dockerfile/Dockerfile.develop.xre new file mode 100644 index 00000000000000..b8913ad376a41f --- /dev/null +++ b/tools/dockerfile/Dockerfile.develop.xre @@ -0,0 +1,113 @@ +# Docker Image for PaddlePaddle Kunlun XPU + +FROM ubuntu:20.04 +LABEL maintainer="PaddlePaddle Authors " + +RUN apt-get update && apt-get install -y apt-utils +RUN ln -snf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends tzdata +RUN apt-get update && apt-get install -y software-properties-common && add-apt-repository ppa:deadsnakes/ppa && add-apt-repository ppa:ubuntu-toolchain-r/test +RUN apt-get update && apt-get install -y curl wget vim git unzip unrar tar ntp xz-utils libssl-dev bzip2 gzip make automake \ + coreutils language-pack-zh-hans libsm6 libxext6 libxrender-dev libgl1-mesa-glx libsqlite3-dev libopenblas-dev liblapack3 \ + bison libjpeg-dev zlib1g zlib1g-dev swig locales net-tools libtool numactl libnuma-dev liblzma-dev libbz2-dev libblas-dev \ + openssl openssh-server libffi-dev pciutils libblas3 liblapack-dev libzstd-dev default-jre libgcc-s1 gcc g++ gfortran gdb + +# workdir +WORKDIR /opt + +# GCC 8.4 +RUN apt-get install -y gcc-8 g++-8 gfortran-8 +RUN update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-8 90 && \ + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-8 90 && \ + update-alternatives --install /usr/bin/gfortran gfortran /usr/bin/gfortran-8 90 + +# cmake 3.27.7 +RUN wget -q https://cmake.org/files/v3.27/cmake-3.27.7-linux-x86_64.sh && \ + chmod +x cmake-3.27.7-linux-x86_64.sh && mkdir -p /opt/cmake-3.27.7 && \ + ./cmake-3.27.7-linux-x86_64.sh --prefix=/opt/cmake-3.27.7 --skip-license && \ + rm -rf cmake-3.27.7-linux-x86_64.sh +ENV PATH=/opt/cmake-3.27.7/bin:${PATH} + +# default python version +ARG PY_VERSION=3.10 +RUN apt-get install -y python3-distutils python${PY_VERSION} python${PY_VERSION}-dev + +# install pip +RUN curl -s -q https://bootstrap.pypa.io/get-pip.py | /usr/bin/python${PY_VERSION} + +# set default python +RUN rm -rf /usr/bin/python3 && ln -s /usr/bin/python${PY_VERSION} /usr/bin/python3 && \ + rm -rf /usr/bin/python && ln -s /usr/bin/python${PY_VERSION} /usr/bin/python + +# install pylint and pre-commit +RUN pip install pre-commit==2.17.0 pylint pytest astroid isort coverage qtconsole distro +RUN pip install attrs pyyaml pathlib2 scipy requests psutil Cython clang-format==13.0.0 + +# add more libs +RUN apt-get update && apt-get install libprotobuf-dev protobuf-compiler libprotoc-dev lsof libgeos-dev \ + pkg-config libhdf5-103 libhdf5-dev lrzsz libsndfile1 tree ninja-build -y + +# install Paddle requirement +RUN wget --no-check-certificate https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/python/requirements.txt -O requirements.txt && \ + pip install -r requirements.txt -i https://pip.baidu-int.com/simple --trusted-host pip.baidu-int.com && rm -rf requirements.txt +RUN wget --no-check-certificate 
https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/python/unittest_py/requirements.txt -O requirements.txt && \ + pip install -r requirements.txt -i https://pip.baidu-int.com/simple --trusted-host pip.baidu-int.com && rm -rf requirements.txt + +# git credential to skip password typing +RUN git config --global credential.helper store + +# Fix locales to en_US.UTF-8 +RUN localedef -i en_US -f UTF-8 en_US.UTF-8 + +# patchelf 0.14.5 - https://github.com/NixOS/patchelf/pull/216 +RUN wget -q --no-check-certificate https://github.com/NixOS/patchelf/archive/refs/tags/0.14.5.tar.gz && \ + tar xzf 0.14.5.tar.gz && cd patchelf-0.14.5 && \ + ./bootstrap.sh > /dev/null && ./configure > /dev/null && \ + make -j16 > /dev/null && make install > /dev/null && \ + cd .. && rm -rf patchelf-0.14.5 && rm -rf 0.14.5.tar.gz + +# ccache 4.6.3 +RUN wget -q https://github.com/ccache/ccache/releases/download/v4.6.3/ccache-4.6.3.tar.gz && \ + tar xf ccache-4.6.3.tar.gz && mkdir /usr/local/ccache-4.6.3 && cd ccache-4.6.3 && \ + mkdir build && cd build && \ + cmake -DCMAKE_BUILD_TYPE=Release -DREDIS_STORAGE_BACKEND=OFF \ + -DCMAKE_INSTALL_PREFIX=/usr/local/ccache-4.6.3 .. > /dev/null && \ + make -j16 > /dev/null && make install > /dev/null && \ + cd ../../ && rm -rf ccache-4.6.3.tar.gz && rm -rf ccache-4.6.3 && \ + ln -s /usr/local/ccache-4.6.3/bin/ccache /usr/local/bin/ccache +ENV CCACHE_MAXSIZE=80G \ + CCACHE_LIMIT_MULTIPLE=0.8 \ + CCACHE_SLOPPINESS=clang_index_store,time_macros,include_file_mtime + +# Install XRE 4.31.0 +ARG XRE_VERSION=4.31.0 +ARG XRE_INSTALL=/usr/local/xpu-${XRE_VERSION} +RUN wget -q https://klx-sdk-release-public.su.bcebos.com/xre/release/${XRE_VERSION}.1/xre-ubuntu_2004_x86_64.tar.gz && \ + tar -zxf xre-ubuntu_2004_x86_64.tar.gz && \ + mkdir -p ${XRE_INSTALL} && \ + cp -af /opt/xre-ubuntu_2004_x86_64/bin/ ${XRE_INSTALL}/ && \ + cp -af /opt/xre-ubuntu_2004_x86_64/include/ ${XRE_INSTALL}/ && \ + cp -af /opt/xre-ubuntu_2004_x86_64/tools/ ${XRE_INSTALL}/ && \ + cp -af /opt/xre-ubuntu_2004_x86_64/version.txt ${XRE_INSTALL}/ && \ + mkdir -p ${XRE_INSTALL}/lib64 && \ + cp -af /opt/xre-ubuntu_2004_x86_64/lib/* ${XRE_INSTALL}/lib64/ && \ + cp -af /opt/xre-ubuntu_2004_x86_64/so/* ${XRE_INSTALL}/lib64/ && \ + ln -sf ${XRE_INSTALL} /usr/local/xpu && \ + ln -sf ${XRE_INSTALL}/bin/xpu_smi /usr/local/bin/xpu_smi && \ + rm -rf xre-ubuntu_2004_x86_64.tar.gz && rm -rf xre-ubuntu_2004_x86_64/ +ENV PATH=${XRE_INSTALL}/bin:$PATH + +# Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service +RUN mkdir /var/run/sshd && echo 'root:root' | chpasswd && \ + sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config && \ + sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config +CMD source ~/.bashrc + +# /proc/sys/kernel/core_pattern +RUN mkdir -p /var/core + +# Clean +RUN apt-get clean -y +RUN pip cache purge + +EXPOSE 22 diff --git a/tools/dockerfile/Dockerfile.rocm b/tools/dockerfile/Dockerfile.rocm deleted file mode 100644 index 9b3e6c4b2f123d..00000000000000 --- a/tools/dockerfile/Dockerfile.rocm +++ /dev/null @@ -1,153 +0,0 @@ -# A image for building paddle binaries -# Use rocm-terminal base image for both rocm environment -# When you modify it, please be aware of rocm version -# -# Build: ROCM 4.0.1 -# cd Paddle/tools/dockerfile -# docker build -f Dockerfile.rocm \ -# -t paddlepaddle/paddle-centos-rocm401-dev:latest . 
-# -# docker run -it --device=/dev/kfd --device=/dev/dri \ -# --security-opt seccomp=unconfined --group-add video \ -# paddlepaddle/paddle-centos-rocm401-dev:latest /bin/bash - -FROM centos:7.8.2003 -MAINTAINER PaddlePaddle Authors - -ENV LC_ALL en_US.UTF-8 -ENV LANG en_US.UTF-8 -ENV LANGUAGE en_US.UTF-8 - -RUN yum install -y epel-release deltarpm sudo openssh-server gettext-devel sqlite-devel \ - zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz graphviz wget curl-devel \ - make bzip2 git patch unzip bison yasm diffutils automake which file kernel-headers kernel-devel \ - net-tools numactl-devel chrpath screen initscripts - -# Install devtoolset-7 -RUN yum install -y yum-utils centos-release-scl && \ - yum-config-manager --enable rhel-server-rhscl-7-rpms && \ - yum-config-manager --enable rhel-7-server-rpms && \ - yum-config-manager --enable rhel-7-server-optional-rpms && \ - INSTALL_PKGS="devtoolset-7-binutils devtoolset-7-gcc devtoolset-7-gcc-c++ devtoolset-7-gcc-gfortran devtoolset-7-gdb" && \ - yum install -y --setopt=tsflags=nodocs $INSTALL_PKGS && \ - rpm -V $INSTALL_PKGS && \ - yum -y clean all --enablerepo='*' -ENV PATH=/opt/rh/devtoolset-7/root/usr/bin:$PATH -ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-7/root/usr/lib64:/opt/rh/devtoolset-7/root/usr/lib:$LD_LIBRARY_PATH -RUN echo "source scl_source enable devtoolset-7" > "/etc/profile.d/devtoolset-7.sh" - -# cmake 3.16.0 -WORKDIR /opt -RUN wget -q https://cmake.org/files/v3.18/cmake-3.18.0-Linux-x86_64.tar.gz && \ - tar -zxvf cmake-3.18.0-Linux-x86_64.tar.gz && rm cmake-3.18.0-Linux-x86_64.tar.gz && \ - mv cmake-3.18.0-Linux-x86_64 cmake-3.16 -ENV PATH=/opt/cmake-3.18/bin:${PATH} - -# ROCM -RUN yum install -y kmod wget openblas-devel epel-release -RUN echo "[ROCm]" > /etc/yum.repos.d/rocm.repo && \ - echo "name=ROCm" >> /etc/yum.repos.d/rocm.repo && \ - echo "baseurl=http://repo.radeon.com/rocm/yum/4.0.1" >> /etc/yum.repos.d/rocm.repo && \ - echo "enabled=1" >> /etc/yum.repos.d/rocm.repo && \ - echo "gpgcheck=0" >> /etc/yum.repos.d/rocm.repo -RUN yum install -y rocm-dev rocm-utils rocfft miopen-hip rocblas hipsparse rocrand rccl hipcub rocthrust rocprofiler-dev roctracer-dev -# fix rocthrust -RUN sed -i '21 a #include ' /opt/rocm/include/thrust/system/hip/detail/error.inl -# export ROCM env -ENV ROCM_PATH=/opt/rocm -ENV HIP_PATH=/opt/rocm/hip -ENV HIP_CLANG_PATH=/opt/rocm/llvm/bin -ENV PATH=/opt/rocm/bin:$PATH -ENV PATH=/opt/rocm/opencl/bin:$PATH -ENV LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH - -# git 2.17.1 -RUN cd /opt && wget -q https://paddle-ci.gz.bcebos.com/git-2.17.1.tar.gz && \ - tar -xvf git-2.17.1.tar.gz && \ - cd git-2.17.1 && \ - ./configure --with-openssl --prefix=/usr/local && \ - make -j8 && make install && \ - cd .. && rm -rf git-2.17.1.tar.gz && rm -rf git-2.17.1 - -ENV GOROOT=/usr/local/go -ENV GOPATH=/root/gopath -ENV PATH=${GOROOT}/bin:${GOPATH}/bin:${PATH} - -# go 1.8.1 -RUN wget --no-check-certificate -qO- https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \ - tar -xz -C /usr/local && \ - mkdir /root/gopath && \ - mkdir /root/gopath/bin && \ - mkdir /root/gopath/src - -# protobuf 3.6.1 -RUN cd /opt && wget -q --no-check-certificate https://paddle-ci.cdn.bcebos.com/protobuf-cpp-3.6.1.tar.gz && \ - tar xzf protobuf-cpp-3.6.1.tar.gz && \ - cd protobuf-3.6.1 && ./configure && make -j4 && make install && \ - cd .. 
&& rm -f protobuf-cpp-3.6.1.tar.gz && rm -rf protobuf-3.6.1 - -# conda -ENV CONDA_FILE=Miniconda3-py38_23.10.0-1-Linux-x86_64.sh -RUN cd /opt && wget https://repo.anaconda.com/miniconda/${CONDA_FILE} && chmod +x ${CONDA_FILE} -RUN mkdir /opt/conda && ./${CONDA_FILE} -b -f -p "/opt/conda" && rm -rf ${CONDA_FILE} -ENV PATH=/opt/conda/bin:${PATH} -RUN conda init bash && conda install -n base jupyter jupyterlab - -# install pytest and pre-commit -RUN /opt/conda/bin/pip install pre-commit pytest protocol PyGithub - -# install Paddle requirement -RUN wget https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/python/requirements.txt -O /root/requirements.txt -RUN /opt/conda/bin/pip install -r /root/requirements.txt && \ - rm -rf /root/requirements.txt - -RUN wget https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/python/unittest_py/requirements.txt -O /root/requirements.txt -RUN /opt/conda/bin/pip install -r /root/requirements.txt && rm -rf /root/requirements.txt - -# install PaddleClas requirement -RUN wget https://raw.githubusercontent.com/PaddlePaddle/PaddleClas/develop/requirements.txt -O /root/requirements.txt -RUN /opt/conda/bin/pip install -r /root/requirements.txt && rm -rf /root/requirements.txt - -# install PaddleDetection requirement -RUN wget https://raw.githubusercontent.com/PaddlePaddle/PaddleDetection/develop/requirements.txt -O /root/requirements.txt -RUN /opt/conda/bin/pip install -r /root/requirements.txt && rm -rf /root/requirements.txt - -# configure ssh -RUN sed -i "s/^#PermitRootLogin/PermitRootLogin/" /etc/ssh/sshd_config && \ - sed -i "s/^#PubkeyAuthentication/PubkeyAuthentication/" /etc/ssh/sshd_config && \ - sed -i "s/^#RSAAuthentication/RSAAuthentication/" /etc/ssh/sshd_config - -# clang-format 3.8 -RUN wget https://copr.fedorainfracloud.org/coprs/alonid/llvm-3.8.0/repo/epel-7/alonid-llvm-3.8.0-epel-7.repo -P /etc/yum.repos.d/ -RUN yum install -y clang-3.8.0 -ENV PATH=/opt/llvm-3.8.0/bin:${PATH} - -# patchelf -RUN yum install -y patchelf && \ - yum clean all && \ - rm -rf /var/cache/yum && \ - rm -rf /var/lib/yum/yumdb && \ - rm -rf /var/lib/yum/history - -# swig 2.0.12 -RUN wget -O /opt/swig-2.0.12.tar.gz https://sourceforge.net/projects/swig/files/swig/swig-2.0.12/swig-2.0.12.tar.gz/download && \ - cd /opt && tar xzf swig-2.0.12.tar.gz && cd /opt/swig-2.0.12 && ./configure && make && make install && \ - cd /opt && rm swig-2.0.12.tar.gz && rm -rf swig-2.0.12 - -# ccache 3.7.9 -RUN cd /opt && wget https://paddle-ci.gz.bcebos.com/ccache-3.7.9.tar.gz && \ - tar xf ccache-3.7.9.tar.gz && mkdir /usr/local/ccache-3.7.9 && cd ccache-3.7.9 && \ - ./configure -prefix=/usr/local/ccache-3.7.9 && \ - make -j8 && make install && \ - ln -s /usr/local/ccache-3.7.9/bin/ccache /usr/local/bin/ccache && \ - cd .. 
&& rm -rf ccache-3.7.9.tar.gz && rm -rf ccache-3.7.9 - -# configure ssh -RUN sed -i "s/^#PermitRootLogin/PermitRootLogin/" /etc/ssh/sshd_config && \ - sed -i "s/^#PubkeyAuthentication/PubkeyAuthentication/" /etc/ssh/sshd_config && \ - sed -i "s/^#RSAAuthentication/RSAAuthentication/" /etc/ssh/sshd_config && \ - sed -i "s/#UseDNS .*/UseDNS no/" /etc/ssh/sshd_config - -RUN ssh-keygen -A - -EXPOSE 22 From 9ce1226330b935c8b4ab7353bf01f089ef497c6d Mon Sep 17 00:00:00 2001 From: WangZhen <23097963+0x45f@users.noreply.github.com> Date: Thu, 4 Jul 2024 15:37:59 +0800 Subject: [PATCH 15/16] [CINN]Fix AddNOpInferSymbolicShape check (#65672) --- .../fluid/pir/dialect/operator/ir/manual_op.cc | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_op.cc b/paddle/fluid/pir/dialect/operator/ir/manual_op.cc index 3e81ae4cfe69f5..0f08f02fa2a6e9 100644 --- a/paddle/fluid/pir/dialect/operator/ir/manual_op.cc +++ b/paddle/fluid/pir/dialect/operator/ir/manual_op.cc @@ -252,7 +252,6 @@ bool AddNOpInferSymbolicShape(pir::Operation *op, "should be larger than 0. But received X's dimensions %d.", inputs_shape.size())); symbol::TensorShapeOrDataDimExprs candidate_shape = inputs_shape.front(); - size_t candidate_idx = 0; for (size_t i = 1; i < inputs_shape.size(); ++i) { // 0D tensor if (inputs_shape[i].shape().size() == 0) { @@ -260,19 +259,12 @@ bool AddNOpInferSymbolicShape(pir::Operation *op, } if (candidate_shape.shape().size() == 0) { candidate_shape = inputs_shape[i]; - candidate_idx = i; continue; } - PADDLE_ENFORCE_EQ(candidate_shape, - inputs_shape[i], - common::errors::InvalidArgument( - "The input tensor X of AddNOp must" - " have same shape. But received X[%d]'s shape = " - "[%s], X[%d]'s shape = [%s].", - candidate_idx, - candidate_shape, - i, - inputs_shape[i])); + for (size_t j = 0; j < candidate_shape.shape().size(); ++j) { + infer_context->AddEqualCstr(candidate_shape.shape()[j], + inputs_shape[i].shape()[j]); + } } infer_context->SetShapeOrDataForValue( op->result(0), symbol::ShapeOrDataDimExprs{candidate_shape}); From 7f88ea356b675ece47ce8b2b46336214881fcdcb Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Thu, 4 Jul 2024 15:43:09 +0800 Subject: [PATCH 16/16] update ci case for auto_parallel, change model from gpt to llama (#65676) --- tools/auto_parallel/ci_auto_parallel.sh | 32 ++++-------------------- tools/auto_parallel/target_path_lists.sh | 7 +----- 2 files changed, 6 insertions(+), 33 deletions(-) diff --git a/tools/auto_parallel/ci_auto_parallel.sh b/tools/auto_parallel/ci_auto_parallel.sh index 2fbb47ec371124..6145eaf42e9169 100644 --- a/tools/auto_parallel/ci_auto_parallel.sh +++ b/tools/auto_parallel/ci_auto_parallel.sh @@ -68,7 +68,7 @@ for file_name in `git diff --numstat upstream/${AGILE_COMPILE_BRANCH} |awk '{pri # while the other tests of llama model will be executed in PR-CI-Auto-Parallel. 
for ((i=0; i<${#target_lists_for_semi_auto_ci[@]}; i++)); do if [[ $i != ${test_auto_num} ]] && [[ ${file_item} == *${target_lists_for_semi_auto_ci[i]}* ]];then - case_list[${#case_list[*]}]=gpt-3_auto + case_list[${#case_list[*]}]=llama_auto case_list[${#case_list[*]}]="llama_auto_unit_test" break elif [[ $i == ${test_auto_num} ]] && [[ ${file_item} == *${target_lists_for_semi_auto_ci[i]}* ]];then @@ -78,14 +78,6 @@ for file_name in `git diff --numstat upstream/${AGILE_COMPILE_BRANCH} |awk '{pri continue fi done - for ((i=0; i<${#target_lists_for_pir_ci[@]}; i++)); do - if [[ ${file_item} == *${target_lists_for_pir_ci[i]}* ]];then - case_list[${#case_list[*]}]=gpt-3_auto_pir - break - else - continue - fi - done # The dynamic unittests have been monitored in PR-CI-Distribute-stable # and will be no longer redundantly executed in PR-CI-Auto-Parallel. for ((i=0; i<${#target_lists_for_dygraph_ci[@]}; i++)); do @@ -120,14 +112,6 @@ fi get_diff_TO_case # Remove duplicates and store the results back to the original list -#################### -if [[ "${case_list[*]}" == *"gpt-3_auto"* ]] && [[ "${case_list[*]}" == *"gpt-3_auto_pir"* ]]; then - echo "同时命中gpt-3_auto 和 gpt-3_auto_pir, 只执行新ir, 不执行旧ir" - case_list=("${case_list[@]/*gpt-3_auto_pir*/}") - case_list=("${case_list[@]/*gpt-3_auto*/}") - case_list[${#case_list[*]}]=gpt-3_auto_pir - echo ${case_list[*]} -fi #################### case_list=($(awk -v RS=' ' '!a[$1]++' <<< ${case_list[*]})) if [[ ${#case_list[*]} -ne 0 ]];then @@ -142,17 +126,11 @@ if [[ ${#case_list[*]} -ne 0 ]];then export FLAGS_install_deps=0 for case in ${case_list[*]};do echo -e "\033[31m ---- running case $case_num/${#case_list[*]}: ${case} \033" - if [[ ${case} == "gpt-3_auto" ]];then - bash /workspace/PaddleNLP/scripts/distribute/ci_case_auto.sh gpt_case_list_auto $FLAGS_install_deps $FLAGS_download_data - print_info $? `ls -lt ${log_path} | grep "gpt" | grep -v "pir" | head -n 1 | awk '{print $9}'` ${case} - export FLAGS_install_deps=1 - export FLAGS_download_data="gpt ""$FLAGS_download_data" - let case_num++ - elif [[ ${case} == "gpt-3_auto_pir" ]];then - bash /workspace/PaddleNLP/scripts/distribute/ci_case_auto.sh gpt_case_list_auto_pir $FLAGS_install_deps $FLAGS_download_data - print_info $? `ls -lt ${log_path} | grep "pir" | head -n 1 | awk '{print $9}'` ${case} + if [[ ${case} == "llama_auto" ]];then + bash /workspace/PaddleNLP/scripts/distribute/ci_case_auto.sh llama_case_list_auto $FLAGS_install_deps $FLAGS_download_data + print_info $? `ls -lt ${log_path} | grep "llama" | head -n 1 | awk '{print $9}'` ${case} export FLAGS_install_deps=1 - export FLAGS_download_data="gpt ""$FLAGS_download_data" + export FLAGS_download_data="llama ""$FLAGS_download_data" let case_num++ elif [[ ${case} == "auto_unit_test" ]];then bash /workspace/Paddle/tools/auto_parallel/ci_case_unit.sh auto_unit_test diff --git a/tools/auto_parallel/target_path_lists.sh b/tools/auto_parallel/target_path_lists.sh index 033479e7d9a576..fdf5419aafb053 100644 --- a/tools/auto_parallel/target_path_lists.sh +++ b/tools/auto_parallel/target_path_lists.sh @@ -10,7 +10,7 @@ # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and +# See the License for the specific language governing permissions and # limitations under the License. 
target_lists_for_semi_auto_ci=( @@ -25,15 +25,10 @@ target_lists_for_semi_auto_ci=( "paddle/phi/api/generator/dist_bw_api_gen.py" "tools/auto_parallel/target_path_lists.sh" "test/auto_parallel" -) - -target_lists_for_pir_ci=( - "paddle/fluid/framework/new_executor" "paddle/fluid/ir_adaptor/" "paddle/fluid/pir/dialect" "paddle/fluid/pir/transforms" "paddle/pir" - "tools/auto_parallel/target_path_lists.sh" ) target_lists_for_dygraph_ci=(