diff --git a/.github/workflows/Slice-baseline.yml b/.github/workflows/Slice-baseline.yml index ca544625a45eb9..4ab346a7a2a4dc 100644 --- a/.github/workflows/Slice-baseline.yml +++ b/.github/workflows/Slice-baseline.yml @@ -2,6 +2,13 @@ name: Slice-baseline-paddle on: workflow_dispatch: + inputs: + PR_ID: + required: false + type: string + COMMIT_ID: + required: false + type: string schedule: - cron: '0 20 * * 0' @@ -43,3 +50,5 @@ jobs: docker_build_image: ${{ needs.build-docker.outputs.docker_build_image }} slice-check: 'true' SLICE_TEST_MODE: insert_baseline + MANUALLY_PR_ID: ${{ inputs.PR_ID }} + MANUALLY_COMMIT_ID: ${{ inputs.COMMIT_ID }} diff --git a/.github/workflows/_Linux-XPU.yml b/.github/workflows/_Linux-XPU.yml index cef20d6123de01..0991952dc629f8 100644 --- a/.github/workflows/_Linux-XPU.yml +++ b/.github/workflows/_Linux-XPU.yml @@ -206,7 +206,7 @@ jobs: CCACHE_DIR: /root/.ccache CCACHE_MAXSIZE: 150G CCACHE_LIMIT_MULTIPLE: 0.8 - IF_KUNLUN3: "OFF" + IF_KUNLUN3: "ON" GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }} home_dir: ${{ github.workspace }}/../../../.. FLAGS_use_stride_kernel: "0" diff --git a/.github/workflows/_Slice.yml b/.github/workflows/_Slice.yml index 8f74843fb313e9..bbc32719c36e95 100644 --- a/.github/workflows/_Slice.yml +++ b/.github/workflows/_Slice.yml @@ -20,6 +20,12 @@ on: type: string required: false default: 'paddle' + MANUALLY_PR_ID: + type: string + required: false + MANUALLY_COMMIT_ID: + type: string + required: false env: PR_ID: ${{ github.event.pull_request.number || '0' }} @@ -47,6 +53,7 @@ jobs: slice: name: Slice test + needs: check-bypass if: ${{ inputs.can-skip != 'true' && needs.check-bypass.outputs.can-skip != 'true' }} runs-on: group: slice @@ -105,7 +112,11 @@ jobs: if [[ "${{ inputs.SLICE_BENCHMARK_FRAMEWORKS }}" == "torch" ]];then python3.10 -m pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu118 else - python3.10 -m pip install $wheel_link + if [[ "${{ inputs.MANUALLY_PR_ID }}" == "" ]]; then + python3.10 -m pip install $wheel_link + else + python3.10 -m pip install https://paddle-github-action.bj.bcebos.com/PR/build/${{ inputs.MANUALLY_PR_ID }}/${{ inputs.MANUALLY_COMMIT_ID }}/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl + fi fi python3.10 -m pip install -r PaddleTest/framework/e2e/api_benchmark/requirement.txt cd PaddleTest/framework/slice_benchmark diff --git a/ci/check_approval.sh b/ci/check_approval.sh index 29fc804fa37452..f846d8a01d0f7d 100644 --- a/ci/check_approval.sh +++ b/ci/check_approval.sh @@ -309,6 +309,12 @@ if [ "${ALL_PADDLE_ENFORCE}" != "" ] && [ "${PR_ID}" != "" ]; then check_approval 1 luotao1 zhangbo9674 phlrain fi +CHINESE_CHECK=$(git diff -U0 upstream/$BRANCH |grep "^+" |grep -P '[\p{Han}]') +if [ "${CHINESE_CHECK}" != "" ] && [ "${PR_ID}" != "" ]; then + echo_line="Not recommended to use Chinese. You must have one RD (tianshuo78520a or swgu98 or zhangbo9674 or risemeup1) approval." 
+ check_approval 1 tianshuo78520a swgu98 zhangbo9674 risemeup1 +fi + ALL_ADDED_LINES=$(git diff -U0 upstream/$BRANCH |grep "^+" || true) ALL_PADDLE_CHECK=$(echo $ALL_ADDED_LINES |grep -zoE "(PADDLE_ENFORCE[A-Z_]{0,9}|PADDLE_THROW)\(.[^,\);]*.[^;]*\);\s" || true) VALID_PADDLE_CHECK=$(echo "$ALL_PADDLE_CHECK" | grep -zoE '(PADDLE_ENFORCE[A-Z_]{0,9}|PADDLE_THROW)\(([^,;]+,)*[^";]*errors::.[^"]*".[^";]{20,}.[^;]*\);\s' || true) diff --git a/ci/kunlun_test.sh b/ci/kunlun_test.sh index e782a4b9787850..817099f8a342d7 100644 --- a/ci/kunlun_test.sh +++ b/ci/kunlun_test.sh @@ -159,9 +159,10 @@ set +x git clone --depth 1000 https://gitee.com/paddlepaddle/PaddleX.git cd PaddleX pip install -e . + pip install numpy==1.24.4 pypdfium2 #install paddle x dependency - paddlex --install PaddleClas + paddlex --install PaddleClas -y #download paddle dataset wget -q https://paddle-model-ecology.bj.bcebos.com/paddlex/data/cls_flowers_examples.tar -P ./dataset diff --git a/cmake/cinn.cmake b/cmake/cinn.cmake index aaa86f50aa8faf..6cf2ffe32881a1 100644 --- a/cmake/cinn.cmake +++ b/cmake/cinn.cmake @@ -26,12 +26,21 @@ else() endif() if(NOT DEFINED ENV{runtime_include_dir}) - message( - STATUS - "set runtime_include_dir: ${CMAKE_SOURCE_DIR}/paddle/cinn/runtime/cuda") - set(ENV{runtime_include_dir} "${CMAKE_SOURCE_DIR}/paddle/cinn/runtime/cuda") - add_definitions( - -DRUNTIME_INCLUDE_DIR="${CMAKE_SOURCE_DIR}/paddle/cinn/runtime/cuda") + if(WITH_GPU) + message( + STATUS + "set runtime_include_dir: ${CMAKE_SOURCE_DIR}/paddle/cinn/runtime/cuda") + set(ENV{runtime_include_dir} "${CMAKE_SOURCE_DIR}/paddle/cinn/runtime/cuda") + add_definitions( + -DRUNTIME_INCLUDE_DIR="${CMAKE_SOURCE_DIR}/paddle/cinn/runtime/cuda") + elseif(WITH_ROCM) + message( + STATUS + "set runtime_include_dir: ${CMAKE_SOURCE_DIR}/paddle/cinn/runtime/hip") + set(ENV{runtime_include_dir} "${CMAKE_SOURCE_DIR}/paddle/cinn/runtime/hip") + add_definitions( + -DRUNTIME_INCLUDE_DIR="${CMAKE_SOURCE_DIR}/paddle/cinn/runtime/hip") + endif() endif() if(WITH_TESTING) @@ -118,6 +127,10 @@ if(WITH_ROCM) add_definitions(-DCINN_WITH_HIP) endif() link_libraries(${ROCM_HIPRTC_LIB}) + + message( + STATUS "copy paddle/cinn/common/float16.h to $ENV{runtime_include_dir}") + file(COPY paddle/cinn/common/float16.h DESTINATION $ENV{runtime_include_dir}) endif() set(cinnapi_src CACHE INTERNAL "" FORCE) diff --git a/doc/README_cn.md b/doc/README_cn.md new file mode 100644 index 00000000000000..cb643ee6e9ac02 --- /dev/null +++ b/doc/README_cn.md @@ -0,0 +1,6 @@ +# To Readers and Developers +Thank you for reading the PaddlePaddle documentation. + +As of **September 17, 2018**, the documentation source for the **0.15.0 and develop** branches has been migrated to the [FluidDoc Repo](https://github.com/PaddlePaddle/FluidDoc) repository, where it will be continuously updated. + +Please visit the FluidDoc repository for the latest documentation. diff --git a/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc index 3ebf18825a7b0a..369da2ba855b9e 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.cc @@ -742,9 +742,7 @@ class SplitOpPattern : public pir::OpRewritePattern { using pir::OpRewritePattern::OpRewritePattern; bool Match(paddle::dialect::SplitOp op) const override { - const bool is_denied = CompatibleInfo::IsDeniedForCinn(*op.operation()); - - return !is_denied && PatternConstraint(op); + return PatternConstraint(op); } void Rewrite(paddle::dialect::SplitOp op, diff --git a/paddle/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh b/paddle/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh index 
cbfe4e05c09ad9..10fa55bb051c6b 100644 --- a/paddle/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh +++ b/paddle/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh @@ -338,7 +338,7 @@ extern "C" { __device__ inline int FN_INT32(pow)(int a, int b) { if (a == 0 && b < 0) { - return -1; + return 0; } float res = pow(__int2float_rd(a), __int2float_rd(b)); return __float2int_rn(res); @@ -418,6 +418,9 @@ __device__ inline long long int FN_INT64(exp)(long long int a) { __device__ inline long long int FN_INT64(pow)(long long int a, long long int b) { + if (a == 0 && b < 0) { + return 0; + } double res = pow(__ll2double_rd(a), __ll2double_rd(b)); return __double2ll_rn(res); } diff --git a/paddle/common/layout.h b/paddle/common/layout.h index 4c2fb90794eb52..016c8b828c72e5 100644 --- a/paddle/common/layout.h +++ b/paddle/common/layout.h @@ -85,6 +85,8 @@ inline DataLayout StringToDataLayout(const std::string& str) { return DataLayout::kAnyLayout; } else if (s == "MKLDNNLAYOUT") { return DataLayout::kMKLDNN; + } else if (s == "ONEDNNLAYOUT") { + return DataLayout::ONEDNN; } else if (s == "SPARSE_COO") { return DataLayout::SPARSE_COO; } else if (s == "SPARSE_CSR") { diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 296989f7d612a2..3455922b3066eb 100755 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -321,7 +321,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, continue; } } else if (pass->Type() == "onednn_placement_pass") { - pass->Set("mkldnn_enabled_op_types", + pass->Set("onednn_enabled_op_types", new std::unordered_set(onednn_enabled_op_types_)); } VLOG(1) << "Start Apply Pass " << pass->Type(); diff --git a/paddle/fluid/framework/ir/onednn/onednn_placement_pass.cc b/paddle/fluid/framework/ir/onednn/onednn_placement_pass.cc index 62748541683476..d1c73836ac90d9 100644 --- a/paddle/fluid/framework/ir/onednn/onednn_placement_pass.cc +++ b/paddle/fluid/framework/ir/onednn/onednn_placement_pass.cc @@ -64,7 +64,7 @@ inline bool FoundPhiOneDNNKernelWithCorrectDataType( return false; } -bool MKLDNNPlacementPass::IsSupport(const Node* op) const { +bool ONEDNNPlacementPass::IsSupport(const Node* op) const { if (FoundOneDNNKernelWithCorrectDataType(op) || FoundPhiOneDNNKernelWithCorrectDataType(op)) { // For interpolate ops, there's a little difference between Paddle and @@ -89,8 +89,8 @@ bool MKLDNNPlacementPass::IsSupport(const Node* op) const { } // namespace paddle::framework::ir -REGISTER_PASS(onednn_placement_pass, paddle::framework::ir::MKLDNNPlacementPass) - .RequirePassAttr("mkldnn_enabled_op_types"); +REGISTER_PASS(onednn_placement_pass, paddle::framework::ir::ONEDNNPlacementPass) + .RequirePassAttr("onednn_enabled_op_types"); REGISTER_PASS_CAPABILITY(onednn_placement_pass) .AddCombination( diff --git a/paddle/fluid/framework/ir/onednn/onednn_placement_pass.h b/paddle/fluid/framework/ir/onednn/onednn_placement_pass.h index 5fc1dbd24f18ef..b7e0e1d3383c69 100644 --- a/paddle/fluid/framework/ir/onednn/onednn_placement_pass.h +++ b/paddle/fluid/framework/ir/onednn/onednn_placement_pass.h @@ -26,17 +26,17 @@ namespace ir { /* * Specifies which operators should use MKLDNN. 
*/ -class MKLDNNPlacementPass : public PlacementPassBase { +class ONEDNNPlacementPass : public PlacementPassBase { protected: bool IsSupport(const Node* op) const override; private: - const std::string GetPlacementName() const override { return "MKLDNN"; } + const std::string GetPlacementName() const override { return "ONEDNN"; } const std::string GetAttrName() const override { return "use_mkldnn"; } const std::unordered_set GetOpTypesList() const override { - return Get>("mkldnn_enabled_op_types"); + return Get>("onednn_enabled_op_types"); } }; diff --git a/paddle/fluid/framework/ir/onednn/onednn_placement_pass_tester.cc b/paddle/fluid/framework/ir/onednn/onednn_placement_pass_tester.cc index c8346dcbafd7a0..81f4ca871d550a 100644 --- a/paddle/fluid/framework/ir/onednn/onednn_placement_pass_tester.cc +++ b/paddle/fluid/framework/ir/onednn/onednn_placement_pass_tester.cc @@ -133,7 +133,7 @@ class PlacementPassTest { auto pass = PassRegistry::Instance().Get("onednn_placement_pass"); - pass->Set("mkldnn_enabled_op_types", + pass->Set("onednn_enabled_op_types", new std::unordered_set(onednn_enabled_op_types)); graph.reset(pass->Apply(graph.release())); @@ -143,8 +143,10 @@ class PlacementPassTest { for (auto* node : graph->Nodes()) { if (node->IsOp()) { auto* op = node->Op(); - if (op->HasAttr("use_mkldnn") && - PADDLE_GET_CONST(bool, op->GetAttr("use_mkldnn"))) { + if ((op->HasAttr("use_mkldnn") && + PADDLE_GET_CONST(bool, op->GetAttr("use_mkldnn"))) || + (op->HasAttr("use_onednn") && + PADDLE_GET_CONST(bool, op->GetAttr("use_onednn")))) { ++use_onednn_true_count; } } @@ -156,27 +158,27 @@ class PlacementPassTest { void PlacementNameTest() { auto pass = PassRegistry::Instance().Get("onednn_placement_pass"); EXPECT_EQ(static_cast(pass.get())->GetPlacementName(), - "MKLDNN"); + "ONEDNN"); } }; -TEST(MKLDNNPlacementPass, enable_conv_relu) { +TEST(ONEDNNPlacementPass, enable_conv_relu) { // 2 conv (1 conv is always true) + 2 relu (1 relu is always true) + 0 pool PlacementPassTest().MainTest({"conv2d", "relu"}, 4); } -TEST(MKLDNNPlacementPass, enable_relu_pool) { +TEST(ONEDNNPlacementPass, enable_relu_pool) { // 1 conv (1 conv is always true) + 2 relu (1 relu is always true) + 1 pool PlacementPassTest().MainTest({"relu", "pool2d"}, 4); } -TEST(MKLDNNPlacementPass, enable_all) { +TEST(ONEDNNPlacementPass, enable_all) { // 2 conv (1 conv is always true) + 2 relu (1 relu is always true) + 1 pool + // 1 concat PlacementPassTest().MainTest({}, 6); } -TEST(MKLDNNPlacementPass, placement_name) { +TEST(ONEDNNPlacementPass, placement_name) { PlacementPassTest().PlacementNameTest(); } diff --git a/paddle/fluid/framework/new_executor/instruction/control_flow/if_instruction.cc b/paddle/fluid/framework/new_executor/instruction/control_flow/if_instruction.cc index 0f3f0c2411f2c3..2863be568ae68c 100644 --- a/paddle/fluid/framework/new_executor/instruction/control_flow/if_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/control_flow/if_instruction.cc @@ -119,7 +119,7 @@ IfInstruction::IfInstruction(size_t id, outputs.emplace(value, GetValueIds(value, *value_exec_info)); } if (value.use_count() > 0) { - VLOG(6) << "value " << i << " use conutn != 0"; + VLOG(6) << "value " << i << " use count != 0"; is_last_op = false; } } diff --git a/paddle/fluid/framework/new_executor/instruction/cuda_graph_instruction.cc b/paddle/fluid/framework/new_executor/instruction/cuda_graph_instruction.cc index 3105b6d09e3839..bc8fd95bf0da5c 100644 --- 
a/paddle/fluid/framework/new_executor/instruction/cuda_graph_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/cuda_graph_instruction.cc @@ -109,7 +109,7 @@ CudaGraphInstruction::CudaGraphInstruction( outputs.emplace(value, GetValueIds(value, *value_exec_info)); } if (value.use_count() > 0) { - VLOG(6) << "value " << i << " use conutn != 0"; + VLOG(6) << "value " << i << " use count != 0"; is_last_op = false; } } diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index 198ff8dcd8ccc3..95121b1d223312 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -170,7 +170,7 @@ inline void RegisterKernelClass(const char* op_type, std::string library(library_type); std::string data_layout = "ANYLAYOUT"; if (library == "MKLDNN") { - data_layout = "MKLDNNLAYOUT"; + data_layout = "ONEDNNLAYOUT"; } #ifdef PADDLE_WITH_CUSTOM_DEVICE if (std::is_same::value) { diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 5e11ce0e3f47cb..6b2b38feebef02 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -193,12 +193,12 @@ struct Argument { // whether to mute all logs in inference. DECL_ARGUMENT_FIELD(disable_logs, DisableLogs, bool); - // Pass a set of op types to enable its mkldnn kernel - DECL_ARGUMENT_FIELD(mkldnn_enabled_op_types, - MKLDNNEnabledOpTypes, + // Pass a set of op types to enable its onednn kernel + DECL_ARGUMENT_FIELD(onednn_enabled_op_types, + ONEDNNEnabledOpTypes, std::unordered_set); - // The cache capacity of different input shapes for mkldnn. - DECL_ARGUMENT_FIELD(mkldnn_cache_capacity, MkldnnCacheCapacity, int); + // The cache capacity of different input shapes for onednn. + DECL_ARGUMENT_FIELD(mkldnn_cache_capacity, OnednnCacheCapacity, int); #ifdef PADDLE_WITH_DNNL // A set of op types to enable their quantized kernels @@ -219,7 +219,7 @@ struct Argument { Bfloat16EnabledOpTypes, std::unordered_set); - DECL_ARGUMENT_FIELD(use_onednn_int8, UseMkldnnInt8, bool); + DECL_ARGUMENT_FIELD(use_onednn_int8, UseOnednnInt8, bool); #endif // Passed from config. 
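The renames above (the placement pass, op_registry.h, and argument.h) move every oneDNN-related pass attribute from the legacy mkldnn_* keys to onednn_*. Below is a minimal C++ sketch of how a caller now drives the renamed pass, modeled on the onednn_placement_pass_tester.cc and interpreter_engine.cc call sites in this patch; the include paths and the ApplyOnednnPlacement wrapper are illustrative assumptions, not part of the change itself.

#include <memory>
#include <string>
#include <unordered_set>

#include "paddle/fluid/framework/ir/graph.h"  // assumed include paths
#include "paddle/fluid/framework/ir/pass.h"

// Hypothetical helper: fetch the registered pass and hand it the renamed
// attribute. An empty set asks the pass to place every supported op type.
void ApplyOnednnPlacement(std::unique_ptr<paddle::framework::ir::Graph>* graph,
                          const std::unordered_set<std::string>& op_types) {
  auto pass = paddle::framework::ir::PassRegistry::Instance().Get(
      "onednn_placement_pass");
  // The pass now declares RequirePassAttr("onednn_enabled_op_types"), so
  // setting the old "mkldnn_enabled_op_types" key would fail its attribute
  // check at Apply time.
  pass->Set("onednn_enabled_op_types",
            new std::unordered_set<std::string>(op_types));  // Set takes ownership
  graph->reset(pass->Apply(graph->release()));
}

Call sites that previously set "mkldnn_enabled_op_types" (as build_strategy.cc and ir_pass_manager.cc did before this patch) only need the key swapped; the attribute's value type is unchanged.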
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 6048d8b4944477..c416926df5dfdd 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -131,9 +131,9 @@ void IRPassManager::CreatePasses(Argument *argument, pass->Set("optim_cache_dir", new std::string(std::move(optim_cache_dir))); pass_num++; } else if (pass_name == "onednn_placement_pass") { - pass->Set("mkldnn_enabled_op_types", + pass->Set("onednn_enabled_op_types", new std::unordered_set( - argument->mkldnn_enabled_op_types())); + argument->onednn_enabled_op_types())); } else if (pass_name == "cudnn_placement_pass") { pass->Set("cudnn_enabled_op_types", new std::unordered_set()); diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index fcee93efdb61e9..4f1d59f4b64d94 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1031,8 +1031,8 @@ void AnalysisPredictor::OptimizeInferencePirProgram() { } #endif #ifdef PADDLE_WITH_DNNL - } else if (config_.mkldnn_enabled()) { - // mkldnn + } else if (config_.onednn_enabled()) { + // onednn pir::IrContext *ctx = pir::IrContext::Instance(); ctx->GetOrRegisterDialect(); if (!config_.custom_pass_only_) { @@ -2100,9 +2100,9 @@ void AnalysisPredictor::PrepareArgument() { argument_->SetIpuCustomPatterns(config_.ipu_custom_patterns_); #endif - if (config_.mkldnn_enabled() && !config_.use_gpu()) { - LOG(INFO) << "MKLDNN is enabled"; - argument_->SetMKLDNNEnabledOpTypes(config_.onednn_enabled_op_types_); + if (config_.onednn_enabled() && !config_.use_gpu()) { + LOG(INFO) << "ONEDNN is enabled"; + argument_->SetONEDNNEnabledOpTypes(config_.onednn_enabled_op_types_); } if (config_.cinn_enabled()) { @@ -2115,7 +2115,7 @@ void AnalysisPredictor::PrepareArgument() { argument_->SetBfloat16EnabledOpTypes(config_.bfloat16_enabled_op_types_); } - if (config_.mkldnn_int8_enabled()) { + if (config_.onednn_int8_enabled()) { LOG(INFO) << "Int8 is enabled"; argument_->SetQuantizeEnabledOpTypes(config_.quantize_enabled_op_types_); argument_->SetQuantizeExcludedOpIds(config_.quantize_excluded_op_ids_); @@ -2296,7 +2296,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() { #if defined(_WIN32) argument_->PartiallyRelease(); #else - if (config_.mkldnn_enabled() || + if (config_.onednn_enabled() || (config_.tensorrt_engine_enabled() && config_.tensorrt_precision_mode_ == AnalysisConfig::Precision::kInt8)) { // NOLINT diff --git a/paddle/fluid/inference/capi/pd_config.cc b/paddle/fluid/inference/capi/pd_config.cc index 96b0b49915d5d0..b19a33e5eadfd9 100644 --- a/paddle/fluid/inference/capi/pd_config.cc +++ b/paddle/fluid/inference/capi/pd_config.cc @@ -311,7 +311,7 @@ bool PD_OnednnEnabled(const PD_AnalysisConfig* config) { config, common::errors::InvalidArgument( "The pointer of analysis configuration shouldn't be nullptr")); - return config->config.mkldnn_enabled(); + return config->config.onednn_enabled(); } void PD_SetCpuMathLibraryNumThreads(PD_AnalysisConfig* config, diff --git a/paddle/fluid/jit/engine/interpreter_engine.cc b/paddle/fluid/jit/engine/interpreter_engine.cc index 0bba3ebd2e554b..d1f341b504c965 100644 --- a/paddle/fluid/jit/engine/interpreter_engine.cc +++ b/paddle/fluid/jit/engine/interpreter_engine.cc @@ -53,7 +53,7 @@ void InterpreterEngine::CreateInterpreterCore() { #ifdef PADDLE_WITH_DNNL auto onednn_pass = 
framework::ir::PassRegistry::Instance().Get("onednn_placement_pass"); - onednn_pass->Set("mkldnn_enabled_op_types", + onednn_pass->Set("onednn_enabled_op_types", new std::unordered_set({})); onednn_pass->Apply(&graph); #endif diff --git a/paddle/fluid/operators/generator/get_expected_kernel_func.cc b/paddle/fluid/operators/generator/get_expected_kernel_func.cc index 944d9f6bfca1e2..4089772637abf0 100644 --- a/paddle/fluid/operators/generator/get_expected_kernel_func.cc +++ b/paddle/fluid/operators/generator/get_expected_kernel_func.cc @@ -61,7 +61,7 @@ static bool ReduceOpHasOptimizedOneDNNKernel( } // only poolop -bool CanMKLDNNSupportPool(const framework::ExecutionContext& ctx) { +bool CanONEDNNSupportPool(const framework::ExecutionContext& ctx) { if (ctx.Attr("adaptive") == false) return true; // oneDNN is supporting only unchangeable in size pool window auto src_tz = common::vectorize(ctx.Input("X")->dims()); @@ -181,7 +181,7 @@ phi::KernelKey GetPoolExpectedKernelType( auto data_type = op_ptr->OperatorWithKernel::IndicateVarDataType(ctx, "X"); // NOTE(jiahongyu): Below codes originally enclosed by PADDLE_WITH_DNNL - op_ptr->SetDnnFallback(!CanMKLDNNSupportPool(ctx)); + op_ptr->SetDnnFallback(!CanONEDNNSupportPool(ctx)); // NOTE(jiahongyu) END: Above codes originally enclosed by PADDLE_WITH_DNNL return phi::KernelKey(data_type, ctx.GetPlace()); @@ -194,7 +194,7 @@ phi::KernelKey GetPoolDoubleGradExpectedKernelType( auto data_type = op_ptr->OperatorWithKernel::IndicateVarDataType(ctx, "grad_x@GRAD"); // NOTE(jiahongyu): Below codes originally enclosed by PADDLE_WITH_DNNL - op_ptr->SetDnnFallback(!CanMKLDNNSupportPool(ctx)); + op_ptr->SetDnnFallback(!CanONEDNNSupportPool(ctx)); // NOTE(jiahongyu) END: Above codes originally enclosed by PADDLE_WITH_DNNL return phi::KernelKey(data_type, ctx.GetPlace()); diff --git a/paddle/fluid/pybind/compiled_program.cc b/paddle/fluid/pybind/compiled_program.cc index 563ff805815fc7..18f36a2b2efe33 100644 --- a/paddle/fluid/pybind/compiled_program.cc +++ b/paddle/fluid/pybind/compiled_program.cc @@ -824,6 +824,15 @@ void BindCompiledProgram(pybind11::module &m) { // NOLINT const std::unordered_set &onednn_enabled_op_types) { self.onednn_enabled_op_types_ = onednn_enabled_op_types; }) + .def_property( + "onednn_enabled_op_types", + [](const BuildStrategy &self) { + return self.onednn_enabled_op_types_; + }, + [](BuildStrategy &self, + const std::unordered_set &onednn_enabled_op_types) { + self.onednn_enabled_op_types_ = onednn_enabled_op_types; + }) .def_property( "allow_cuda_graph_capture", [](const BuildStrategy &self) { diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index f4a373824b162b..8af90c243833d3 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -1414,10 +1414,10 @@ static PyObject* tensor_method_set_underline_tensor(TensorObject* self, if (self->tensor.is_dense_tensor()) { auto* dst_tensor = static_cast(self->tensor.impl().get()); - if (self->tensor.has_allocation() && - !dst_tensor->meta().is_contiguous() || - !src_tensor->meta().is_contiguous()) { - VLOG(8) << "set_tensor() method , src or dst tensor is not contiguous"; + if (self->tensor.has_allocation() && self->tensor.initialized() && + (!dst_tensor->meta().is_contiguous() || + !src_tensor->meta().is_contiguous())) { + VLOG(8) << "set_tensor() method, src or dst tensor is not contiguous"; if (!FLAGS_use_stride_kernel) { PADDLE_THROW(common::errors::Fatal( "FLAGS_use_stride_kernel is closed. 
Strided kernel " @@ -1450,7 +1450,6 @@ static PyObject* tensor_method_set_underline_tensor(TensorObject* self, "The `set_tensor()` method of non DenseTensor get a DenseTensor src " "value")); } - } else if (value.is_dist_tensor()) { #ifdef PADDLE_WITH_DISTRIBUTE auto* src_tensor = @@ -1484,7 +1483,6 @@ static PyObject* tensor_method_set_underline_tensor(TensorObject* self, "current PaddlePaddle, please recompile and installPaddlePaddle " "with the option of `WITH_DISTRIBUTE=ON`.")); #endif - } else { PADDLE_THROW(common::errors::Unavailable( "The `set_tensor()` method of (Dist)Tensor get a non " diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index b48a5ba9f630b8..f090156d54d0c6 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -1050,26 +1050,35 @@ void BindAnalysisConfig(py::module *m) { &AnalysisConfig::SwitchIrDebug, py::arg("x") = true, py::arg("passes") = std::vector()) - .def("enable_mkldnn", &AnalysisConfig::EnableONEDNN) - .def("disable_mkldnn", &AnalysisConfig::DisableONEDNN) - .def("mkldnn_enabled", &AnalysisConfig::onednn_enabled) + .def("enable_mkldnn", &AnalysisConfig::EnableONEDNN) // deprecated + .def("disable_mkldnn", &AnalysisConfig::DisableONEDNN) // deprecated + .def("mkldnn_enabled", &AnalysisConfig::onednn_enabled) // deprecated + .def("enable_onednn", &AnalysisConfig::EnableONEDNN) + .def("disable_onednn", &AnalysisConfig::DisableONEDNN) + .def("onednn_enabled", &AnalysisConfig::onednn_enabled) .def("enable_cinn", &AnalysisConfig::EnableCINN) .def("set_cpu_math_library_num_threads", &AnalysisConfig::SetCpuMathLibraryNumThreads) .def("cpu_math_library_num_threads", &AnalysisConfig::cpu_math_library_num_threads) .def("to_native_config", &AnalysisConfig::ToNativeConfig) - .def("enable_mkldnn_bfloat16", &AnalysisConfig::EnableOnednnBfloat16) + .def("enable_mkldnn_bfloat16", + &AnalysisConfig::EnableOnednnBfloat16) // deprecated + .def("enable_onednn_bfloat16", &AnalysisConfig::EnableOnednnBfloat16) #ifdef PADDLE_WITH_DNNL .def("set_mkldnn_cache_capacity", + &AnalysisConfig::SetOnednnCacheCapacity, + py::arg("capacity") = 0) // deprecated + .def("set_onednn_cache_capacity", &AnalysisConfig::SetOnednnCacheCapacity, py::arg("capacity") = 0) .def("set_bfloat16_op", &AnalysisConfig::SetBfloat16Op) .def("enable_mkldnn_int8", &AnalysisConfig::EnableOnednnInt8, py::arg("mkldnn_int8_enabled_op_types") = - std::unordered_set({})) - .def("mkldnn_int8_enabled", &AnalysisConfig::onednn_int8_enabled) + std::unordered_set({})) // deprecated + .def("mkldnn_int8_enabled", + &AnalysisConfig::onednn_int8_enabled) // deprecated .def("disable_mkldnn_fc_passes", &AnalysisConfig::DisableOnednnFcPasses, R"DOC( @@ -1085,9 +1094,31 @@ void BindAnalysisConfig(py::module *m) { >>> config = Config("") >>> config.enable_mkldnn() >>> config.disable_mkldnn_fc_passes() + )DOC") // deprecated + .def("enable_onednn_int8", + &AnalysisConfig::EnableOnednnInt8, + py::arg("onednn_int8_enabled_op_types") = + std::unordered_set({})) + .def("onednn_int8_enabled", &AnalysisConfig::onednn_int8_enabled) + .def("disable_onednn_fc_passes", + &AnalysisConfig::DisableOnednnFcPasses, + R"DOC( + Disable Onednn FC + Returns: + None. + + Examples: + .. 
code-block:: python + + >>> from paddle.inference import Config + + >>> config = Config("") + >>> config.enable_onednn() + >>> config.disable_onednn_fc_passes() )DOC") #endif - .def("set_mkldnn_op", &AnalysisConfig::SetONEDNNOp) + .def("set_mkldnn_op", &AnalysisConfig::SetONEDNNOp) // deprecated + .def("set_onednn_op", &AnalysisConfig::SetONEDNNOp) .def("set_model_buffer", &AnalysisConfig::SetModelBuffer) .def("model_from_memory", &AnalysisConfig::model_from_memory) .def("delete_pass", &AnalysisConfig::DeletePass) @@ -1329,23 +1360,32 @@ void BindPaddlePassBuilder(py::module *m) { py::class_(*m, "PassStrategy") .def(py::init &>()) .def("enable_cudnn", &PassStrategy::EnableCUDNN) - .def("enable_mkldnn", &PassStrategy::EnableONEDNN) - .def("enable_mkldnn_bfloat16", &PassStrategy::EnableMkldnnBfloat16) + .def("enable_mkldnn", &PassStrategy::EnableONEDNN) // deprecated + .def("enable_mkldnn_bfloat16", + &PassStrategy::EnableMkldnnBfloat16) // deprecated + .def("enable_onednn", &PassStrategy::EnableONEDNN) + .def("enable_onednn_bfloat16", &PassStrategy::EnableOnednnBfloat16) .def("use_gpu", &PassStrategy::use_gpu); py::class_(*m, "CpuPassStrategy") .def(py::init<>()) .def(py::init()) .def("enable_cudnn", &CpuPassStrategy::EnableCUDNN) - .def("enable_mkldnn", &CpuPassStrategy::EnableONEDNN) - .def("enable_mkldnn_bfloat16", &CpuPassStrategy::EnableMkldnnBfloat16); + .def("enable_mkldnn", &CpuPassStrategy::EnableONEDNN) // deprecated + .def("enable_mkldnn_bfloat16", + &CpuPassStrategy::EnableMkldnnBfloat16) // deprecated + .def("enable_onednn", &CpuPassStrategy::EnableONEDNN) + .def("enable_onednn_bfloat16", &CpuPassStrategy::EnableOnednnBfloat16); py::class_(*m, "GpuPassStrategy") .def(py::init<>()) .def(py::init()) .def("enable_cudnn", &GpuPassStrategy::EnableCUDNN) - .def("enable_mkldnn", &GpuPassStrategy::EnableONEDNN) - .def("enable_mkldnn_bfloat16", &GpuPassStrategy::EnableMkldnnBfloat16); + .def("enable_mkldnn", &GpuPassStrategy::EnableONEDNN) // deprecated + .def("enable_mkldnn_bfloat16", + &GpuPassStrategy::EnableMkldnnBfloat16) // deprecated + .def("enable_onednn", &GpuPassStrategy::EnableONEDNN) + .def("enable_onednn_bfloat16", &GpuPassStrategy::EnableOnednnBfloat16); } void BindInternalUtils(py::module *m) { diff --git a/paddle/fluid/pybind/slice_utils.h b/paddle/fluid/pybind/slice_utils.h index e56e494160fe88..4be2fe7a31976d 100644 --- a/paddle/fluid/pybind/slice_utils.h +++ b/paddle/fluid/pybind/slice_utils.h @@ -525,7 +525,7 @@ static void ParseIndex(const paddle::Tensor& tensor, if (slice_tensor.dtype() == phi::DataType::BOOL) { // bool tensor consumes (rank of index tensor) dimensions of input // tensor - for (int i = 0; i < slice_tensor.shape().size(); i++) { + for (size_t i = 0; i < slice_tensor.shape().size(); i++) { PADDLE_ENFORCE_EQ(slice_tensor.shape()[i], dim_len, common::errors::OutOfRange( @@ -684,7 +684,7 @@ static paddle::Tensor dealWithAdvancedIndex( if (index.dtype() == phi::DataType::BOOL) { *rank_of_new_dim = std::max(*rank_of_new_dim, 1); i--; - for (int j = 0; j < index.shape().size(); j++) { + for (size_t j = 0; j < index.shape().size(); j++) { i++; index_dim = (*advanced_index_dim)[i]; trans_dim->push_back(index_dim); diff --git a/paddle/fluid/pybind/tensor.cc b/paddle/fluid/pybind/tensor.cc index 52462bd182803f..b2c83177284486 100644 --- a/paddle/fluid/pybind/tensor.cc +++ b/paddle/fluid/pybind/tensor.cc @@ -903,7 +903,7 @@ void BindTensor(pybind11::module &m) { // NOLINT const auto &device_id = paddle::platform::GetXPUCurrentDeviceId(); auto stream = 
paddle::platform::get_current_stream(device_id); - xpu_wait(stream); + xpu_wait(stream->raw_stream()); int type_idx = static_cast(self.type()); size_t data_size = self.numel() * framework::SizeOfType( diff --git a/paddle/fluid/pybind/xpu_streams_py.cc b/paddle/fluid/pybind/xpu_streams_py.cc index 98a581e0768138..957746605007ab 100644 --- a/paddle/fluid/pybind/xpu_streams_py.cc +++ b/paddle/fluid/pybind/xpu_streams_py.cc @@ -33,19 +33,27 @@ namespace py = pybind11; namespace paddle { namespace platform { #ifdef PADDLE_WITH_XPU -XPUStream get_current_stream(int device_id) { - if (device_id == -1) { - device_id = phi::backends::xpu::GetXPUCurrentDeviceId(); - } +phi::XPUStreamHandle *get_current_stream(int device_id) { auto place = phi::XPUPlace(device_id); auto *dev_ctx = static_cast( phi::DeviceContextPool::Instance().Get(place)); dev_ctx->Wait(); - return dev_ctx->stream(); + return dev_ctx->get_current_stream_handle(); +} + +phi::XPUStreamHandle *set_current_stream(int idx) { + int device_id = phi::backends::xpu::GetXPUCurrentDeviceId(); + auto original_stream = get_current_stream(device_id); + auto place = phi::XPUPlace(device_id); + auto *dev_ctx = static_cast( + phi::DeviceContextPool::Instance().Get(place)); + dev_ctx->SetCurrentStream(idx); + return original_stream; } #endif } // namespace platform + namespace pybind { void BindXpuStream(py::module *m_ptr) { auto &m = *m_ptr; @@ -69,7 +77,7 @@ #endif }); m.def( - "_get_current_stream", + "_xpu_get_current_stream", [](int device_id) { #ifdef PADDLE_WITH_XPU if (device_id == -1) { @@ -79,7 +87,19 @@ return platform::get_current_stream(device_id); #else PADDLE_THROW( - common::errors::Unavailable("Paddle is not compiled with CUDA. " + common::errors::Unavailable("Paddle is not compiled with XPU. " "Cannot visit device synchronize.")); #endif }, py::return_value_policy::reference); + m.def( + "_xpu_set_current_stream", + [](int stream_id) { +#ifdef PADDLE_WITH_XPU + return platform::set_current_stream(stream_id); +#else + PADDLE_THROW( + common::errors::Unavailable("Paddle is not compiled with XPU. " "Cannot visit device synchronize.")); #endif }, py::return_value_policy::reference); @@ -100,12 +120,167 @@ #endif }); + py::class_(m, "XPUStream", R"DOC( + The handle of the XPU stream. + + Parameters: + device(paddle.XPUPlace()|int|None, optional): The device on which to allocate the stream. + If device is None or a negative integer, the current device is used. + If device is a positive integer, it must be less than the device count. Default: None. + + Examples: + .. 
code-block:: python + + >>> # doctest: +REQUIRES(env:XPU) + >>> import paddle + >>> s1 = paddle.device.xpu.Stream(paddle.XPUPlace(0)) + >>> s2 = paddle.device.xpu.Stream(0) + >>> s3 = paddle.device.xpu.Stream() + + )DOC") +#ifdef PADDLE_WITH_XPU + .def_property_readonly( + "xpu_stream", + [](phi::XPUStreamHandle &self) { + return reinterpret_cast(self.raw_stream()); + }) + .def("wait_stream", + [](phi::XPUStreamHandle &self, phi::XPUStreamHandle &other) { + auto *dev_ctx = phi::get_xpu_context(); + dev_ctx->StreamWaitStreamInPool(self.id(), other.id()); + }) + .def("wait_event", + [](phi::XPUStreamHandle &self, phi::XPUEventHandle &other) { + self.wait_event(other.get_event()); + }) + .def("query", + [](phi::XPUStreamHandle &self) { + PADDLE_THROW(common::errors::Unavailable( + "Query function for XPUStream is not supported now")); + }) + .def("record_event", + [](phi::XPUStreamHandle &self, phi::XPUEventHandle *event) { + if (event == nullptr) { + event = new phi::XPUEventHandle(); + } + self.record_event(event->get_event()); + return event; + }) + .def( + "synchronize", + [](phi::XPUStreamHandle &self) { self.synchronize(); }, + R"DOC( + Waits for stream tasks to complete. + + Examples: + .. code-block:: python + + >>> # doctest: +REQUIRES(env:XPU) + >>> import paddle + >>> s = paddle.device.xpu.Stream(paddle.XPUPlace(0)) + >>> s.synchronize() + + )DOC") + .def_property_readonly( + "place", + [](phi::XPUStreamHandle &self) { + return phi::XPUPlace(platform::GetXPUCurrentDeviceId()); + }) + .def_property_readonly( + "idx", [](phi::XPUStreamHandle &self) { return self.id(); }) +#endif + + .def("__init__", + [](phi::XPUStreamHandle &self) { +#ifdef PADDLE_WITH_XPU + new (&self) phi::XPUStreamHandle(); + self.Init(); +#else + PADDLE_THROW(common::errors::Unavailable( + "Class XPUStream can only be initialized on the XPU " + "platform.")); +#endif + }) + .def( + "__init__", + [](phi::XPUStreamHandle &self, phi::XPUPlace *place) { +#ifdef PADDLE_WITH_XPU + if (place == nullptr) { + int curr_device_id = platform::GetXPUCurrentDeviceId(); + auto place_tmp = phi::XPUPlace(curr_device_id); + new (&self) phi::XPUStreamHandle(place_tmp); + } else { + new (&self) phi::XPUStreamHandle(*place); + } +#else + PADDLE_THROW(common::errors::Unavailable( + "Class XPUStream can only be initialized on the XPU " + "platform.")); +#endif + }, + py::arg("device") = nullptr) + .def( + "__init__", + [](phi::XPUStreamHandle &self, int device) { +#ifdef PADDLE_WITH_XPU + if (device < 0) { + device = platform::GetXPUCurrentDeviceId(); + } + auto place_tmp = phi::XPUPlace(device); + new (&self) phi::XPUStreamHandle(place_tmp); +#else + PADDLE_THROW(common::errors::Unavailable( + "Class XPUStream can only be initialized on the XPU " + "platform.")); +#endif + }, + py::arg("device") = -1); py::class_(m, "XPUEvent", R"DOC( + The handle of the XPU event. + + Examples: + .. 
code-block:: python + + >>> # doctest: +REQUIRES(env:XPU) + >>> import paddle + >>> event = paddle.device.xpu.Event() + + )DOC") +#ifdef PADDLE_WITH_XPU + .def( + "record", + [](phi::XPUEventHandle &self, phi::XPUStreamHandle *stream) { + if (stream == nullptr) { + auto *dev_ctx = phi::get_xpu_context(); + auto stream_handle = dev_ctx->get_current_stream_handle(); + self.record(stream_handle->raw_stream()); + } else { + self.record(stream->raw_stream()); + } + }, + py::arg("stream") = nullptr) + .def("query", [](phi::XPUEventHandle &self) { return self.query(); }) + .def("elapsed_time", + [](phi::XPUEventHandle &self) { + PADDLE_THROW(common::errors::Unavailable( + "XPUEvent elapsed_time is not supported now")); + }) + .def("synchronize", [](phi::XPUEventHandle &self) { self.synchronize(); }) +#endif + .def("__init__", [](phi::XPUEventHandle &self) { +#ifdef PADDLE_WITH_XPU + new (&self) phi::XPUEventHandle(); +#else + PADDLE_THROW(common::errors::Unavailable( + "Class XPUEvent can only be initialized on the XPU platform.")); +#endif + }); #ifdef PADDLE_WITH_XPU - py::class_(m, "XPUStream", R"DOC( - The handle of the CUDA stream. + py::class_(m, "XPUCUDAStream", R"DOC( + The handle of the XPU stream. Parameters: - device(paddle.CUDAPlace()|int|None, optional): The device which wanted to allocate the stream. + device(paddle.XPUPlace()|int|None, optional): The device which wanted to allocate the stream. If device is None or negative integer, device will be the current device. If device is positive integer, it must less than the device count. Default: None. priority(int|None, optional): The priority of stream. The priority can be 1(high) or 2(normal). @@ -114,16 +289,16 @@ void BindXpuStream(py::module *m_ptr) { Examples: .. code-block:: python - >>> # doctest: +REQUIRES(env:GPU) + >>> # doctest: +REQUIRES(env:XPU) >>> import paddle - >>> s1 = paddle.device.cuda.Stream(paddle.CUDAPlace(0), 1) - >>> s2 = paddle.device.cuda.Stream(0, 1) - >>> s3 = paddle.device.cuda.Stream() + >>> s1 = paddle.device.xpu.Stream(paddle.XPUPlace(0), 1) + >>> s2 = paddle.device.xpu.Stream(0, 1) + >>> s3 = paddle.device.xpu.Stream() )DOC") .def( "synchronize", - [](XPUStream &self) { xpu_wait(self); }, + [](phi::XPUCUDAStream &self) { self.Synchronize(); }, R"DOC( Waits for stream tasks to complete. 
@@ -135,7 +310,25 @@ void BindXpuStream(py::module *m_ptr) { >>> s = paddle.device.cuda.Stream(paddle.CUDAPlace(0), 1) >>> s.synchronize() - )DOC"); + )DOC") + .def("__init__", + [](phi::XPUCUDAStream &self, phi::XPUPlace *place, int priority) { + if (priority != 1 && priority != 2) { + PADDLE_THROW(common::errors::InvalidArgument( + "Priority should be 1(high) or 2(normal) ")); + } + auto stream_flag = + phi::XPUCUDAStream::StreamFlag::kStreamNonBlocking; + if (place == nullptr) { + int curr_device_id = platform::GetXPUCurrentDeviceId(); + auto place_tmp = phi::XPUPlace(curr_device_id); + new (&self) + phi::XPUCUDAStream(place_tmp, priority - 2, stream_flag); + } else { + new (&self) + phi::XPUCUDAStream(*place, priority - 2, stream_flag); + } + }); #endif } } // namespace pybind diff --git a/paddle/fluid/pybind/xpu_streams_py.h b/paddle/fluid/pybind/xpu_streams_py.h index a146cf6ba3419e..a1f56b879d1cd9 100644 --- a/paddle/fluid/pybind/xpu_streams_py.h +++ b/paddle/fluid/pybind/xpu_streams_py.h @@ -18,12 +18,16 @@ #include "pybind11/stl.h" #ifdef PADDLE_WITH_XPU +#include "paddle/phi/backends/xpu/xpu_context.h" #include "paddle/phi/core/xpu_cuda_stream.h" #include "xpu/runtime.h" #include "xpu/runtime_ex.h" + #else namespace phi { class XPUCUDAStream {}; +class XPUStreamHandle {}; +class XPUEventHandle {}; } // namespace phi #endif @@ -32,7 +36,8 @@ namespace py = pybind11; namespace paddle { namespace platform { #ifdef PADDLE_WITH_XPU -XPUStream get_current_stream(int device_id = -1); +phi::XPUStreamHandle* get_current_stream(int device_id = -1); +phi::XPUStreamHandle* set_current_stream(int idx); #endif } // namespace platform namespace pybind { diff --git a/paddle/phi/api/include/tensor.h b/paddle/phi/api/include/tensor.h index b9e919c52b11b2..93bed19b2bc29d 100644 --- a/paddle/phi/api/include/tensor.h +++ b/paddle/phi/api/include/tensor.h @@ -29,6 +29,11 @@ using gpuStream_t = cudaStream_t; using gpuStream_t = hipStream_t; #endif +#ifdef PADDLE_WITH_XPU +#include "xpu/runtime.h" +#include "xpu/runtime_ex.h" +#endif + #ifdef PADDLE_WITH_CUSTOM_DEVICE #include "paddle/phi/backends/stream.h" #endif @@ -434,6 +439,10 @@ class PADDLE_API Tensor final { * @return gpuStream_t */ gpuStream_t stream() const; +#elif defined(PADDLE_WITH_XPU) + + void record_stream(XPUStream stream) const; + #elif defined(PADDLE_WITH_CUSTOM_DEVICE) /** * @brief Get the stream where the tensor is currently located diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc index 98c632a511cd74..0e6af802094e2d 100644 --- a/paddle/phi/api/lib/tensor.cc +++ b/paddle/phi/api/lib/tensor.cc @@ -40,6 +40,8 @@ limitations under the License. 
*/ #include "paddle/phi/core/tensor_meta.h" #include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/core/memory/malloc.h" + namespace paddle { using DeviceContextPool = experimental::DeviceContextPool; @@ -397,6 +399,14 @@ Tensor Tensor::slice(int64_t begin_idx, int64_t end_idx) const { const std::shared_ptr &Tensor::impl() const { return impl_; } +#ifdef PADDLE_WITH_XPU + +void Tensor::record_stream(XPUStream stream) const { + paddle::memory::RecordStream( + std::dynamic_pointer_cast(impl_)->Holder(), stream); +} + +#endif void Tensor::set_impl(const std::shared_ptr &impl) { impl_ = impl; } diff --git a/paddle/phi/api/lib/tensor_method.cc b/paddle/phi/api/lib/tensor_method.cc index 149cbc3b56beb3..2f7d54eaa05e00 100644 --- a/paddle/phi/api/lib/tensor_method.cc +++ b/paddle/phi/api/lib/tensor_method.cc @@ -198,7 +198,8 @@ void Tensor::copy_(const Tensor &src, return; } #endif - if(is_dense_tensor() && has_allocation() && src.is_dense_tensor()) { + if(is_dense_tensor() && has_allocation() && + initialized() && src.is_dense_tensor()) { auto dst_tensor = static_cast(impl_.get()); auto src_tensor = std::static_pointer_cast(src.impl_); if(!dst_tensor->meta().is_contiguous() || diff --git a/paddle/phi/backends/xpu/xpu_context.cc b/paddle/phi/backends/xpu/xpu_context.cc index 800aefdc91ffa4..1c5d26d5f548c4 100644 --- a/paddle/phi/backends/xpu/xpu_context.cc +++ b/paddle/phi/backends/xpu/xpu_context.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/backends/xpu/xpu_context.h" +#include "paddle/phi/backends/context_pool.h" #ifdef PADDLE_WITH_XPU #include @@ -100,8 +101,8 @@ struct XPUContext::Impl { } // Set external stream for context - void SetStream(void* stream) { - if (context_->xpu_stream != nullptr && stream_owned_) { + void SetStream(void* stream, bool clear = true) { + if (clear && context_->xpu_stream != nullptr && stream_owned_) { xpu_stream_destroy(context_->xpu_stream); } stream_owned_ = false; @@ -343,7 +344,21 @@ XPUContext::XPUContext() : DeviceContext() { } else { impls_.push_back(std::make_unique()); impls_[0]->Init(get_gm_size(0), get_l3_size(0)); + stream_pool.push_back(impls_[0]->context_->get_stream()); + idle_stream_flags.push_back(false); + current_stream_handle = + XPUStreamHandle(impls_[0]->context_->get_stream(), 0); + if (std::getenv("XPU_DEFAULT_STREAM_NUMBER") != nullptr) { + int default_num_stream = atoi(std::getenv("XPU_DEFAULT_STREAM_NUMBER")); + for (int i = 0; i < default_num_stream; i++) { + XPUStream s; + PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_create(&s)); + stream_pool.push_back(s); + idle_stream_flags.push_back(true); + } + } } + current_stream_idx = 0; } XPUContext::XPUContext(const XPUPlace& place, bool is_comm_context) @@ -362,10 +377,18 @@ XPUContext::XPUContext(const XPUPlace& place, bool is_comm_context) impls_.push_back(std::make_unique(place)); impls_[i]->Init(get_gm_size(i), get_l3_size(i)); } + stream_pool.push_back(impls_[0]->context_->get_stream()); + idle_stream_flags.push_back(false); } else { impls_.push_back(std::make_unique(place)); impls_[0]->Init(get_gm_size(0), get_l3_size(0)); + stream_pool.push_back(impls_[0]->context_->get_stream()); + idle_stream_flags.push_back(false); + current_stream_handle = + XPUStreamHandle(impls_[0]->context_->get_stream(), 0); } + + current_stream_idx = 0; } XPUContext::~XPUContext() = default; @@ -380,6 +403,9 @@ XPUStream XPUContext::stream(int i) const { void XPUContext::SetStream(void* stream, int i) { CheckValidStreamId(i); impls_[i]->SetStream(stream); + if (i == 0) { + 
current_stream_handle.set_stream(static_cast(stream)); + } } void XPUContext::CheckValidStreamId(int i) const { @@ -397,6 +423,21 @@ void XPUContext::CheckValidStreamId(int i) const { i)); } +void XPUContext::CheckValidIdxInRange(int i, int i_max) const { + PADDLE_ENFORCE_GE( + i, + 0, + errors::InvalidArgument( + "The stream index must be greater than or equal to 0.")); + PADDLE_ENFORCE_LT( + i, + i_max, + errors::InvalidArgument("The stream index should be less than the number " + "of stream used (%d), but got %d", + i_max, + i)); +} + void XPUContext::SetXpuVersion(int version) { impls_[0]->xpu_version_ = static_cast(version); } @@ -462,26 +503,251 @@ void XPUContext::StreamWaitEvent(XPUEvent event, int s) const { void XPUContext::StreamWaitStream(int wait_stream, int record_stream) const { CheckValidStreamId(wait_stream); CheckValidStreamId(record_stream); - XPUEvent event; - int r = xpu_event_create(&event); - PADDLE_ENFORCE_XRE_SUCCESS(r); + XPUEvent event = XPUEventPool::Instance().CreateEventFromPool(); RecordEvent(event, record_stream); StreamWaitEvent(event, wait_stream); - r = xpu_event_destroy(event); - PADDLE_ENFORCE_XRE_SUCCESS(r); - impls_[record_stream]->ClearStashedMemory(); } int64_t XPUContext::GetStreamNum() const { return impls_.size(); } +int XPUContext::SetCurrentStream(int idx) { + int prev_stream_idx = current_stream_idx; + if (prev_stream_idx != idx) { + impls_[0]->SetStream(stream_pool[idx]); + current_stream_handle.set_stream(stream_pool[idx]); + current_stream_idx = idx; + idle_stream_flags[prev_stream_idx] = true; + idle_stream_flags[current_stream_idx] = false; + } + return prev_stream_idx; +} + +void XPUContext::StreamWaitStreamInPool(int wait_stream, + int record_stream) const { + CheckValidIdxInRange(wait_stream, stream_pool.size()); + CheckValidIdxInRange(record_stream, stream_pool.size()); + XPUEvent event = XPUEventPool::Instance().CreateEventFromPool(); + int r = xpu_event_record(event, stream_pool[record_stream]); + PADDLE_ENFORCE_XRE_SUCCESS(r); + r = xpu_stream_wait_event(stream_pool[wait_stream], event); + PADDLE_ENFORCE_XRE_SUCCESS(r); +} + +void XPUContext::StreamWaitEventInPool(int wait_stream, XPUEvent event) const { + CheckValidIdxInRange(wait_stream, stream_pool.size()); + int r = xpu_stream_wait_event(stream_pool[wait_stream], event); + PADDLE_ENFORCE_XRE_SUCCESS(r); +} + +int XPUContext::get_idle_stream() { + bool found_idle_stream = false; + int stream_idx = 0; + int num_streams = idle_stream_flags.size(); + for (; stream_idx < num_streams; stream_idx++) { + if (idle_stream_flags[stream_idx]) { + found_idle_stream = true; + break; + } + } + if (found_idle_stream) { + idle_stream_flags[stream_idx] = false; + return stream_idx; + } else { + add_stream_to_pool(); + return stream_pool.size() - 1; + } +} + +void XPUContext::add_stream_to_pool() { + XPUStream s; + PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_create(&s)); + stream_pool.push_back(s); + idle_stream_flags.push_back(false); +} + +XPUStream XPUContext::get_stream_from_pool(int idx) const { + PADDLE_ENFORCE_GE( + idx, + 0, + errors::InvalidArgument( + "The stream index must be greater than or equal to 0.")); + PADDLE_ENFORCE_LT( + idx, + stream_pool.size(), + errors::InvalidArgument("The stream index should be less than the number " + "of stream used (%d), but got %d", + stream_pool.size(), + idx)); + return stream_pool[idx]; +} + +int XPUContext::get_current_stream_idx() { return current_stream_idx; } void XPUContext::AddStashedMemory(int stream, const DenseTensor& tensor) { 
CheckValidStreamId(stream); impls_[stream]->AddStashedMemory(tensor); } +XPUStream XPUContext::get_current_stream() { return impls_[0]->stream(); } + +XPUStreamHandle* XPUContext::get_current_stream_handle() { + if (impls_[0]->context_->get_stream() == nullptr) { + XPUStream s; + PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_create(&s)); + impls_[0]->SetStream(s); + stream_pool[current_stream_idx] = s; + current_stream_handle.set_stream(s); + } + return ¤t_stream_handle; +} + void XPUContext::Init() { impls_[0]->Init(); } +XPUContext* get_xpu_context(int device_id) { + auto place_tmp = phi::XPUPlace( + device_id > -1 ? device_id : phi::backends::xpu::GetXPUCurrentDeviceId()); + phi::XPUContext* dev_ctx = static_cast( + phi::DeviceContextPool::Instance().Get(place_tmp)); + + return dev_ctx; +} + +XPUStreamHandle::XPUStreamHandle() {} + +XPUStreamHandle::XPUStreamHandle(const int idx) { + auto* dev_ctx = phi::get_xpu_context(); + stream_id = idx; + stream = dev_ctx->get_stream_from_pool(stream_id); +} + +XPUStreamHandle::XPUStreamHandle(const phi::XPUPlace& place) { + phi::XPUContext* dev_ctx = static_cast( + phi::DeviceContextPool::Instance().Get(place)); + stream_id = dev_ctx->get_idle_stream(); + stream = dev_ctx->get_stream_from_pool(stream_id); +} + +XPUStreamHandle::XPUStreamHandle(const XPUStream xpu_stream, const int id) { + stream = xpu_stream; + stream_id = id; +} + +void XPUStreamHandle::Init() { + auto* dev_ctx = phi::get_xpu_context(); + stream_id = dev_ctx->get_idle_stream(); + stream = dev_ctx->get_stream_from_pool(stream_id); +} + +void XPUStreamHandle::wait_event(XPUEvent event) const { + int r = xpu_stream_wait_event(stream, event); + PADDLE_ENFORCE_XRE_SUCCESS(r); +} + +void XPUStreamHandle::synchronize() const { + int r = xpu_wait(stream); + PADDLE_ENFORCE_XRE_SUCCESS(r); +} + +void XPUStreamHandle::set_stream(XPUStream stream_) { stream = stream_; } + +void XPUStreamHandle::record_event(XPUEvent event) const { + int r = xpu_event_record(event, stream); + PADDLE_ENFORCE_XRE_SUCCESS(r); +} + +XPUStreamHandle get_current_stream_handle(int device_id) { + auto* dev_ctx = get_xpu_context(device_id); + return *dev_ctx->get_current_stream_handle(); +} + +XPUStreamHandle get_stream_handle(int device_id) { + auto* dev_ctx = get_xpu_context(device_id); + return XPUStreamHandle(dev_ctx->get_idle_stream()); +} + +void set_current_stream(XPUStreamHandle* s) { + auto* dev_ctx = get_xpu_context(); + dev_ctx->SetStream(s->raw_stream(), 0); +} + +XPUEventPool& XPUEventPool::Instance() { + static XPUEventPool pool; + return pool; +} + +XPUEventPool::~XPUEventPool() { + const auto& DestroyEvent = [](XPUEvent event) { + int r = xpu_event_destroy(event); + PADDLE_ENFORCE_XRE_SUCCESS(r); + }; + const auto& CheckComplishAndDestroy = [&](XPUEvent event) -> bool { + if (xpu_event_query(event) == XPU_SUCCESS) { + DestroyEvent(event); + return true; + } else { + return false; + } + }; + std::unique_lock lock(mtx_); + while (!incomplished_events_.empty()) { + XPUEvent event = incomplished_events_.front(); + if (!CheckComplishAndDestroy(event)) { + LOG(ERROR) << "failed on destroying event when destroying event pool."; + } + incomplished_events_.pop(); + } +} + +XPUEvent XPUEventPool::CreateEventFromPool() { + std::unique_lock lock(mtx_); + + const auto& CreateNewEvent = [&]() -> XPUEvent { + XPUEvent new_event; + PADDLE_ENFORCE_XPU_SUCCESS(xpu_event_create(&new_event)); + incomplished_events_.push(new_event); + return new_event; + }; + + const auto& CreateNewOrReuseEvent = [&]() -> XPUEvent { + XPUEvent 
front_event = incomplished_events_.front(); + incomplished_events_.pop(); + incomplished_events_.push(front_event); + if (xpu_event_query(front_event) == XPU_SUCCESS) { + return front_event; + } + return CreateNewEvent(); + }; + + if (incomplished_events_.empty()) { + return CreateNewEvent(); + } + return CreateNewOrReuseEvent(); +} + +XPUEventHandle::XPUEventHandle() { + event_ = XPUEventPool::Instance().CreateEventFromPool(); +} +XPUEventHandle::XPUEventHandle(XPUStream stream) { + event_ = XPUEventPool::Instance().CreateEventFromPool(); + PADDLE_ENFORCE_XRE_SUCCESS(xpu_event_record(event_, stream)); +} + +void XPUEventHandle::record(XPUStream stream) { + PADDLE_ENFORCE_XRE_SUCCESS(xpu_event_query(event_)); + PADDLE_ENFORCE_XRE_SUCCESS(xpu_event_record(event_, stream)); +} + +bool XPUEventHandle::query() { + int result = xpu_event_query(event_); + if (result == XPU_SUCCESS) { + return true; + } + return false; +} + +void XPUEventHandle::synchronize() { + PADDLE_ENFORCE_XRE_SUCCESS(xpu_event_wait(event_)); +} #if defined(PADDLE_WITH_XPU) XPUPinnedContext::XPUPinnedContext() { eigen_device_ = std::make_unique(); diff --git a/paddle/phi/backends/xpu/xpu_context.h b/paddle/phi/backends/xpu/xpu_context.h index 2a9823d6a8de88..daa4cdd05e3d69 100644 --- a/paddle/phi/backends/xpu/xpu_context.h +++ b/paddle/phi/backends/xpu/xpu_context.h @@ -17,14 +17,14 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU #include +#include +#include #include - #include "paddle/phi/backends/xpu/forwards.h" #include "paddle/phi/backends/xpu/xpu_header.h" #include "paddle/phi/backends/xpu/xpu_info.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/device_context.h" - #ifdef PADDLE_WITH_XPU #include "paddle/phi/core/xpu_cuda_stream.h" #endif @@ -39,6 +39,26 @@ namespace phi { #ifdef PADDLE_WITH_XPU class XPUCUDAStream; +class XPUStreamHandle { + public: + XPUStreamHandle(); + explicit XPUStreamHandle(const int idx); + explicit XPUStreamHandle(const XPUPlace& place); + explicit XPUStreamHandle(const XPUStream xpu_stream, const int id); + + void Init(); + + int id() const { return stream_id; } + XPUStream raw_stream() const { return stream; } + void wait_event(XPUEvent event) const; + void synchronize() const; + void record_event(XPUEvent event) const; + void set_stream(XPUStream stream); + + private: + XPUStream stream; + int stream_id; +}; #endif class DenseTensor; @@ -110,16 +130,65 @@ class XPUContext : public DeviceContext, Eigen::DefaultDevice* eigen_device() const { return nullptr; } XPUStream stream(int i = 0) const; - + XPUStream get_stream_from_pool(int i = 0) const; + XPUStream get_current_stream(); static const char* name() { return "XPUContext"; } + int SetCurrentStream(int idx); + void StreamWaitStreamInPool(int wait_stream, int record_stream) const; + void StreamWaitEventInPool(int wait_stream, XPUEvent event) const; + int get_idle_stream(); + int get_current_stream_idx(); + XPUStreamHandle* get_current_stream_handle(); private: struct Impl; + XPUStreamHandle current_stream_handle; std::vector> impls_; + std::vector idle_stream_flags; + std::vector stream_pool; + int current_stream_idx; + void add_stream_to_pool(); + int get_stream_pool_size() const { return stream_pool.size(); } void CheckValidStreamId(int i) const; + void CheckValidIdxInRange(int idx, int range) const; +}; + +XPUContext* get_xpu_context(int device_id = -1); + +class XPUEventPool { + public: + XPUEventPool() = default; + XPUEventPool(const XPUEventPool&) = delete; + XPUEventPool(XPUEventPool&&) = delete; + 
~XPUEventPool(); + + XPUEvent CreateEventFromPool(); + + static XPUEventPool& Instance(); + + private: + std::queue incomplished_events_; + std::mutex mtx_; }; +class XPUEventHandle { + public: + XPUEventHandle(); + explicit XPUEventHandle(XPUStream stream); + void record(XPUStream stream); + bool query(); + void synchronize(); + XPUEvent get_event() const { return event_; } + + private: + XPUEvent event_; +}; + +XPUStreamHandle get_current_stream_handle(int device_id = -1); +XPUStreamHandle get_stream_handle(int device_id = -1); +void set_current_stream(XPUStreamHandle* s); + // KPS (Kernel PrimitiveS API) needs to exist as a kind of backend, // because we want to implement a KPS-based kernel and make it run // on GPU and XPU at the same time, so we need KPSContext when registering diff --git a/paddle/phi/core/memory/allocation/allocator_facade.cc b/paddle/phi/core/memory/allocation/allocator_facade.cc index 65daaa257c2c5f..5123c6b33b6685 100644 --- a/paddle/phi/core/memory/allocation/allocator_facade.cc +++ b/paddle/phi/core/memory/allocation/allocator_facade.cc @@ -1940,6 +1940,14 @@ void AllocatorFacade::SetDefaultStream(const phi::XPUPlace& place, } #endif +#ifdef PADDLE_WITH_XPU + +bool AllocatorFacade::RecordStream(std::shared_ptr allocation, + XPUStream stream) { + return GetPrivate()->RecordStream(allocation, stream); +} +#endif + #ifdef PADDLE_WITH_CUSTOM_DEVICE uint64_t AllocatorFacade::Release(const phi::CustomPlace& place, phi::stream::stream_t stream) { diff --git a/paddle/phi/core/memory/allocation/allocator_facade.h b/paddle/phi/core/memory/allocation/allocator_facade.h index e46a6f9b13ef52..4b24dfcf57af4a 100644 --- a/paddle/phi/core/memory/allocation/allocator_facade.h +++ b/paddle/phi/core/memory/allocation/allocator_facade.h @@ -97,6 +97,7 @@ class AllocatorFacade { #elif defined(PADDLE_WITH_XPU) TEST_API const std::shared_ptr& GetAllocator( const phi::Place& place, XPUStream stream); + bool RecordStream(std::shared_ptr allocation, XPUStream stream); void SetDefaultStream(const phi::XPUPlace& place, XPUStream stream); #endif diff --git a/paddle/phi/core/memory/allocation/stream_safe_xpu_allocator.cc b/paddle/phi/core/memory/allocation/stream_safe_xpu_allocator.cc index fd30d61d47593e..8cd5471eb0e014 100644 --- a/paddle/phi/core/memory/allocation/stream_safe_xpu_allocator.cc +++ b/paddle/phi/core/memory/allocation/stream_safe_xpu_allocator.cc @@ -38,6 +38,10 @@ StreamSafeXPUAllocation::StreamSafeXPUAllocation( bool StreamSafeXPUAllocation::RecordStream(XPUStream stream) { VLOG(8) << "Try record stream " << stream << " for address " << ptr(); if (stream == owning_stream_) { + VLOG(8) << "stream " << stream << " is the same as owning stream " + << owning_stream_; + VLOG(8) << "Skip recording the same stream " << stream << " for address " + << ptr(); return false; } @@ -57,9 +61,13 @@ bool StreamSafeXPUAllocation::CanBeFreed() { it != outstanding_event_map_.end(); ++it) { XPUEvent& event = it->second; - - PADDLE_ENFORCE_XRE_SUCCESS(xpu_event_destroy(event)); - VLOG(8) << "Destroy event " << event; + if (xpu_event_query(event) == XPU_SUCCESS) { + PADDLE_ENFORCE_XRE_SUCCESS(xpu_event_destroy(event)); + VLOG(8) << "Destroy event " << event; + } else { + outstanding_event_map_.erase(outstanding_event_map_.begin(), it); + return false; + } } return true; } diff --git a/paddle/phi/core/memory/malloc.cc b/paddle/phi/core/memory/malloc.cc index 050a3d2855189b..304a835a5b1b71 100644 --- a/paddle/phi/core/memory/malloc.cc +++ b/paddle/phi/core/memory/malloc.cc @@ -76,6 +76,13 @@ 
diff --git a/paddle/phi/core/memory/malloc.cc b/paddle/phi/core/memory/malloc.cc
index 050a3d2855189b..304a835a5b1b71 100644
--- a/paddle/phi/core/memory/malloc.cc
+++ b/paddle/phi/core/memory/malloc.cc
@@ -76,6 +76,13 @@ gpuStream_t GetStream(const std::shared_ptr<Allocation>& allocation) {
 #endif

+#ifdef PADDLE_WITH_XPU
+bool RecordStream(std::shared_ptr<Allocation> allocation, XPUStream stream) {
+  return allocation::AllocatorFacade::Instance().RecordStream(allocation,
+                                                              stream);
+}
+#endif
+
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
 uint64_t Release(const phi::CustomPlace& place, phi::stream::stream_t stream) {
   return allocation::AllocatorFacade::Instance().Release(place, stream);
diff --git a/paddle/phi/core/memory/malloc.h b/paddle/phi/core/memory/malloc.h
index eea770696608a2..0d064e28b8a119 100644
--- a/paddle/phi/core/memory/malloc.h
+++ b/paddle/phi/core/memory/malloc.h
@@ -22,6 +22,11 @@ limitations under the License. */
 #include "paddle/phi/core/memory/allocation/allocator.h"
 #include "paddle/phi/core/stream.h"

+#ifdef PADDLE_WITH_XPU
+#include "xpu/runtime.h"
+#include "xpu/runtime_ex.h"
+#endif
+
 namespace paddle {
 namespace memory {

@@ -58,6 +63,11 @@ void EraseStream(std::shared_ptr<Allocation> allocation, gpuStream_t stream);

 gpuStream_t GetStream(const std::shared_ptr<Allocation>& allocation);
 #endif
+
+#ifdef PADDLE_WITH_XPU
+bool RecordStream(std::shared_ptr<Allocation> allocation, XPUStream stream);
+#endif
+
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
 extern uint64_t Release(const phi::CustomPlace& place,
                         phi::stream::stream_t stream);
diff --git a/paddle/phi/core/platform/device/xpu/xpu_resource_pool.cc b/paddle/phi/core/platform/device/xpu/xpu_resource_pool.cc
index 8104cbe80514b1..2956043e9bd18c 100644
--- a/paddle/phi/core/platform/device/xpu/xpu_resource_pool.cc
+++ b/paddle/phi/core/platform/device/xpu/xpu_resource_pool.cc
@@ -71,7 +71,12 @@ XpuEventResourcePool::XpuEventResourcePool() {

     auto deleter = [dev_idx](xpuEventHandle event) {
       phi::backends::xpu::XPUDeviceGuard guard(dev_idx);
-      xpu_event_destroy(event);
+      if (xpu_event_query(event) == XPU_SUCCESS) {
+        xpu_event_destroy(event);
+      } else {
+        PADDLE_THROW(phi::errors::InvalidArgument(
+            "The XPU event has not finished and cannot be destroyed."));
+      }
     };

     pool_.emplace_back(ResourcePool::Create(creator, deleter));
diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc
index 98b432e39975a3..50c2d0801f0852 100644
--- a/paddle/phi/infermeta/binary.cc
+++ b/paddle/phi/infermeta/binary.cc
@@ -1911,7 +1911,7 @@ void FusedMatmulInferMeta(const MetaTensor& x,
                           const std::vector<int>& fused_transpose_Y,
                           const std::vector<int>& fused_reshape_Out,
                           const std::vector<int>& fused_transpose_Out,
-                          const std::string& mkldnn_data_type,
+                          const std::string& onednn_data_type,
                           const float scale_x,
                           const float scale_y,
                           const float scale_scale_in_eltwise,
diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h
index 419559f2be0c6d..b8cd51a2d7d052 100644
--- a/paddle/phi/infermeta/binary.h
+++ b/paddle/phi/infermeta/binary.h
@@ -384,7 +384,7 @@ void FusedMatmulInferMeta(const MetaTensor& x,
                           const std::vector<int>& fused_transpose_Y,
                           const std::vector<int>& fused_reshape_Out,
                           const std::vector<int>& fused_transpose_Out,
-                          const std::string& mkldnn_data_type,
+                          const std::string& onednn_data_type,
                           const float scale_x,
                           const float scale_y,
                           const float scale_scale_in_eltwise,
diff --git a/paddle/phi/infermeta/fusion.cc b/paddle/phi/infermeta/fusion.cc
index 0152ddcebc90bf..e7b6980f3b70bf 100644
--- a/paddle/phi/infermeta/fusion.cc
+++ b/paddle/phi/infermeta/fusion.cc
@@ -4800,7 +4800,7 @@ void MultiGruInferMeta(
     const std::string& gate_activation,
     int layers,
     bool origin_mode,
-    const std::string& mkldnn_data_type,
+    const std::string& onednn_data_type,
     float scale_data,
     float shift_data,
     bool force_fp32_output,
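The malloc.h declaration above makes RecordStream part of the public memory API for XPU builds. A hedged usage sketch (illustrative names; assumes the Paddle XPU build, and the phi::Allocation spelling assumes the alias exposed by malloc.h):

    #include "paddle/phi/core/memory/malloc.h"

    // Sketch: read `alloc` on a stream other than the one it was allocated
    // on, then record that stream so the stream-safe allocator defers the
    // free. LaunchReader is a hypothetical kernel launcher, not a Paddle API.
    void ReadOnSecondStream(std::shared_ptr<phi::Allocation> alloc,
                            XPUStream second_stream) {
      LaunchReader(alloc->ptr(), second_stream);           // hypothetical
      paddle::memory::RecordStream(alloc, second_stream);  // from this diff
    }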
diff --git a/paddle/phi/infermeta/fusion.h b/paddle/phi/infermeta/fusion.h
index 6e840d67ead536..a3e6342b09f0a1 100644
--- a/paddle/phi/infermeta/fusion.h
+++ b/paddle/phi/infermeta/fusion.h
@@ -1080,7 +1080,7 @@ void MultiGruInferMeta(
     const std::string& gate_activation,
     int layers,
     bool origin_mode,
-    const std::string& mkldnn_data_type,
+    const std::string& onednn_data_type,
     float scale_data,
     float shift_data,
     bool force_fp32_output,
diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc
index 3e1edcddd48d92..bb10157cfc69da 100644
--- a/paddle/phi/infermeta/multiary.cc
+++ b/paddle/phi/infermeta/multiary.cc
@@ -6098,7 +6098,7 @@ void FusedConvInferMeta(const MetaTensor& input,
                         const std::vector<int>& dilations,
                         int groups,
                         const std::string& data_format,
-                        const std::string& mkldnn_data_type,
+                        const std::string& onednn_data_type,
                         const std::string& fuse_activation,
                         bool fuse_residual_conn,
                         bool force_fp32_output,
diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h
index 4060963ca9e0e9..67027f75097f7e 100644
--- a/paddle/phi/infermeta/multiary.h
+++ b/paddle/phi/infermeta/multiary.h
@@ -1231,7 +1231,7 @@ void FusedConvInferMeta(const MetaTensor& input,
                         const std::vector<int>& dilations,
                         int groups,
                         const std::string& data_format,
-                        const std::string& mkldnn_data_type,
+                        const std::string& onednn_data_type,
                         const std::string& fuse_activation,
                         bool fuse_residual_conn,
                         bool force_fp32_output,
diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h
index 96e93c7a97ff12..167be9f2e0d74e 100644
--- a/paddle/phi/kernels/funcs/broadcast_function.h
+++ b/paddle/phi/kernels/funcs/broadcast_function.h
@@ -584,7 +584,7 @@ static void SliceTensor(DenseTensor *x,
     DenseTensorMeta meta(share->dtype(),
                          new_dim,
                          share->layout(),
-                         offset * SizeOf(share->dtype()));
+                         offset * SizeOf(share->dtype()) + share->offset());
     x->set_meta(meta);
     x->ShareBufferWith(*(share), true);
     x->Resize(new_dim);
diff --git a/paddle/phi/kernels/funcs/reduce_grad_functions.h b/paddle/phi/kernels/funcs/reduce_grad_functions.h
index 27de6176657e7f..e1de295be330b5 100644
--- a/paddle/phi/kernels/funcs/reduce_grad_functions.h
+++ b/paddle/phi/kernels/funcs/reduce_grad_functions.h
@@ -38,10 +38,10 @@ void ReduceGradFunctor(const Context& dev_ctx,
   auto x_dims = input0.dims();
   auto reduced_dims_v = common::vectorize(x_dims);
   std::vector<int> dims_ref = dims;
-  Eigen::array<int, D> broadcast_dim;
+  Eigen::array<int64_t, D> broadcast_dim;
   for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1;

-  int broad_cast_times = 1;
+  int64_t broad_cast_times = 1;
   for (size_t i = 0; i < dims_ref.size(); ++i) {
     if (dims_ref[i] < 0) {
       dims_ref[i] = x_rank + dims_ref[i];
@@ -142,7 +142,7 @@ void LaunchReduceGradKernel(const Context& dev_ctx,
   auto& place = *dev_ctx.eigen_device();
   auto broadcast_dim =
-      Eigen::array<int, 1>({{static_cast<int>(input0->numel())}});
+      Eigen::array<int64_t, 1>({{static_cast<int64_t>(input0->numel())}});
   functor(place,
           &x,
           &x_reduce,
diff --git a/paddle/phi/kernels/gpu/argsort_kernel.cu b/paddle/phi/kernels/gpu/argsort_kernel.cu
index 3de66d3d944ba0..fecd6bb71d3a54 100644
--- a/paddle/phi/kernels/gpu/argsort_kernel.cu
+++ b/paddle/phi/kernels/gpu/argsort_kernel.cu
@@ -198,6 +198,13 @@ void ArgFullSort(const phi::GPUContext& dev_ctx,
                  const int64_t num_rows,
                  const int64_t num_cols,
                  const bool descending) {
+  PADDLE_ENFORCE_LE(num_cols,
+                    std::numeric_limits<int32_t>::max(),
+                    ::common::errors::PreconditionNotMet(
+                        "The dimension being sorted should be less than "
+                        "2^31, but got %lld. Please check the input tensor.",
+                        num_cols));
+
   auto cu_stream = dev_ctx.stream();
   auto ComputeBlockSize = [](IndType col) {
     if (col > 512)
@@ -228,8 +235,14 @@ void ArgFullSort(const phi::GPUContext& dev_ctx,
   const int64_t total_elements = num_cols * num_rows;
   const int64_t segment_size = num_cols;
   const int64_t element_per_call = std::min(max_elements, total_elements);
+
+  // Ensure that each call covers at least one full segment.
+  const int64_t adjusted_elements_per_call =
+      std::max(max_elements, segment_size);
+
   // make sure batch size is the multiple of segment_size
-  const int64_t batch_size = (element_per_call / segment_size) * segment_size;
+  const int64_t batch_size =
+      (adjusted_elements_per_call / segment_size) * segment_size;

   int64_t offset = 0;
   DenseTensor input_indices;
diff --git a/paddle/phi/kernels/gpu/dist_kernel.cu b/paddle/phi/kernels/gpu/dist_kernel.cu
index a55458d59a2d57..1993caec70adb3 100644
--- a/paddle/phi/kernels/gpu/dist_kernel.cu
+++ b/paddle/phi/kernels/gpu/dist_kernel.cu
@@ -63,6 +63,18 @@ struct PowFunctor {
   Ty p_order_;
 };

+template  // Tx is high precision, Tout is low/out precision
+struct PowFunctorHighPrecision {
+  HOSTDEVICE explicit inline PowFunctorHighPrecision(const Ty& p_order)
+      : p_order_(p_order) {}
+  HOSTDEVICE inline Tx operator()(const Tx x) const {
+    return static_cast(pow(static_cast(x), p_order_));
+  }
+  Ty p_order_;
+};
+
 template <typename T, typename Functor>
 __global__ void ReduceSumWithSubtract(
     const T* x, const T* y, T* out, int64_t N, Functor func) {
@@ -126,16 +138,17 @@ void DistKernel(const Context& dev_ctx,
   DenseTensor intermediate;
   const T* x_ptr = x.data<T>();
   const T* y_ptr = y.data<T>();
+  T* o_ptr = dev_ctx.template Alloc<T>(out);
   auto stream = dev_ctx.stream();

   auto xdim = x.dims();
   if (xdim == y.dims()) {  // same shape
-    auto n = x.numel();
+    int64_t n = x.numel();
+
     auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n);
     intermediate.Resize(common::make_ddim({config.block_per_grid.x}));
     T* i_ptr = dev_ctx.template Alloc<T>(&intermediate);
-
     std::vector axis_dims = {static_cast(-1)};
     std::vector reduce_axis =
         funcs::details::GetReduceDim(axis_dims, xdim.size(), true);
@@ -166,15 +179,23 @@ void DistKernel(const Context& dev_ctx,
     ReduceSumWithSubtract<T>
         <<<config.block_per_grid.x, config.thread_per_block.x, 0, stream>>>(
            x_ptr, y_ptr, i_ptr, n, OtherOrderFunctor<T>(p_order));
-    phi::funcs::ReduceKernel>(
-        dev_ctx, intermediate, out, kps::IdentityFunctor(), reduce_axis);
-
-    const DenseTensor* tmp_norm = out;
-    std::vector<const DenseTensor*> ins = {tmp_norm};
+    DenseTensor out_other;
+    out_other.Resize(out->dims());
+    dev_ctx.template Alloc<MT>(&out_other);
+
+    phi::funcs::
+        ReduceKernel>(
+            dev_ctx,
+            intermediate,
+            &out_other,
+            kps::IdentityFunctor(),
+            reduce_axis);
+    std::vector<const DenseTensor*> ins = {&out_other};
     std::vector<DenseTensor*> outs = {out};
-    MT p_order_ = static_cast<MT>(static_cast<MT>(1.)
/ p_order); + + MT p_order_ = static_cast(1.f / p_order); phi::funcs::ElementwiseKernel( - dev_ctx, ins, &outs, PowFunctor(p_order_)); + dev_ctx, ins, &outs, PowFunctorHighPrecision(p_order_)); } } else { diff --git a/paddle/phi/kernels/gpu/p_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/p_norm_grad_kernel.cu index fdfed25b3dda8f..5efd6a36a5399f 100644 --- a/paddle/phi/kernels/gpu/p_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/p_norm_grad_kernel.cu @@ -42,10 +42,12 @@ struct AbsMaxAndMinGradFunctor { template struct PNormGradFunctor { + using MT = typename phi::dtype::MPTypeTrait::Type; HOSTDEVICE explicit inline PNormGradFunctor(float porder, float eps) { - this->porder = static_cast(porder - 1.); - this->eps = static_cast(eps); + this->porder = static_cast(porder - 1.); + this->eps = static_cast(eps); } + template template cast(); + auto y_mt = y->template cast(); + auto dy_mt = dy->template cast(); + + auto norm_pow = y_mt.pow(-this->porder); + auto mask_norm_nonzero = (y_mt != static_cast(0)).template cast(); + + // Set to 0 where porder < 0 and x == 0 + MT zero = static_cast(0); + auto mask_x_zero = (x_mt == zero).template cast(); + + MT is_porder_negative = + this->porder < zero ? static_cast(1) : static_cast(0); + auto invalid_mask = (mask_x_zero * is_porder_negative); + auto safe_pow = + x_mt.abs().pow(this->porder) * (static_cast(1) - invalid_mask); + dx->device(place) = - (*x).abs().pow(this->porder) * (*x).sign() * dy->broadcast(dim) * - (*y + y->constant(eps)).pow(-this->porder).broadcast(dim); + (safe_pow * x_mt.sign() * dy_mt.broadcast(dim) * + norm_pow.broadcast(dim) * + mask_norm_nonzero.broadcast(dim) // Mask out positions where norm == 0 + ) + .template cast(); } - T porder; - T eps; + + MT porder; + MT eps; }; template diff --git a/paddle/phi/kernels/gpu/p_norm_kernel.cu b/paddle/phi/kernels/gpu/p_norm_kernel.cu index 9b0515feb33544..8809b082b7a826 100644 --- a/paddle/phi/kernels/gpu/p_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/p_norm_kernel.cu @@ -124,31 +124,38 @@ void PNormKernel(const Context& dev_ctx, phi::funcs::ElementwiseKernel( dev_ctx, ins, &outs, UnsignedPowFunctor(1. / porder)); #else + DenseTensor out_temp; + out_temp.Resize(out_norm->dims()); + dev_ctx.template Alloc(&out_temp); + if (porder == 1.0) { // fast 1-norm phi::funcs::ReduceKernel>( dev_ctx, *in_x, out_norm, FabsFunctor(), reduce_axis); } else if (porder == 2.0) { // fast 2-norm - phi::funcs::ReduceKernel>( - dev_ctx, *in_x, out_norm, SquareFunctor(), reduce_axis); + phi::funcs::ReduceKernel>( + dev_ctx, *in_x, &out_temp, SquareFunctor(), reduce_axis); } else if (porder == 3.0) { // fast 3-norm - phi::funcs::ReduceKernel>( - dev_ctx, *in_x, out_norm, FabsCubicFunctor(), reduce_axis); + phi::funcs::ReduceKernel>( + dev_ctx, *in_x, &out_temp, FabsCubicFunctor(), reduce_axis); } else { // vanilla norm - phi::funcs::ReduceKernel>( - dev_ctx, *in_x, out_norm, UnsignedPowFunctor(porder), reduce_axis); + phi::funcs::ReduceKernel>( + dev_ctx, + *in_x, + &out_temp, + UnsignedPowFunctor(porder), + reduce_axis); } if (porder != 1.0) { - // save computation when porder is 1.0 - const DenseTensor* tmp_norm = out_norm; - std::vector ins = {tmp_norm}; + std::vector ins = {&out_temp}; std::vector outs = {out_norm}; + MT p_order_ = static_cast(1.f / porder); phi::funcs::ElementwiseKernel( - dev_ctx, ins, &outs, UnsignedPowFunctor(1. 
/ porder)); + dev_ctx, ins, &outs, UnsignedPowFunctor(p_order_)); } #endif } diff --git a/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu b/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu index 91141b09aae8ce..3cd9d0f0aaeb47 100644 --- a/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu @@ -37,10 +37,207 @@ limitations under the License. */ #endif #include "paddle/phi/kernels/full_kernel.h" +#ifdef PADDLE_WITH_CUDNN_FRONTEND +// clang-format off +#include "paddle/phi/backends/dynload/cudnn_frontend.h" +#include "paddle/phi/kernels/autotune/cache.h" +#include "paddle/phi/kernels/gpudnn/conv_cudnn_frontend.h" +// clang-format on +#endif + namespace phi { using GPUDNNDataLayout = phi::backends::gpu::DataLayout; +template +void ConvTransposeCudnnKernelImplV7(const DenseTensor* transformed_x, + const DenseTensor* filter, + const Context& dev_ctx, + const std::vector& strides, + const std::vector& padding_common, + const std::vector& dilations_, + GPUDNNDataLayout data_layout, + GPUDNNDataLayout layout, + bool exhaustive_search, + bool deterministic, + int groups, + DenseTensor* transformed_out) { + int iwo_groups = 1; + int c_groups = groups; + groups = 1; + size_t workspace_size = 0; + + const T* x_data = transformed_x->data(); + const T* filter_data = filter->data(); + T* transformed_out_data = transformed_out->data(); +#ifdef PADDLE_WITH_HIP + miopenConvBwdDataAlgorithm_t algo{}; +#else + cudnnConvolutionBwdDataAlgo_t algo{}; +#endif + // ------------------- cudnn conv algorithm --------------------- + auto handle = dev_ctx.cudnn_handle(); + auto layout_tensor = phi::backends::gpu::GetCudnnTensorFormat(layout); + auto dtype = phi::backends::gpu::CudnnDataType::type; + // ------------------- cudnn descriptors --------------------- + ConvArgs args{handle, + transformed_out, + filter, + transformed_x, + strides, + padding_common, + dilations_, + dtype, + groups, + data_layout}; + args.idesc.set(*transformed_out, iwo_groups); + args.wdesc.set(*filter, layout_tensor, iwo_groups); + args.odesc.set(*transformed_x, iwo_groups); + args.cdesc.set(dtype, + padding_common, + strides, + dilations_, + phi::AllowTF32Cudnn(), + c_groups); + +#ifdef PADDLE_WITH_HIP + SearchResult bwd_result; + using search = SearchAlgorithm; + workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args)); + bwd_result.algo = search::Find( + args, exhaustive_search, deterministic, workspace_size, dev_ctx); +#else + SearchResult bwd_result; + using search = SearchAlgorithm; + bwd_result = + search::Find(dev_ctx, args, exhaustive_search, deterministic, false); + workspace_size = + std::max(workspace_size, search::GetWorkspaceSize(args, bwd_result.algo)); +#endif + + // ------------------- cudnn conv transpose forward --------------------- + int x_offset = transformed_x->numel() / transformed_x->dims()[0] / groups; + int out_offset = + transformed_out->numel() / transformed_out->dims()[0] / groups; + int filter_offset = filter->numel() / groups; + ScalingParamType alpha = 1.0f; + ScalingParamType beta = 0.0f; + auto workspace_handle = dev_ctx.cudnn_workspace_handle(); +#ifdef PADDLE_WITH_HIP + for (int g = 0; g < groups; g++) { + auto cudnn_func = [&](void* cudnn_workspace) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionBackwardData( + handle, + &alpha, + args.odesc.desc(), + x_data + x_offset * g, + args.wdesc.desc(), + filter_data + filter_offset * g, + args.cdesc.desc(), + bwd_result.algo, + &beta, + args.idesc.desc(), + transformed_out_data 
+ out_offset * g, + cudnn_workspace, + workspace_size)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size); + } +#else + ConvRunner::Apply(dev_ctx, + args, + bwd_result, + x_data, + filter_data, + transformed_out_data, + groups, + out_offset, + filter_offset, + x_offset, + workspace_size, + &workspace_handle, + false); +#endif +} +#ifdef PADDLE_WITH_CUDNN_FRONTEND +template +void ConvTransposeCudnnKernelImplV8(const DenseTensor* transformed_x, + const DenseTensor* filter, + const Context& dev_ctx, + const std::vector& strides, + const std::vector& padding_common, + const std::vector& dilations_, + GPUDNNDataLayout data_layout, + GPUDNNDataLayout layout, + bool exhaustive_search, + bool deterministic, + int groups, + DenseTensor* transformed_out) { + auto& plan_cache = phi::autotune::AutoTuneCache::Instance().GetConvV8( + phi::autotune::AlgorithmType::kConvBackwardDataV8); + + T* input_data = const_cast(transformed_x->data()); + T* filter_data = const_cast(filter->data()); + T* output_data = transformed_out->data(); + cudnnHandle_t handle = const_cast(dev_ctx.cudnn_handle()); + auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + + auto layout_format = phi::backends::gpu::GetCudnnTensorFormat(layout); + auto dtype = phi::backends::gpu::CudnnDataType::type; + + float alpha = 1.0f; + float beta = 0.0f; + + using helper = CudnnFrontendConvHelper; + auto op_graph = helper::BuildConvOperationGraph< + CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR>( + transformed_out, + transformed_x, + filter, + layout_format, + strides, + padding_common, + dilations_, + dtype, + handle, + alpha, + beta); + if (plan_cache.FindPlan(op_graph, handle)) { + const cudnn_frontend::ExecutionPlan* cached_plan = nullptr; + int64_t workspace_size = 0; + plan_cache.GetPlanAndWorkspaceSize( + op_graph, &cached_plan, &workspace_size, handle); + helper::ExecutePlan(handle, + &workspace_handle, + output_data, + input_data, + filter_data, + cached_plan->get_raw_desc(), + workspace_size); + return; + } + + auto plans = helper::FindExecutionPlans(&op_graph, + exhaustive_search, + deterministic, + output_data, + input_data, + filter_data, + handle, + &workspace_handle); + + helper::ExecutePlansAndCache(handle, + &workspace_handle, + output_data, + input_data, + filter_data, + &plans, + exhaustive_search, + op_graph, + &plan_cache); +} +#endif + template void ConvTransposeRawGPUDNNKernel(const Context& dev_ctx, const DenseTensor& x, @@ -57,15 +254,28 @@ void ConvTransposeRawGPUDNNKernel(const Context& dev_ctx, dev_ctx, phi::IntArray(common::vectorize(out->dims())), 0, out); return; } + + bool has_exhaustive_search = dev_ctx.HasDnnAttr("exhaustive_search"); + bool exhaustive_search_attr = + has_exhaustive_search + ? PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("exhaustive_search")) + : false; + bool exhaustive_search = + FLAGS_cudnn_exhaustive_search || exhaustive_search_attr; + bool deterministic = FLAGS_cudnn_deterministic; + PADDLE_ENFORCE_EQ(exhaustive_search && deterministic, + false, + common::errors::InvalidArgument( + "Can't set exhaustive_search True and " + "FLAGS_cudnn_deterministic True at same time.")); + std::vector paddings_ = paddings; - std::vector dilations_ = - dilations; // cudnn v5 does not support dilations - const T* filter_data = filter.data(); + std::vector dilations_ = dilations; const GPUDNNDataLayout data_layout = (data_format != "NHWC" ? 
GPUDNNDataLayout::kNCHW : GPUDNNDataLayout::kNHWC); - std::vector x_vec = common::vectorize(x.dims()); - std::vector out_vec = common::vectorize(out->dims()); + std::vector x_vec = common::vectorize(x.dims()); + std::vector out_vec = common::vectorize(out->dims()); // if channel_last, transpose to channel_first DenseTensor x_transpose; if (data_layout == GPUDNNDataLayout::kNHWC) { @@ -106,7 +316,7 @@ void ConvTransposeRawGPUDNNKernel(const Context& dev_ctx, std::vector padding_common(data_dim, 0); if (!is_sys_pad) { std::vector padding_diff(data_dim); - std::vector new_x_shape_vec(data_dim + 2); + std::vector new_x_shape_vec(data_dim + 2); new_x_shape_vec[0] = x_dims[0]; new_x_shape_vec[1] = x_dims[1]; @@ -158,10 +368,9 @@ void ConvTransposeRawGPUDNNKernel(const Context& dev_ctx, axes[i] = i + 2; } - const T* x_data = transformed_x.data(); - x_vec = common::vectorize(transformed_x.dims()); + x_vec = common::vectorize(transformed_x.dims()); - std::vector transformed_out_vec = out_vec; + std::vector transformed_out_vec = out_vec; for (size_t i = 0; i < data_dim; ++i) { transformed_out_vec[i + 2] = out_vec[i + 2] + (x_pad[2 * i + 4] + x_pad[2 * i + 5]) * strides[i] - @@ -177,119 +386,55 @@ void ConvTransposeRawGPUDNNKernel(const Context& dev_ctx, transformed_out.ShareDataWith(*out); transformed_out.Resize(common::make_ddim(transformed_out_vec)); } - T* transformed_out_data = transformed_out.data(); - -#ifndef PADDLE_WITH_HIP - CUDNN_ENFORCE_TENSOR_SIZE_SUPPORTED(transformed_x); - CUDNN_ENFORCE_TENSOR_SIZE_SUPPORTED(filter); - CUDNN_ENFORCE_TENSOR_SIZE_SUPPORTED(transformed_out); -#endif GPUDNNDataLayout layout; - - int iwo_groups = groups; - int c_groups = 1; -#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) - iwo_groups = 1; - c_groups = groups; - groups = 1; -#endif - if (strides.size() == 2U) { layout = GPUDNNDataLayout::kNCHW; } else { layout = GPUDNNDataLayout::kNCDHW; } - size_t workspace_size = 0; -#ifdef PADDLE_WITH_HIP - miopenConvBwdDataAlgorithm_t algo{}; +#ifdef PADDLE_WITH_CUDNN_FRONTEND + if (dynload::IsCudnnFrontendEnabled()) + ConvTransposeCudnnKernelImplV8(&transformed_x, + &filter, + dev_ctx, + strides, + padding_common, + dilations_, + data_layout, + layout, + exhaustive_search, + deterministic, + groups, + &transformed_out); + else + ConvTransposeCudnnKernelImplV7(&transformed_x, + &filter, + dev_ctx, + strides, + padding_common, + dilations_, + data_layout, + layout, + exhaustive_search, + deterministic, + groups, + &transformed_out); #else - cudnnConvolutionBwdDataAlgo_t algo{}; + ConvTransposeCudnnKernelImplV7(&transformed_x, + &filter, + dev_ctx, + strides, + padding_common, + dilations_, + data_layout, + layout, + exhaustive_search, + deterministic, + groups, + &transformed_out); #endif - // ------------------- cudnn conv algorithm --------------------- - auto handle = dev_ctx.cudnn_handle(); - auto layout_tensor = phi::backends::gpu::GetCudnnTensorFormat(layout); - bool deterministic = FLAGS_cudnn_deterministic; - - auto dtype = phi::backends::gpu::CudnnDataType::type; - // ------------------- cudnn descriptors --------------------- - ConvArgs args{handle, - &transformed_out, - &filter, - &transformed_x, - strides, - padding_common, - dilations_, - dtype, - groups, - data_layout}; - args.idesc.set(transformed_out, iwo_groups); - args.wdesc.set(filter, layout_tensor, iwo_groups); - args.odesc.set(transformed_x, iwo_groups); - args.cdesc.set(dtype, - padding_common, - strides, - dilations_, - phi::AllowTF32Cudnn(), - c_groups); - -#ifdef PADDLE_WITH_HIP - 
SearchResult bwd_result; - using search = SearchAlgorithm; - workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args)); - bwd_result.algo = - search::Find(args, false, deterministic, workspace_size, dev_ctx); -#else - SearchResult bwd_result; - using search = SearchAlgorithm; - bwd_result = search::Find(dev_ctx, args, false, deterministic, false); - workspace_size = - std::max(workspace_size, search::GetWorkspaceSize(args, bwd_result.algo)); -#endif - - // ------------------- cudnn conv transpose forward --------------------- - int x_offset = transformed_x.numel() / transformed_x.dims()[0] / groups; - int out_offset = transformed_out.numel() / transformed_out.dims()[0] / groups; - int filter_offset = filter.numel() / groups; - ScalingParamType alpha = 1.0f; - ScalingParamType beta = 0.0f; - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); -#ifdef PADDLE_WITH_HIP - for (int g = 0; g < groups; g++) { - auto cudnn_func = [&](void* cudnn_workspace) { - PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionBackwardData( - handle, - &alpha, - args.odesc.desc(), - x_data + x_offset * g, - args.wdesc.desc(), - filter_data + filter_offset * g, - args.cdesc.desc(), - bwd_result.algo, - &beta, - args.idesc.desc(), - transformed_out_data + out_offset * g, - cudnn_workspace, - workspace_size)); - }; - workspace_handle.RunFunc(cudnn_func, workspace_size); - } -#else // PADDLE_WITH_HIP - ConvRunner::Apply(dev_ctx, - args, - bwd_result, - x_data, - filter_data, - transformed_out_data, - groups, - out_offset, - filter_offset, - x_offset, - workspace_size, - &workspace_handle, - false); -#endif // PADDLE_WITH_HIP if (!is_sys_pad && strides.size() == 2U) { funcs::Slice( diff --git a/paddle/phi/kernels/impl/accuracy_check_kernel_impl.h b/paddle/phi/kernels/impl/accuracy_check_kernel_impl.h index d6b9e1976a1270..4d78b934ab17b1 100644 --- a/paddle/phi/kernels/impl/accuracy_check_kernel_impl.h +++ b/paddle/phi/kernels/impl/accuracy_check_kernel_impl.h @@ -108,7 +108,7 @@ struct AccuracyCheckFunctor> { for (int i = 0; i < num; i++) { out_data[i] = true; } - bool val; + bool val = false; int res_index = -1; for (int i = 0; i < num; i++) { const phi::dtype::complex a = in_a[i], b = in_b[i]; diff --git a/paddle/phi/kernels/impl/einsum_grad_kernel_impl.h b/paddle/phi/kernels/impl/einsum_grad_kernel_impl.h index 0354e28761ab9a..8aa0c09d28ff05 100644 --- a/paddle/phi/kernels/impl/einsum_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/einsum_grad_kernel_impl.h @@ -44,7 +44,7 @@ DenseTensor PerformTileAndReduction(const Context& dev_ctx, std::vector resize_dims; std::vector recover_shape; std::vector t_shape = common::vectorize(t.dims()); - for (int i = 0; i < op_label.size(); i++) { + for (size_t i = 0; i < op_label.size(); i++) { int c = op_label[i]; if (label2type[c] == LabelType::Reduction) { repeat_times.push_back(label2shape[c]); @@ -64,7 +64,7 @@ DenseTensor PerformTileAndReduction(const Context& dev_ctx, "shape size: `%d`, but got label nums: `%d`", t_shape.size(), op_label.size())); - for (int i = 0; i < op_label.size(); i++) { + for (size_t i = 0; i < op_label.size(); i++) { int c = op_label[i]; if (label2type[c] == LabelType::Contraction && t_shape[i] != label2shape[c]) { diff --git a/paddle/phi/ops/yaml/op_compat.yaml b/paddle/phi/ops/yaml/op_compat.yaml index 652cb7b078fa50..1bde7d4f727ce3 100755 --- a/paddle/phi/ops/yaml/op_compat.yaml +++ b/paddle/phi/ops/yaml/op_compat.yaml @@ -21,7 +21,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn 
= false, bool use_onednn = false] - op : accuracy inputs : @@ -42,7 +42,7 @@ out : Out backward : acosh_grad extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : adadelta_ (adadelta) inputs : @@ -104,7 +104,7 @@ attrs : {scale_x : Scale_x, scale_y : Scale_y, scale_out : Scale_out} extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32", bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] complex_promote : [X, Y] @@ -114,7 +114,7 @@ outputs: {out : Out} extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32"] - op : add_position_encoding backward: add_position_encoding_grad @@ -132,7 +132,7 @@ attrs : {alpha : Alpha, beta : Beta} extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] - op : affine_channel backward: affine_channel_grad @@ -163,7 +163,7 @@ out : Out manual_signature : [all] extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] - op : allclose inputs : @@ -187,7 +187,7 @@ attrs: { axis : dim, keepdim : keep_dim } extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] get_expected_kernel_type : amax_grad : GetReduceGradExpectedKernelType manual_signature : [amax] @@ -201,7 +201,7 @@ attrs: { axis : dim, keepdim : keep_dim } extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] get_expected_kernel_type : amin_grad : GetReduceGradExpectedKernelType manual_signature : [amin] @@ -219,7 +219,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] - op : any (reduce_any) inputs : @@ -229,7 +229,7 @@ attrs: { axis : dim, keepdim : keep_dim } extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] get_expected_kernel_type : any : GetReduceOpUseInputPlaceExpectedKernelType manual_signature : [any] @@ -310,7 +310,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : assert inputs : @@ -357,7 +357,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : attention_lstm backward: attention_lstm_grad @@ -381,7 +381,7 @@ attrs : {alpha : Alpha, beta : Beta} extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] - op : barrier inputs : @@ -414,7 +414,7 @@ attrs: data_format: data_layout extra : - attrs : [bool use_mkldnn = false, bool fuse_with_relu = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool fuse_with_relu = false] - op : bce_loss backward : bce_loss_grad @@ -444,7 +444,7 @@ attrs: data_format: data_layout extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] - op : bilinear (bilinear_tensor_product) backward: bilinear_grad (bilinear_tensor_product_grad) @@ -462,7 +462,7 @@ attrs: data_format: data_layout extra : - 
attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32"] - op : bincount inputs : @@ -564,7 +564,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32"] - op : ceil backward : ceil_grad @@ -573,7 +573,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : celu backward : celu_grad, celu_double_grad(celu_grad_grad) @@ -622,7 +622,7 @@ data_type : float tensor_name : Max extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32"] - op : clip_by_norm inputs : @@ -667,7 +667,7 @@ tensor_name : AxisTensor drop_empty_grad : [x_grad] extra : - attrs : [bool use_mkldnn = false, bool use_quantizer = false, str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_quantizer = false, str mkldnn_data_type = "float32"] get_expected_kernel_type : concat : GetConcatExpectedKernelType @@ -689,7 +689,7 @@ outputs : out : Output extra : - attrs : [bool is_test = false, bool use_cudnn = true, bool use_mkldnn = false, bool use_addto = false, + attrs : [bool is_test = false, bool use_cudnn = true, bool use_mkldnn = false, bool use_onednn = false, bool use_addto = false, bool force_fp32_output = false, int workspace_size_MB = phi::backends::gpu::GetDefaultConvWorkspaceSizeLimitMB(), bool exhaustive_search = false, str mkldnn_data_type = "float32"] get_expected_kernel_type : @@ -707,7 +707,7 @@ support_tensor : true extra : inputs : [bias] - attrs : [bool is_test = false, bool use_cudnn = true, bool use_mkldnn = false, bool force_fp32_output = false, + attrs : [bool is_test = false, bool use_cudnn = true, bool use_mkldnn = false, bool use_onednn = false, bool force_fp32_output = false, str mkldnn_data_type = "float32", bool fuse_relu = false, str fuse_activation = "", float fuse_alpha = 0.0f, float fuse_beta = 0.0f, int workspace_size_MB = phi::backends::gpu::GetDefaultConvWorkspaceSizeLimitMB()] @@ -722,7 +722,7 @@ data_type : int support_tensor : true extra : - attrs : [bool is_test = false, bool use_cudnn = false, bool use_mkldnn = true, bool force_fp32_output = false, + attrs : [bool is_test = false, bool use_cudnn = false, bool use_mkldnn = true, bool use_onednn = false, bool force_fp32_output = false, str mkldnn_data_type = "float32", bool fuse_relu = false, str fuse_activation = "", float fuse_alpha = 0.0f, float fuse_beta = 0.0f] @@ -733,7 +733,7 @@ outputs : out : Output extra : - attrs : [bool is_test = false, bool use_cudnn = true, bool use_mkldnn = false, str mkldnn_data_type = "float32", bool fuse_relu = false, + attrs : [bool is_test = false, bool use_cudnn = true, bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32", bool fuse_relu = false, str fuse_activation = "", float fuse_alpha = 0.0f, float fuse_beta = 0.0f, bool use_addto = false, bool fuse_residual_connection = false, bool force_fp32_output = false, int workspace_size_MB = phi::backends::gpu::GetDefaultConvWorkspaceSizeLimitMB(), bool exhaustive_search = false] @@ -747,7 +747,7 @@ outputs : out : Output extra : - attrs : [bool use_cudnn = true, bool use_mkldnn = false, int 
workspace_size_MB = phi::backends::gpu::GetDefaultConvWorkspaceSizeLimitMB()] + attrs : [bool use_cudnn = true, bool use_mkldnn = false, bool use_onednn = false, int workspace_size_MB = phi::backends::gpu::GetDefaultConvWorkspaceSizeLimitMB()] - op : correlation backward : correlation_grad @@ -756,7 +756,7 @@ outputs : out : Output extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] - op : cos backward : cos_grad, cos_double_grad, cos_triple_grad @@ -765,7 +765,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : cosh backward : cosh_grad @@ -774,7 +774,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : crop (crop_tensor) backward : crop_grad (crop_tensor_grad) @@ -837,7 +837,7 @@ - op : data_norm backward : data_norm_grad extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] - op : decode_jpeg inputs : @@ -861,7 +861,7 @@ attrs : {scale_in : Scale_in, scale_out : Scale_out, scale_in_eltwise : Scale_in_eltwise, scale_weights : Scale_weights} extra : - attrs : [bool is_test = false, bool use_cudnn = false, bool fuse_relu_before_depthwise_conv = false, bool use_mkldnn = false, + attrs : [bool is_test = false, bool use_cudnn = false, bool fuse_relu_before_depthwise_conv = false, bool use_mkldnn = false, bool use_onednn = false, bool use_quantizer = false, str mkldnn_data_type = "float32", bool fuse_relu = false, str fuse_activation = "", float fuse_alpha = 0.0f, float fuse_beta = 0.0f, bool use_addto = false, bool fuse_residual_connection = false, float Scale_in = 1.0f, float Scale_out = 1.0f, @@ -882,7 +882,7 @@ support_tensor : true extra : inputs : [bias] - attrs : [bool is_test = false, bool use_cudnn = false, bool use_mkldnn = false, bool force_fp32_output = false, + attrs : [bool is_test = false, bool use_cudnn = false, bool use_mkldnn = false, bool use_onednn = false, bool force_fp32_output = false, str mkldnn_data_type = "float32", bool fuse_relu = false, str fuse_activation = "", float fuse_alpha = 0.0f, float fuse_beta = 0.0f, int workspace_size_MB = phi::backends::gpu::GetDefaultConvWorkspaceSizeLimitMB()] @@ -979,7 +979,7 @@ outputs : out: Out extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32", bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] - op : dot @@ -1069,7 +1069,7 @@ outputs : {out : Out} extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32", bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] complex_promote : [X, Y] manual_signature : [elementwise_pow] @@ -1081,7 +1081,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] - op : embedding (lookup_table_v2) backward : embedding_grad (lookup_table_v2_grad) @@ -1137,7 +1137,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : expand 
(expand_v2) backward : expand_grad (expand_v2_grad), expand_double_grad(expand_v2_double_grad) @@ -1153,7 +1153,7 @@ tensor_name : Shape tensors_name : expand_shapes_tensor extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32"] manual_signature : [expand, expand_grad] - op : expand_as (expand_as_v2) @@ -1170,7 +1170,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : exponential_ (exponential) backward : exponential__grad (exponential_grad) @@ -1280,7 +1280,7 @@ attrs : {scale_in : Scale_in, scale_out : Scale_out, scale_weights : Scale_weights} extra : - attrs : [bool use_mkldnn = false, bool use_quantizer = false, str mkldnn_data_type = "float32", float Scale_in = 1.0f, "float[] Scale_weights = {1.0f}", float Scale_out = 1.0f, bool force_fp32_output = false, str fuse_activation = "" , float fuse_alpha = 0.0f, float fuse_beta = 0.0f, float fused_output_scale = 1.0f, 'int[] fused_reshape2_shape = {}'] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_quantizer = false, str mkldnn_data_type = "float32", float Scale_in = 1.0f, "float[] Scale_weights = {1.0f}", float Scale_out = 1.0f, bool force_fp32_output = false, str fuse_activation = "" , float fuse_alpha = 0.0f, float fuse_beta = 0.0f, float fused_output_scale = 1.0f, 'int[] fused_reshape2_shape = {}'] - op : feed outputs: {out: Out} @@ -1357,7 +1357,7 @@ {start_axis : start_axis, stop_axis : stop_axis} extra : outputs : [xshape] - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32"] manual_signature : [flatten, flatten_grad] - op : flip @@ -1373,7 +1373,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : floor_divide (elementwise_floordiv) inputs : @@ -1381,7 +1381,7 @@ outputs : {out : Out} extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32", bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] complex_promote : [X, Y] manual_signature : [floor_divide] @@ -1393,7 +1393,7 @@ outputs : {out : Out} extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32", bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] complex_promote : [X, Y] manual_signature : [fmax] @@ -1405,7 +1405,7 @@ outputs : {out : Out} extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32", bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] complex_promote : [X, Y] manual_signature : [fmin] @@ -1439,13 +1439,13 @@ frobenius_norm : GetReduceExpectedKernelType frobenius_norm_grad : GetReduceGradExpectedKernelType extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] - op : full (fill_constant) outputs : out : Out extra : - attrs : [bool use_mkldnn = 
false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] - op : full_like (fill_any_like) inputs : @@ -1565,7 +1565,7 @@ {scale_in : Scale_in, scale_out : Scale_out, scale_in_eltwise : Scale_in_eltwise, scale_weights : Scale_weights} extra : attrs : [bool use_cudnn = false, float fuse_alpha = 0.0f, float fuse_beta = 0.0f, float Scale_in = 1.0f, - float Scale_out = 1.0f, float Scale_in_eltwise = 1.0f, 'float[] Scale_weights = {1.0f}', bool use_mkldnn = true, str mkldnn_data_type = "float32"] + float Scale_out = 1.0f, float Scale_in_eltwise = 1.0f, 'float[] Scale_weights = {1.0f}', bool use_mkldnn = true, bool use_onednn = false, str mkldnn_data_type = "float32"] - op : fused_conv2d_add_act inputs : @@ -1577,7 +1577,7 @@ output : Output outputs : Outputs extra : - attrs : [bool is_test = false, bool use_cudnn = true, bool fuse_relu_before_depthwise_conv = false, bool use_mkldnn = false, + attrs : [bool is_test = false, bool use_cudnn = true, bool fuse_relu_before_depthwise_conv = false, bool use_mkldnn = false, bool use_onednn = false, bool use_quantizer = false, str mkldnn_data_type = "float32", bool fuse_relu = false, str fuse_activation = "", float fuse_beta = 0.0f, bool use_addto = false, bool fuse_residual_connection = false, float Scale_in = 1.0f, float Scale_out = 1.0f, @@ -1594,7 +1594,7 @@ {scale_in : Scale_in, scale_out : Scale_out, scale_in_eltwise : Scale_in_eltwise, scale_weights : Scale_weights} extra : attrs : [bool use_cudnn = false, float fuse_alpha = 0.0f, float fuse_beta = 0.0f, float Scale_in = 1.0f, - float Scale_out = 1.0f, float Scale_in_eltwise = 1.0f, 'float[] Scale_weights = {1.0f}', bool use_mkldnn = true, str mkldnn_data_type = "float32"] + float Scale_out = 1.0f, float Scale_in_eltwise = 1.0f, 'float[] Scale_weights = {1.0f}', bool use_mkldnn = true, bool use_onednn = false, str mkldnn_data_type = "float32"] - op : fused_elementwise_add inputs : @@ -1741,7 +1741,7 @@ attrs : {scale_data : Scale_data, shift_data : Shift_data, scale_weights : Scale_weights} extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", float Scale_data = 1.0f, float Shift_data = 0.0f, 'float[] Scale_weights = {1.0f}'] + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32", float Scale_data = 1.0f, float Shift_data = 0.0f, 'float[] Scale_weights = {1.0f}'] - op : fusion_lstm inputs : @@ -1765,7 +1765,7 @@ attrs : {scale_data : Scale_data, shift_data : Shift_data, scale_weights : Scale_weights} extra : - attrs : [bool use_mkldnn = true, str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = true, bool use_onednn = false, str mkldnn_data_type = "float32"] - op : fusion_repeated_fc_relu inputs : @@ -1834,7 +1834,7 @@ tensor_name : ShapeTensor tensors_name : ShapeTensorList extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] manual_signature : [gaussian] - op : gelu @@ -1844,7 +1844,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32"] - op : generate_proposals(generate_proposals_v2) inputs : @@ -1873,7 +1873,7 @@ outputs : {out : Out} extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32", bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] - op : 
graph_khop_sampler @@ -1948,7 +1948,7 @@ out : Out backward : hardswish_grad (hard_swish_grad) extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] manual_signature : [hardswish] - op : hardtanh (brelu) @@ -1973,7 +1973,7 @@ outputs : {out : Out} extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32", bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] complex_promote : [X, Y] @@ -2137,7 +2137,7 @@ out : Out backward : l1_norm_grad extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : label_smooth inputs : @@ -2162,7 +2162,7 @@ mean : Mean variance : Variance extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", bool is_test = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32", bool is_test = false] get_expected_kernel_type : layer_norm : GetLayerNormExpectedKernelType @@ -2175,7 +2175,7 @@ attrs: negative_slope : alpha extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] - op : legacy_bilinear_interp (bilinear_interp) backward : legacy_bilinear_interp_grad (bilinear_interp_grad) @@ -2186,7 +2186,7 @@ attrs: data_format: data_layout extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] - op : legacy_expand (expand) backward : legacy_expand_grad (expand_grad) @@ -2202,7 +2202,7 @@ tensor_name : ExpandTimes tensors_name : expand_times_tensor extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32"] manual_signature : [legacy_expand, legacy_expand_grad] - op : legacy_generate_proposals(generate_proposals) @@ -2236,7 +2236,7 @@ attrs: data_format: data_layout extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] - op : legacy_reshape (reshape) backward : legacy_reshape_grad (reshape_grad) @@ -2251,7 +2251,7 @@ tensor_name : Shape tensors_name : ShapeTensor extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", bool use_quantizer = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32", bool use_quantizer = false] - op : lerp backward : lerp_grad @@ -2287,7 +2287,7 @@ attrs: data_format: data_layout extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] - op : linspace inputs : @@ -2309,7 +2309,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : log10 backward : log10_grad @@ -2318,7 +2318,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : log1p backward : log1p_grad @@ -2327,7 +2327,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : log2 backward : log2_grad @@ -2336,7 +2336,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn 
= false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : log_loss backward : log_loss_grad @@ -2352,7 +2352,7 @@ outputs : out: Out extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] - op : logcumsumexp backward : logcumsumexp_grad @@ -2398,7 +2398,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : logsumexp backward : logsumexp_grad @@ -2424,7 +2424,7 @@ outputs : {out : Out, mid_out : MidOut} extra : - attrs : [bool use_mkldnn = false, bool is_test = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool is_test = false] - op : lstsq inputs : @@ -2473,7 +2473,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", bool force_fp32_output = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32", bool force_fp32_output = false] complex_promote : [X, Y] - op : matmul_with_flatten (mul) @@ -2483,7 +2483,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, float scale_x = 1.0f, 'float[] scale_y = {1.0f}', + attrs : [bool use_mkldnn = false, bool use_onednn = false, float scale_x = 1.0f, 'float[] scale_y = {1.0f}', float scale_out = 1.0f, bool force_fp32_output = false] - op : matrix_nms @@ -2516,7 +2516,7 @@ outputs: out : Out extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] int_array: axis : data_type : int @@ -2549,7 +2549,7 @@ outputs : {out : Out} extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32", bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] complex_promote : [X, Y] manual_signature : [maximum] @@ -2569,7 +2569,7 @@ attrs : {axis : dim, keepdim : keep_dim} extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] int_array: axis : data_type : int @@ -2637,7 +2637,7 @@ attrs: { axis : dim, keepdim : keep_dim} extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] int_array: axis : data_type : int @@ -2654,7 +2654,7 @@ outputs : {out : Out} extra : - attrs : [bool use_mkldnn = false, str x_data_format = "", str y_data_format = "", str mkldnn_data_type = "float32", + attrs : [bool use_mkldnn = false, bool use_onednn = false, str x_data_format = "", str y_data_format = "", str mkldnn_data_type = "float32", bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] complex_promote : [X, Y] manual_signature : [minimum] @@ -2666,7 +2666,7 @@ outputs: out: Out extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] - op : mode backward : mode_grad @@ -2751,7 +2751,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32", bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] - op : mv @@ -2781,7 +2781,7 @@ attrs: data_format: data_layout extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] 
- op : nll_loss backward : nll_loss_grad @@ -2863,7 +2863,7 @@ - op : pad2d backward : pad2d_grad extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] - op : pad3d backward : pad3d_grad, pad3d_double_grad @@ -2878,7 +2878,7 @@ attrs : pad_value : value extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] - op : partial_allgather inputs : @@ -2894,7 +2894,7 @@ out : Out drop_empty_grad : [x_grad] extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] - op : partial_recv outputs : @@ -2908,7 +2908,7 @@ out : Out drop_empty_grad : [x_grad] extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] - op : pixel_shuffle backward : pixel_shuffle_grad @@ -2947,7 +2947,7 @@ pool2d_grad : GetPoolExpectedKernelType pool2d_double_grad : GetPoolDoubleGradExpectedKernelType extra : - attrs : [bool use_mkldnn = false, bool use_quantizer = false, + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_quantizer = false, str mkldnn_data_type = "float32", bool is_test = false] - op : pool3d @@ -2962,7 +2962,7 @@ pool3d : GetPoolExpectedKernelType pool3d_grad : GetPoolExpectedKernelType extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] - op : pow backward : pow_grad, pow_double_grad, pow_triple_grad @@ -2984,7 +2984,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", bool is_test = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32", bool is_test = false] - op : print inputs : @@ -2998,7 +2998,7 @@ outputs : {out: Boxes, var: Variances} extra : - attrs : [bool use_mkldnn = false, bool use_quantizer = false, str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_quantizer = false, str mkldnn_data_type = "float32"] - op : prod (reduce_prod) backward : prod_grad (reduce_prod_grad) @@ -3013,7 +3013,7 @@ data_type : int support_tensor : true extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] get_expected_kernel_type : prod : GetReduceExpectedKernelType prod_grad : GetReduceGradExpectedKernelType @@ -3093,7 +3093,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : relu backward : relu_grad, relu_double_grad (relu_grad_grad) @@ -3102,7 +3102,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false, str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false, str mkldnn_data_type = "float32"] - op : relu6 backward : relu6_grad @@ -3111,7 +3111,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, float threshold = 6.0] + attrs : [bool use_mkldnn = false, bool use_onednn = false, float threshold = 6.0] - op : remainder (elementwise_mod) inputs : @@ -3119,7 +3119,7 @@ outputs : {out : Out} extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32", bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] complex_promote : [X, Y] manual_signature : [remainder] @@ 
-3131,7 +3131,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : repeat_interleave inputs : @@ -3180,7 +3180,7 @@ tensor_name : Shape tensors_name : ShapeTensor extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", bool use_quantizer = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32", bool use_quantizer = false] - op : resnet_basic_block backward: resnet_basic_block_grad @@ -3253,7 +3253,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : row_conv backward : row_conv_grad @@ -3269,7 +3269,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : save_combine inputs : @@ -3289,7 +3289,7 @@ data_type : float support_tensor : false extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32"] - op : scatter backward : scatter_grad @@ -3398,7 +3398,7 @@ get_expected_kernel_type : sgd_ : GetSgdExpectedKernelType extra : - attrs : [bool use_mkldnn=false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] - op : shape inputs : @@ -3408,7 +3408,7 @@ - op : shape extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32"] - op : shard_index inputs : @@ -3443,7 +3443,7 @@ outputs: {out : Out} extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] - op : sigmoid backward : sigmoid_grad, sigmoid_double_grad (sigmoid_grad_grad), sigmoid_triple_grad @@ -3452,7 +3452,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false, str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false, str mkldnn_data_type = "float32"] - op : sign backward : sign_grad @@ -3468,7 +3468,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : sin backward : sin_grad, sin_double_grad, sin_triple_grad @@ -3477,7 +3477,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : sinh backward : sinh_grad @@ -3486,7 +3486,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : slice backward : slice_grad @@ -3495,7 +3495,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32"] int_array : starts : data_type : int @@ -3530,7 +3530,7 @@ softmax : GetSoftmaxExpectedKernelType softmax_grad : GetSoftmaxGradExpectedKernelType extra : - attrs : [str data_format = "AnyLayout", bool use_cudnn = true, bool use_mkldnn = false, str mkldnn_data_type = "float32", bool is_test = false] + attrs : [str 
data_format = "AnyLayout", bool use_cudnn = true, bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32", bool is_test = false] - op : softplus backward : softplus_grad, softplus_double_grad @@ -3539,7 +3539,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : softshrink backward : softshrink_grad @@ -3557,7 +3557,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : solve inputs : @@ -3619,7 +3619,7 @@ data_type : int support_tensor : true extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32"] - op : split_with_num scalar : @@ -3635,7 +3635,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : square backward : square_grad, square_double_grad (square_grad_grad) @@ -3644,7 +3644,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : squeeze (squeeze2) backward : squeeze_grad (squeeze2_grad), squeeze_double_grad(squeeze2_double_grad) @@ -3659,7 +3659,7 @@ data_type : int support_tensor : true extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32"] outputs : [xshape] - op : stack @@ -3669,7 +3669,7 @@ outputs : out : Y extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] drop_empty_grad : [x_grad] - op : stanh @@ -3716,7 +3716,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32", bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] complex_promote : [X, Y] @@ -3729,7 +3729,7 @@ attrs: { axis : dim, keepdim : keep_dim, dtype : out_dtype} extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = false, bool use_onednn = false, str mkldnn_data_type = "float32"] int_array: axis : data_type : int @@ -3753,7 +3753,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, float beta = 1.0] + attrs : [bool use_mkldnn = false, bool use_onednn = false, float beta = 1.0] - op : sync_batch_norm inputs : @@ -3764,7 +3764,7 @@ attrs: data_format: data_layout extra : - attrs : [bool use_mkldnn = false, bool fuse_with_relu = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool fuse_with_relu = false] - op : take_along_axis backward : take_along_axis_grad @@ -3782,7 +3782,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : tanh backward : tanh_grad, tanh_double_grad (tanh_grad_grad), tanh_triple_grad @@ -3791,7 +3791,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - 
op : tanh_shrink backward : tanh_shrink_grad @@ -3800,7 +3800,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false, bool use_cudnn = false] - op : tdm_child inputs : @@ -3808,7 +3808,7 @@ outputs : {child : Child, leaf_mask : LeafMask} extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] - op : tdm_sampler inputs: @@ -3872,7 +3872,7 @@ perm : axis extra : outputs : [XShape] - attrs : [bool use_mkldnn = false, str data_format = "AnyLayout", str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = false, bool use_onednn = false, str data_format = "AnyLayout", str mkldnn_data_type = "float32"] - op : triangular_solve backward : triangular_solve_grad @@ -3897,7 +3897,7 @@ attrs: data_format: data_layout extra : - attrs : [bool use_mkldnn = false] + attrs : [bool use_mkldnn = false, bool use_onednn = false] - op : trunc inputs : diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 9111fe8eda5af1..53680e172adcd6 100644 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -122,6 +122,7 @@ _pir_ops as _pir_ops, _typing as _typing, callbacks as callbacks, + compat as compat, fft as fft, hub as hub, linalg as linalg, @@ -328,6 +329,7 @@ masked_scatter_, moveaxis, put_along_axis, + ravel, repeat_interleave, reshape, reshape_, @@ -579,6 +581,7 @@ kthvalue, masked_select, mode, + msort, nonzero, searchsorted, sort, @@ -879,6 +882,7 @@ 'summary', 'flops', 'sort', + 'msort', 'searchsorted', 'bucketize', 'split', @@ -1092,6 +1096,7 @@ 'std', 'flatten', 'flatten_', + 'ravel', 'asin', 'multiply', 'multiply_', diff --git a/python/paddle/base/core.py b/python/paddle/base/core.py index b6ab11fad00a8c..dc434c2337f96b 100644 --- a/python/paddle/base/core.py +++ b/python/paddle/base/core.py @@ -325,6 +325,8 @@ def to_list(s): _switch_tracer, _test_enforce_gpu_success, _xpu_device_synchronize, + _xpu_get_current_stream, + _xpu_set_current_stream, ) # isort: off diff --git a/python/paddle/base/dygraph/math_op_patch.py b/python/paddle/base/dygraph/math_op_patch.py index bea78c7a528d9b..15270ea89e19b6 100644 --- a/python/paddle/base/dygraph/math_op_patch.py +++ b/python/paddle/base/dygraph/math_op_patch.py @@ -65,6 +65,36 @@ _already_patch_eager_tensor = False +_supported_dtype_conversions = { + # float + 'float16': 'float16', + 'half': 'float16', + 'bfloat16': 'bfloat16', + 'float32': 'float32', + 'float': 'float32', + 'float64': 'float64', + 'double': 'float64', + # int + 'int8': 'int8', + 'char': 'int8', + # We handle uint8 conversion separately + # 'uint8': 'uint8', + # 'byte': 'uint8', + 'int16': 'int16', + 'short': 'int16', + 'int32': 'int32', + 'int': 'int32', + 'int64': 'int64', + 'long': 'int64', + # other + 'bool': 'bool', + 'complex64': 'complex64', + 'complex128': 'complex128', + 'cfloat': 'complex64', + 'cdouble': 'complex128', +} + + def monkey_patch_math_tensor(): """ Similar to monkey_patch_variable. 
@@ -104,6 +134,44 @@ def astype(self: Tensor, dtype: DTypeLike) -> Tensor: return _C_ops.cast(self, dtype) + def byte(self: Tensor) -> Tensor: + # Paddle does not support casting float to uint8 directly, so convert to int8 first + if self.is_floating_point(): + tensor = astype(self, 'int8') + return astype(tensor, 'uint8') + elif self.is_complex(): + real = astype(self.real(), 'int8') + return astype(real, 'uint8') + else: + return astype(self, 'uint8') + + def _create_dtype_conversion_methods(): + """ + Create all data type conversion methods in one batch + """ + methods = [] + + for method_name, target_dtype in _supported_dtype_conversions.items(): + + def make_conversion_method(dtype): + def conversion_method(self: Tensor) -> Tensor: + return astype(self, dtype) + + return conversion_method + + method_impl = make_conversion_method(target_dtype) + method_impl.__name__ = method_name + method_impl.__doc__ = f""" + Cast a Tensor to {target_dtype} data type if it differs from the current dtype; + otherwise, return the original Tensor. + Returns: + Tensor: a new Tensor with {target_dtype} dtype + """ + + methods.append((method_name, method_impl)) + + return methods + def _scalar_elementwise_op_( var: Tensor, scale: float, bias: float ) -> Tensor: @@ -225,6 +293,8 @@ def _mT_(var: Tensor) -> Tensor: ('__len__', _len_), ('__index__', _index_), ('astype', astype), + ('byte', byte), + ('uint8', byte), ('dim', dim), ('ndimension', ndimension), ('ndim', _ndim), @@ -235,6 +305,9 @@ def _mT_(var: Tensor) -> Tensor: ('__array_ufunc__', None), ] + dtype_conversion_methods = _create_dtype_conversion_methods() + eager_methods.extend(dtype_conversion_methods) + eager_cpp_level_patch = [ "__add__", "__radd__", diff --git a/python/paddle/compat.py b/python/paddle/compat.py new file mode 100644 index 00000000000000..d42b733edccc80 --- /dev/null +++ b/python/paddle/compat.py @@ -0,0 +1,21 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
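The eager-mode monkey patch above builds every dtype-conversion alias from one closure factory and routes `byte()`/`uint8()` through the special-cased `byte` helper. A minimal sketch of the resulting Tensor-level behavior, assuming the methods are registered exactly as listed in `eager_methods`:

```python
import paddle

x = paddle.to_tensor([1.25, -2.5])  # float32 by default

# Generated aliases are thin wrappers around astype():
print(x.half().dtype)    # paddle.float16
print(x.double().dtype)  # paddle.float64
print(x.long().dtype)    # paddle.int64

# byte() first casts floating (or complex) inputs to int8 and then
# to uint8, since a direct float -> uint8 cast is unsupported.
print(x.byte().dtype)    # paddle.uint8
```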
+ +from .tensor.compat import ( + split, +) + +__all__ = [ + 'split', +] diff --git a/python/paddle/device/__init__.py b/python/paddle/device/__init__.py index f50db1c25393bf..a0e8264dcf70df 100644 --- a/python/paddle/device/__init__.py +++ b/python/paddle/device/__init__.py @@ -43,8 +43,12 @@ from paddle._typing.device_like import PlaceLike from paddle.base.core import Place - _InitStreamBase = Union[core.CUDAStream, core.CustomDeviceStream] - _InitEventBase = Union[core.CUDAEvent, core.CustomDeviceEvent] + _InitStreamBase = Union[ + core.CUDAStream, core.CustomDeviceStream, core.XPUStream + ] + _InitEventBase = Union[ + core.CUDAEvent, core.CustomDeviceEvent, core.XPUEvent + ] from paddle import CUDAPlace, CustomPlace from paddle.base.libpaddle import _customDeviceProperties @@ -983,6 +987,11 @@ def __init__( self.event_base = core.CUDAEvent( enable_timing, blocking, interprocess ) + elif paddle.is_compiled_with_xpu() and isinstance( + self.device, paddle.XPUPlace + ): + self.event_base = core.XPUEvent() + elif isinstance(self.device, paddle.CustomPlace): self.event_base = core.CustomDeviceEvent( self.device.get_device_type(), @@ -1146,13 +1155,14 @@ def __init__( ) -> None: if stream_base is not None: if isinstance( - stream_base, (core.CUDAStream, core.CustomDeviceStream) + stream_base, + (core.CUDAStream, core.CustomDeviceStream, core.XPUStream), ): self.stream_base = stream_base self.device = stream_base.place else: raise TypeError( - "stream_base should be CUDAStream, CustomDeviceStream" + "stream_base should be CUDAStream, XPUStream, CustomDeviceStream" ) return @@ -1169,6 +1179,10 @@ def __init__( self.stream_base = core.CUDAStream( self.device.get_device_id(), priority ) + elif paddle.is_compiled_with_xpu() and isinstance( + self.device, paddle.XPUPlace + ): + self.stream_base = core.XPUStream(self.device.get_device_id()) elif isinstance(self.device, paddle.CustomPlace): self.stream_base = core.CustomDeviceStream( self.device.get_device_type(), @@ -1314,6 +1328,8 @@ def synchronize(self) -> None: def _as_parameter_(self): if isinstance(self.stream_base, core.CUDAStream): return ctypes.c_void_p(self.stream_base.cuda_stream) + elif isinstance(self.stream_base, core.XPUStream): + return ctypes.c_void_p(self.stream_base.xpu_stream) else: return ctypes.c_void_p(self.stream_base.raw_stream) @@ -1366,6 +1382,10 @@ def current_stream(device: PlaceLike | None = None) -> Stream: return Stream( stream_base=core._get_current_stream(place.get_device_id()) ) + elif paddle.is_compiled_with_xpu() and isinstance(place, paddle.XPUPlace): + return Stream( + stream_base=core._xpu_get_current_stream(place.get_device_id()) + ) elif isinstance(place, paddle.CustomPlace): return Stream( stream_base=core._get_current_custom_device_stream( @@ -1409,6 +1429,10 @@ def set_stream(stream: Stream) -> Stream: stream.stream_base.place, paddle.CUDAPlace ): core._set_current_stream(stream.stream_base) + elif paddle.is_compiled_with_xpu() and isinstance( + stream.stream_base.place, paddle.XPUPlace + ): + core._xpu_set_current_stream(stream.stream_base.idx) elif isinstance(stream.stream_base.place, paddle.CustomPlace): core._set_current_custom_device_stream( stream.stream_base.place.get_device_type(), diff --git a/python/paddle/device/xpu/__init__.py b/python/paddle/device/xpu/__init__.py index 9a48a70e4a7f23..3840c173953dcd 100644 --- a/python/paddle/device/xpu/__init__.py +++ b/python/paddle/device/xpu/__init__.py @@ -20,6 +20,8 @@ from paddle.base import core from paddle.utils import deprecated +from .streams 
import Event, Stream + if TYPE_CHECKING: from paddle import XPUPlace @@ -30,6 +32,8 @@ ] __all__ = [ + 'Stream', + 'Event', 'synchronize', 'device_count', 'set_debug_level', @@ -45,6 +49,45 @@ ] + +def current_stream(device: _XPUPlaceLike | None = None) -> core.XPUStream: + ''' + Return the current XPU stream for the given device. + + Args: + device(paddle.XPUPlace()|int|None, optional): The device, or the ID of the device, from which to get the stream. + If device is None, the current device is used. Default: None. + + Returns: + XPUStream: the current stream of the device. + + Examples: + .. code-block:: python + + >>> # doctest: +REQUIRES(env:XPU) + >>> import paddle + >>> paddle.device.set_device('xpu') + + >>> s1 = paddle.device.xpu.current_stream() + + >>> s2 = paddle.device.xpu.current_stream(0) + + >>> s3 = paddle.device.xpu.current_stream(paddle.XPUPlace(0)) + + ''' + + device_id = -1 + + if device is not None: + if isinstance(device, int): + device_id = device + elif isinstance(device, core.XPUPlace): + device_id = device.get_device_id() + else: + raise ValueError("device type must be int or paddle.XPUPlace") + + return core._xpu_get_current_stream(device_id) + + def extract_xpu_device_id(device: _XPUPlaceLike, op_name: str) -> int: ''' Return the id of the given xpu device. It is just a utility that will not be exposed to users. diff --git a/python/paddle/device/xpu/streams.py b/python/paddle/device/xpu/streams.py new file mode 100644 index 00000000000000..b396c38890e59f --- /dev/null +++ b/python/paddle/device/xpu/streams.py @@ -0,0 +1,18 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.base.core import ( # noqa: F401 + XPUEvent as Event, + XPUStream as Stream, +) diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index c822bb83fafff9..e2e37bf83dd33c 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -59,6 +59,7 @@ class_center_sample, cosine_similarity, dropout, + dropout1d, dropout2d, dropout3d, feature_alpha_dropout, @@ -216,6 +217,7 @@ 'gumbel_softmax', 'sequence_mask', 'dropout', + 'dropout1d', 'dropout2d', 'dropout3d', 'alpha_dropout', diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py index 37cb95ab466089..14dd4a53642d54 100644 --- a/python/paddle/nn/functional/common.py +++ b/python/paddle/nn/functional/common.py @@ -15,6 +15,7 @@ from __future__ import annotations import math +import warnings from typing import TYPE_CHECKING, Literal import numpy @@ -1427,6 +1428,74 @@ def get_attrs(prog, dropout_prob, is_test, seed): return ret + +def dropout1d( + input: paddle.Tensor, + p: float = 0.5, + training: bool = True, + inplace: bool = False, +) -> paddle.Tensor: + """ + Randomly zero out entire 1D channels (feature maps) during training. + + Args: + input: Input tensor of shape [C, L] (2D) or [N, C, L] (3D) + p: Probability of a channel being zeroed.
Default: 0.5 + training: If False, returns input unchanged. Default: True + inplace: If True, modifies input tensor in-place. Default: False + WARNING: Currently not implemented (will behave as False). + TODO: Implement in-place operation in future versions. + + Returns: + Tensor with the same shape as input, where entire channels are zeroed with probability p + + Examples: + .. code-block:: python + + >>> import paddle + + >>> # Case 1: 3D input (batched) + >>> x = paddle.randn([2, 3, 10]) # [N, C, L] + >>> y_train = paddle.nn.functional.dropout1d(x, p=0.2) # Training mode + >>> y_test = paddle.nn.functional.dropout1d(x, p=0.2, training=False) # Test mode + >>> print("Original first channel:", x[0, 0, :]) + >>> print("Train output (may be zeroed):", y_train[0, 0, :]) + >>> print("Test output (always unchanged):", y_test[0, 0, :]) + + >>> # Case 2: 2D input (single sample) + >>> x = paddle.randn([3, 8]) # [C, L] + >>> y = paddle.nn.functional.dropout1d(x, p=0.5) + >>> print("Input shape:", x.shape) + >>> print("Output shape:", y.shape) + >>> print("Zeroed channels count:", paddle.sum(y == 0).item()) + """ + if p < 0 or p > 1: + raise ValueError(f"dropout probability must be in [0, 1], got {p}") + + ndim = input.ndim + if ndim not in [2, 3]: + raise RuntimeError(f"dropout1d expects 2D or 3D input, got {ndim}D") + + if inplace: + warnings.warn( + "inplace=True is currently not supported in dropout1d and will be ignored. " + "This parameter is reserved for future implementation." + ) + # TODO: Implement actual in-place operation when supported by dropout + + need_squeeze = ndim == 2 + if need_squeeze: + input = input.unsqueeze(0) # [C, L] -> [1, C, L] + + # Apply dropout along the channel dimension + result = dropout(input, p=p, axis=1, training=training) + + if need_squeeze: + result = result.squeeze(0) # [1, C, L] -> [C, L] + + return result + + def dropout2d( x: Tensor, p: float = 0.5, diff --git a/python/paddle/pir/math_op_patch.py b/python/paddle/pir/math_op_patch.py index f9f8bfcf733616..4712433d948768 100644 --- a/python/paddle/pir/math_op_patch.py +++ b/python/paddle/pir/math_op_patch.py @@ -37,6 +37,35 @@ DataType.INT64, ] +_supported_dtype_conversions = { + # float + 'float16': 'float16', + 'half': 'float16', + 'bfloat16': 'bfloat16', + 'float32': 'float32', + 'float': 'float32', + 'float64': 'float64', + 'double': 'float64', + # int + 'int8': 'int8', + 'char': 'int8', + # We handle uint8 conversion separately + # 'uint8': 'uint8', + # 'byte': 'uint8', + 'int16': 'int16', + 'short': 'int16', + 'int32': 'int32', + 'int': 'int32', + 'int64': 'int64', + 'long': 'int64', + # other + 'bool': 'bool', + 'complex64': 'complex64', + 'complex128': 'complex128', + 'cfloat': 'complex64', + 'cdouble': 'complex128', +} + SUPPORT_PROMOTION_OPS = [ "__add__", "__radd__", @@ -370,6 +399,41 @@ def astype(self, dtype): return _C_ops.cast(self, dtype) + def byte(self): + # Paddle does not support casting float to uint8 directly, so convert to int8 first + if self.is_floating_point(): + tensor = astype(self, 'int8') + return astype(tensor, 'uint8') + elif self.is_complex(): + real = astype(self.real(), 'int8') + return astype(real, 'uint8') + else: + return astype(self, 'uint8') + + def _create_dtype_conversion_methods(): + """ + Create all data type conversion methods in one batch + """ + methods = [] + for method_name, target_dtype in _supported_dtype_conversions.items(): + + def make_conversion_method(dtype): + def conversion_method(self): + return astype(self, dtype) + + return conversion_method + +
method_impl = make_conversion_method(target_dtype) + method_impl.__name__ = method_name + method_impl.__doc__ = f""" + Cast a Value to {target_dtype} data type if it differs from the current dtype; + otherwise, return the original Value. + Returns: + Value: a new Value with {target_dtype} dtype + """ + methods.append((method_name, method_impl)) + return methods + def _scalar_add_(var, value): return paddle.scale(var, 1.0, value) @@ -1109,6 +1173,8 @@ def register_hook(self, hook): ('ndimension', ndimension), ('ndim', _ndim), ('astype', astype), + ('byte', byte), + ('uint8', byte), ('size', _size_), ('T', _T_), ('mT', _mT_), @@ -1253,6 +1319,8 @@ def register_hook(self, hook): ('__bool__', _bool_), ('__complex__', _complex_), ] + dtype_conversion_methods = _create_dtype_conversion_methods() + value_methods.extend(dtype_conversion_methods) global _already_patch_value if not _already_patch_value: diff --git a/python/paddle/static/quantization/__init__.py b/python/paddle/static/quantization/__init__.py index b04cf7fbb7a297..48b6f518ec67ee 100644 --- a/python/paddle/static/quantization/__init__.py +++ b/python/paddle/static/quantization/__init__.py @@ -19,9 +19,11 @@ ) from .quant2_int8_onednn_pass import ( # noqa: F401 Quant2Int8MkldnnPass, + Quant2Int8OnednnPass, ) from .quant_int8_onednn_pass import ( # noqa: F401 QuantInt8MkldnnPass, + QuantInt8OnednnPass, ) from .quanter import ( # noqa: F401 convert, diff --git a/python/paddle/static/quantization/quant2_int8_onednn_pass.py b/python/paddle/static/quantization/quant2_int8_onednn_pass.py index 72b505d44a6054..966bd511c8df08 100644 --- a/python/paddle/static/quantization/quant2_int8_onednn_pass.py +++ b/python/paddle/static/quantization/quant2_int8_onednn_pass.py @@ -14,13 +14,15 @@ import numpy as np +from paddle.utils import deprecated + from ...base.framework import IrGraph from ...framework import _get_paddle_place, core OpRole = core.op_proto_and_checker_maker.OpRole -class Quant2Int8MkldnnPass: +class Quant2Int8OnednnPass: """ Transform a quant model IrGraph into MKL-DNN supported INT8 IrGraph. 
The pass consists of the following transformations: @@ -429,7 +431,7 @@ def _optimize_fp32_graph(self, graph): graph = self._update_activations(graph) graph = self._remove_ctrl_vars(graph) graph = self._apply_pass( - graph, 'onednn_placement_pass', ['mkldnn_enabled_op_types'], [set()] + graph, 'onednn_placement_pass', ['onednn_enabled_op_types'], [set()] ) # remove dropout ops graph = self._apply_pass(graph, 'simplify_with_basic_ops_pass') @@ -721,3 +723,14 @@ def _quantize_fp32_graph(self, graph): graph = self._apply_pass(graph, 'int8_scale_calculation_onednn_pass') graph = self._apply_pass(graph, 'params_quantization_onednn_pass') return graph + + +class Quant2Int8MkldnnPass(Quant2Int8OnednnPass): + @deprecated( + since="3.1.0", + update_to="paddle.static.quantization.Quant2Int8OnednnPass", + level=1, + reason="Quant2Int8MkldnnPass will be removed in future", + ) + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) diff --git a/python/paddle/static/quantization/quant_int8_onednn_pass.py b/python/paddle/static/quantization/quant_int8_onednn_pass.py index ad706837e0653e..2387e8bd9b70f7 100644 --- a/python/paddle/static/quantization/quant_int8_onednn_pass.py +++ b/python/paddle/static/quantization/quant_int8_onednn_pass.py @@ -14,11 +14,13 @@ import numpy as np +from paddle.utils import deprecated + from ...base.framework import IrGraph from ...framework import _get_paddle_place -class QuantInt8MkldnnPass: +class QuantInt8OnednnPass: """ Convert QuantizationFreezePass generated IrGraph to MKL-DNN supported INT8 IrGraph. Following transformations did in this pass: @@ -48,13 +50,13 @@ def __init__(self, _scope=None, _place=None): >>> # The original graph will be rewrite. >>> import paddle >>> from paddle import static - >>> from paddle.static.quantization import QuantInt8MkldnnPass + >>> from paddle.static.quantization import QuantInt8OnednnPass >>> from paddle.framework import IrGraph >>> from paddle.framework import core >>> graph = IrGraph(core.Graph(static.Program().desc), for_test=False) >>> place = paddle.CPUPlace() - >>> onednn_pass = QuantInt8MkldnnPass(static.global_scope(), place) + >>> onednn_pass = QuantInt8OnednnPass(static.global_scope(), place) >>> onednn_pass.apply(graph) """ @@ -245,7 +247,7 @@ def _transform_to_quantize_onednn(self, graph, op_node): quant_op_node = graph.create_op_node( op_type='quantize', attrs={ - 'data_format': 'MKLDNNLAYOUT', + 'data_format': 'ONEDNNLAYOUT', 'use_mkldnn': 1, 'Scale': scale_in, 'is_negative_input': 1, @@ -287,3 +289,14 @@ def _remove_unused_var_nodes(self, graph): ) ) graph.safe_remove_nodes(all_unused_vars) + + +class QuantInt8MkldnnPass(QuantInt8OnednnPass): + @deprecated( + since="3.1.0", + update_to="paddle.static.quantization.QuantInt8OnednnPass", + level=1, + reason="QuantInt8MkldnnPass will be removed in future", + ) + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index 75d2882a04006f..32425a36ee145d 100644 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -193,6 +193,7 @@ moveaxis, put_along_axis, put_along_axis_, + ravel, repeat_interleave, reshape, reshape_, @@ -459,6 +460,7 @@ kthvalue, masked_select, mode, + msort, nonzero, searchsorted, sort, @@ -726,6 +728,7 @@ 'index_select', 'nonzero', 'sort', + 'msort', 'index_sample', 'mean', 'std', diff --git a/python/paddle/tensor/compat.py b/python/paddle/tensor/compat.py new file mode 100644 index 
00000000000000..a6a755b7025203 --- /dev/null +++ b/python/paddle/tensor/compat.py @@ -0,0 +1,213 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import paddle +from paddle import _C_ops + +from ..base.framework import Variable +from ..framework import ( + in_dynamic_mode, +) + +if TYPE_CHECKING: + from collections.abc import Sequence + + from paddle import Tensor + +from paddle.utils.decorator_utils import ForbidKeywordsDecorator + +__all__ = [] + + +@ForbidKeywordsDecorator( + illegal_keys=["x", "num_or_sections", "axis", "name"], + func_name="paddle.compat.split", + correct_name="paddle.split", +) +def split( + tensor: Tensor, split_size_or_sections: int | Sequence[int], dim: int = 0 +) -> tuple[Tensor, ...]: + """ + (PyTorch Compatible API) Split the input tensor into multiple sub-Tensors. + + Args: + tensor (Tensor): An N-D Tensor. The data type is bool, bfloat16, float16, float32, float64, uint8, int8, int32 or int64. + split_size_or_sections (int|list|tuple): + If split_size_or_sections is an integer type, then tensor will be split into equally sized chunks (if possible). + The last chunk will be smaller if the tensor size along the given dimension dim is not divisible by split_size. + If split_size_or_sections is a list, then tensor will be split into len(split_size_or_sections) chunks with sizes + in dim according to split_size_or_sections. Negative entries are not allowed. For example: for a dim with 9 channels, + [2, 3, -1] will not be interpreted as [2, 3, 4], but will be rejected and an exception will be thrown. + dim (int|Tensor, optional): The dim along which to split; it can be an integer or a ``0-D Tensor`` + with shape [] and data type ``int32`` or ``int64``. + If :math:`dim < 0`, the dim to split along is :math:`rank(x) + dim`. Default is 0. + Returns: + tuple(Tensor), The tuple of segmented Tensors. + + Note: + This is a PyTorch-compatible API that follows the signature and behavior of torch.split. + To use Paddle's original split, please use `paddle.split`. + + Examples: + ..
code-block:: python + + >>> import paddle + + >>> # x is a Tensor of shape [3, 8, 5] + >>> x = paddle.rand([3, 8, 5]) + + >>> out0, out1, out2 = paddle.compat.split(x, split_size_or_sections=3, dim=1) + >>> print(out0.shape) + [3, 3, 5] + >>> print(out1.shape) + [3, 3, 5] + >>> print(out2.shape) + [3, 2, 5] + + >>> out0, out1, out2 = paddle.compat.split(x, split_size_or_sections=[1, 2, 5], dim=1) + >>> print(out0.shape) + [3, 1, 5] + >>> print(out1.shape) + [3, 2, 5] + >>> print(out2.shape) + [3, 5, 5] + + >>> # dim is negative, the real dim is (rank(x) + dim)=1 + >>> out0, out1, out2 = paddle.compat.split(x, split_size_or_sections=3, dim=-2) + >>> print(out0.shape) + [3, 3, 5] + >>> print(out1.shape) + [3, 3, 5] + >>> print(out2.shape) + [3, 2, 5] + """ + + def GetSplitSize(split_size, shape_on_dim): + remaining_num = shape_on_dim % split_size + num_complete_section = shape_on_dim // split_size + if remaining_num == 0: + return num_complete_section + else: + sections = [ + split_size for _ in range(num_complete_section) + ] + sections.append(remaining_num) + return sections + + def GetShapeOnDimInRange(shape, dim: int) -> int: + shape_range = len(shape) + if isinstance(dim, int): + if dim < -shape_range or dim >= shape_range: + raise ValueError( + f"(InvalidArgument) The dim is expected to be in the range [-{shape_range}, {shape_range}), but got {dim}" + ) + return shape[dim] + + if isinstance(split_size_or_sections, (list, tuple)): + for i, section_size in enumerate(split_size_or_sections): + if isinstance(section_size, Variable): + shape_val = int(section_size.item(0)) + else: + shape_val = section_size + if shape_val < 0: + raise ValueError( + f"paddle.compat.split expects split sizes to contain only non-negative entries, but got size = {shape_val} on dim {i}" + ) + + if in_dynamic_mode(): + if isinstance(dim, Variable): + dim = dim.item(0) + assert dim + len(tensor.shape) >= 0, "(rank(x) + dim) must be >= 0" + dim = (dim + len(tensor.shape)) if dim < 0 else dim + + if isinstance(split_size_or_sections, (list, tuple)): + if paddle.utils._contain_var(split_size_or_sections): + for index, item in enumerate(split_size_or_sections): + if isinstance(item, Variable): + split_size_or_sections[index] = split_size_or_sections[ + index + ].item() + elif not isinstance(split_size_or_sections, int): + raise TypeError( + "The type of 'split_size_or_sections' in split must be int, list or tuple in imperative mode, but " + f"received {type(split_size_or_sections)}." + ) + + if isinstance(split_size_or_sections, int): + # check whether the shape is divisible + assert ( + split_size_or_sections > 0 + ), 'split_size_or_sections must be greater than 0.' + + split_size_or_sections = GetSplitSize( + split_size_or_sections, GetShapeOnDimInRange(tensor.shape, dim) + ) + + if isinstance(split_size_or_sections, list): + return tuple(_C_ops.split(tensor, split_size_or_sections, dim)) + else: + return tuple( + _C_ops.split_with_num(tensor, split_size_or_sections, dim) + ) + else: + return tuple(_C_ops.split(tensor, split_size_or_sections, dim)) + else: + if isinstance(dim, paddle.pir.Value): + raise TypeError( + "'dim' is not allowed to be a pir.Value in a static graph: " + "\npir.Value cannot be used for indexing python lists/tuples."
+ ) + if isinstance(dim, int): + assert len(tensor.shape) + dim >= 0, "(rank(x) + dim) must be >= 0" + dim = (len(tensor.shape) + dim) if dim < 0 else dim + + input_shape = tensor.shape + + if not isinstance(split_size_or_sections, (int, list, tuple)): + raise TypeError( + "The type of 'split_size_or_sections' in split must be int, list or tuple in static graph mode." + ) + if isinstance(split_size_or_sections, int): + assert ( + split_size_or_sections > 0 + ), 'split_size_or_sections must be greater than 0.' + + split_size_or_sections = GetSplitSize( + split_size_or_sections, GetShapeOnDimInRange(tensor.shape, dim) + ) + if isinstance(split_size_or_sections, list): + if paddle.utils._contain_var(split_size_or_sections): + split_size_or_sections = paddle.utils.get_int_tensor_list( + split_size_or_sections + ) + return tuple(_C_ops.split(tensor, split_size_or_sections, dim)) + else: + return tuple( + _C_ops.split_with_num(tensor, split_size_or_sections, dim) + ) + else: + if isinstance(dim, int) and input_shape[dim] > 0: + assert ( + len(split_size_or_sections) <= input_shape[dim] + ), 'len(split_size_or_sections) must not be more than input.shape[dim].' + if paddle.utils._contain_var(split_size_or_sections): + split_size_or_sections = paddle.utils.get_int_tensor_list( + split_size_or_sections + ) + return tuple(_C_ops.split(tensor, split_size_or_sections, dim)) diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 6f03d03d47c1d3..2014603dff6ca6 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -24,6 +24,7 @@ import paddle from paddle import _C_ops from paddle.tensor import fill_constant +from paddle.utils.decorator_utils import ParamAliasDecorator from paddle.utils.inplace_utils import inplace_apis_in_dygraph_only from ..base.data_feeder import ( @@ -57,6 +58,8 @@ TensorOrTensors, ) + +from paddle.utils.decorator_utils import ForbidKeywordsDecorator + __all__ = [] @@ -1991,6 +1994,46 @@ def flatten( return out + +def ravel(input: Tensor) -> Tensor: + """ + Return a contiguous flattened tensor. A copy is made only if needed. + Note: + The output Tensor shares data with the input Tensor; no copy is made in ``dygraph`` mode. + If you want a copy, please use `Tensor.clone` like ``ravel_clone_x = x.ravel().clone()``. + For example: + + .. code-block:: text + + Case 1: + Given + X.shape = (3, 100, 100, 4) + + We get: + Out.shape = (3 * 100 * 100 * 4,) = (120000,) + Args: + input (Tensor): An N-D Tensor with data type float16, float32, + float64, int8, int32, int64 or uint8. + + Returns: + Tensor: A 1-D Tensor containing the same data as ``input``, with the same data type. + + Examples: + + ..
code-block:: python + + >>> import paddle + + >>> image_shape=(2, 3, 4, 4) + + >>> x = paddle.arange(end=image_shape[0] * image_shape[1] * image_shape[2] * image_shape[3]) + >>> img = paddle.reshape(x, image_shape) + + >>> out = paddle.ravel(img) + >>> print(out.shape) + [96] + """ + return flatten(input) + + @inplace_apis_in_dygraph_only def flatten_( x: Tensor, start_axis: int = 0, stop_axis: int = -1, name: str | None = None @@ -2682,6 +2725,11 @@ def row_stack(x: Sequence[Tensor], name: str | None = None) -> Tensor: return paddle.vstack(x, name=name) +@ForbidKeywordsDecorator( + illegal_keys=["tensor", "split_size_or_sections", "dim"], + func_name="paddle.split", + correct_name="paddle.compat.split", +) def split( x: Tensor, num_or_sections: int | Sequence[int], @@ -4762,6 +4810,7 @@ def expand_as(x: Tensor, y: Tensor, name: str | None = None) -> Tensor: return out +@ParamAliasDecorator({"x": ["input"], "shape": ["size"]}) def broadcast_to( x: Tensor, shape: ShapeLike, name: str | None = None ) -> Tensor: diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py index 3837d7595f8cc7..6b91b36f40fa3a 100755 --- a/python/paddle/tensor/search.py +++ b/python/paddle/tensor/search.py @@ -676,6 +676,44 @@ def sort( return out +def msort(input: Tensor) -> Tensor: + """ + + Sorts the input along the given axis = 0, and returns the sorted output tensor. The sort algorithm is ascending. + + Args: + input (Tensor): An input N-D Tensor with type float32, float64, int16, + int32, int64, uint8. + + Returns: + Tensor, sorted tensor(with the same shape and data type as ``input``). + + Examples: + + .. code-block:: python + + >>> import paddle + + >>> x = paddle.to_tensor([[[5,8,9,5], + ... [0,0,1,7], + ... [6,9,2,4]], + ... [[5,2,4,2], + ... [4,7,7,9], + ... [1,7,0,6]]], + ... dtype='float32') + >>> out1 = paddle.msort(input=x) + >>> print(out1.numpy()) + [[[5. 2. 4. 2.] + [0. 0. 1. 7.] + [1. 7. 0. 4.]] + [[5. 8. 9. 5.] + [4. 7. 7. 9.] + [6. 9. 2. 6.]]] + """ + + return sort(input, axis=0) + + def mode( x: Tensor, axis: int = -1, keepdim: bool = False, name: str | None = None ) -> tuple[Tensor, Tensor]: diff --git a/python/paddle/utils/__init__.py b/python/paddle/utils/__init__.py index 061568cc0ce9a1..3fbcf6af86b4df 100644 --- a/python/paddle/utils/__init__.py +++ b/python/paddle/utils/__init__.py @@ -15,6 +15,7 @@ from ..base.framework import require_version from . import ( # noqa: F401 cpp_extension, + decorator_utils, dlpack, download, image_util, diff --git a/python/paddle/utils/decorator_utils.py b/python/paddle/utils/decorator_utils.py new file mode 100644 index 00000000000000..4eb62a32602fb0 --- /dev/null +++ b/python/paddle/utils/decorator_utils.py @@ -0,0 +1,136 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
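The two decorators wired up above change the call-site ergonomics of existing APIs: `ParamAliasDecorator` lets `broadcast_to` accept the PyTorch-style `input`/`size` keywords, while `ForbidKeywordsDecorator` makes `paddle.split` fail fast with a pointer to `paddle.compat.split` when torch-style keywords are passed. A minimal sketch of the intended behavior, assuming the decorators act as defined in `decorator_utils.py` below:

```python
import paddle

# ParamAliasDecorator: 'input' and 'size' are remapped to 'x' and 'shape'.
y1 = paddle.broadcast_to(paddle.rand([1, 5]), shape=[3, 5])
y2 = paddle.broadcast_to(input=paddle.rand([1, 5]), size=[3, 5])  # same call

# ForbidKeywordsDecorator: torch-style keywords raise a TypeError
# that suggests paddle.compat.split() instead.
try:
    paddle.split(paddle.rand([3, 8, 5]), split_size_or_sections=3, dim=1)
except TypeError as e:
    print(e)
```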
+ +from __future__ import annotations + +import functools +import inspect +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Generic, + TypeVar, + cast, +) + +from typing_extensions import ParamSpec + +if TYPE_CHECKING: + from collections.abc import Iterable + + +_P = ParamSpec("_P") +_R = TypeVar("_R") +_DecoratedFunc = Callable[_P, _R] + + +class DecoratorBase(Generic[_P, _R]): + """Base class for decorators, providing a generic decoration framework. + + Subclasses only need to implement the 'process' method to define the core logic. + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: + """Initialize decorator parameters""" + self.args = args + self.kwargs = kwargs + + def __call__(self, func: _DecoratedFunc[_P, _R]) -> _DecoratedFunc[_P, _R]: + """Apply the decorator to the target function""" + + @functools.wraps(func) + def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> _R: + # Preprocess the arguments + processed_args, processed_kwargs = self.process(args, kwargs) + # Call the original function + return func(*processed_args, **processed_kwargs) + + # Keep the original signature + wrapper.__signature__ = inspect.signature(func) + return cast("_DecoratedFunc[_P, _R]", wrapper) + + def process( + self, args: tuple[Any, ...], kwargs: dict[str, Any] + ) -> tuple[tuple[Any, ...], dict[str, Any]]: + """Core processing method that subclasses must implement. + + Args: + args: positional arguments + kwargs: keyword arguments + + Returns: + The processed (args, kwargs) tuple + """ + raise NotImplementedError("Subclasses must implement this method") + + +# Parameter alias decorator +class ParamAliasDecorator(DecoratorBase[_P, _R]): + """Decorator that maps parameter aliases to their canonical names""" + + def __init__(self, alias_mapping: dict[str, Iterable[str]]) -> None: + super().__init__() + if not isinstance(alias_mapping, dict): + raise TypeError("alias_mapping must be a dictionary") + for k, v in alias_mapping.items(): + if not isinstance(v, (list, tuple, set)): + raise TypeError(f"Aliases for '{k}' must be iterable") + self.alias_mapping = alias_mapping + + def process( + self, args: tuple[Any, ...], kwargs: dict[str, Any] + ) -> tuple[tuple[Any, ...], dict[str, Any]]: + if not kwargs: + return args, kwargs + processed_kwargs = kwargs.copy() + for original, aliases in self.alias_mapping.items(): + for alias in aliases: + if alias in processed_kwargs: + if original not in processed_kwargs: + processed_kwargs[original] = processed_kwargs.pop(alias) + else: + raise ValueError( + f"Cannot specify both '{original}' and its alias '{alias}'" + ) + return args, processed_kwargs + + +class ForbidKeywordsDecorator(DecoratorBase[_P, _R]): + """A decorator that, when erroneous keyword arguments are detected, points users to the correct `compat` function""" + + def __init__( + self, illegal_keys: list[str] | str, func_name: str, correct_name: str + ) -> None: + super().__init__() + self.illegal_keys = ( + [illegal_keys] if isinstance(illegal_keys, str) else illegal_keys + ) + self.func_name = func_name + self.correct_name = correct_name + + def process( + self, args: tuple[Any, ...], kwargs: dict[str, Any] + ) -> tuple[tuple[Any, ...], dict[str, Any]]: + found_keys = [key for key in self.illegal_keys if key in kwargs] + + if found_keys: + keys_str = ", ".join(f"'{key}'" for key in found_keys) + plural = "s" if len(found_keys) > 1 else "" + + raise TypeError( + f"{self.func_name}() received unexpected keyword argument{plural} {keys_str}.
" + f"\nDid you mean to use {self.correct_name}() instead?" + ) + return args, kwargs diff --git a/python/setup_cinn.py.in b/python/setup_cinn.py.in index 80d639d052ba1d..67dd46b8b52335 100644 --- a/python/setup_cinn.py.in +++ b/python/setup_cinn.py.in @@ -156,6 +156,7 @@ if '${WITH_GPU}' == 'ON': if '${WITH_ROCM}' == 'ON': cinnlibs.append('${CMAKE_BINARY_DIR}/dist/cinn/include/paddle/cinn/runtime/hip/cinn_hip_runtime_source.h') + cinnlibs.append('${CMAKE_BINARY_DIR}/dist/cinn/include/paddle/cinn/runtime/hip/float16.h') if '${CINN_WITH_SYCL}' == 'ON': cinnlibs.append('${CMAKE_BINARY_DIR}/dist/cinn/include/paddle/cinn/runtime/sycl/cinn_sycl_runtime_source.h') diff --git a/test/cpp/inference/api/config_printer.h b/test/cpp/inference/api/config_printer.h index e1b1405a397208..d14a67828a5c1d 100644 --- a/test/cpp/inference/api/config_printer.h +++ b/test/cpp/inference/api/config_printer.h @@ -75,7 +75,7 @@ std::ostream &operator<<(std::ostream &os, const AnalysisConfig &config) { << "cpu_num_threads: " << config.cpu_math_library_num_threads() << "\n"; os << GenSpaces(num_spaces) << "use_tensorrt: " << config.tensorrt_engine_enabled() << "\n"; - os << GenSpaces(num_spaces) << "use_mkldnn: " << config.mkldnn_enabled() + os << GenSpaces(num_spaces) << "use_onednn: " << config.onednn_enabled() << "\n"; num_spaces--; os << GenSpaces(num_spaces) << "}\n"; diff --git a/test/deprecated/cpp/inference/api/analysis_predictor_tester.cc b/test/deprecated/cpp/inference/api/analysis_predictor_tester.cc index e94453a9598855..38978395b5ac7c 100644 --- a/test/deprecated/cpp/inference/api/analysis_predictor_tester.cc +++ b/test/deprecated/cpp/inference/api/analysis_predictor_tester.cc @@ -335,7 +335,7 @@ TEST(AnalysisPredictor, bf16_pass_strategy) { passStrategy.EnableMkldnnBfloat16(); } -TEST(AnalysisPredictor, mkldnn_fc_pass_strategy) { +TEST(AnalysisPredictor, onednn_fc_pass_strategy) { std::vector passes; PassStrategy passStrategy(passes); passStrategy.DisableOnednnFcPasses(); @@ -343,7 +343,7 @@ TEST(AnalysisPredictor, mkldnn_fc_pass_strategy) { } #ifdef PADDLE_WITH_DNNL -TEST(AnalysisPredictor, mkldnn_fc_passes_cpu_pass_strategy) { +TEST(AnalysisPredictor, onednn_fc_passes_cpu_pass_strategy) { CpuPassStrategy cpuPassStrategy; cpuPassStrategy.EnableONEDNN(); const std::vector fc_passes_to_erase( @@ -359,15 +359,15 @@ TEST(AnalysisPredictor, mkldnn_fc_passes_cpu_pass_strategy) { #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -TEST(AnalysisPredictor, mkldnn_fc_passes_gpu_pass_strategy) { +TEST(AnalysisPredictor, onednn_fc_passes_gpu_pass_strategy) { AnalysisConfig config; config.EnableUseGpu(100, 0); config.EnableONEDNN(); config.DisableOnednnFcPasses(); #ifdef PADDLE_WITH_DNNL - ASSERT_TRUE(config.mkldnn_fc_passes_disabled()); + ASSERT_TRUE(config.onednn_fc_passes_disabled()); #else - ASSERT_FALSE(config.mkldnn_fc_passes_disabled()); + ASSERT_FALSE(config.onednn_fc_passes_disabled()); #endif } #endif diff --git a/test/deprecated/cpp/inference/api/analyzer_bert_tester_deprecated.cc b/test/deprecated/cpp/inference/api/analyzer_bert_tester_deprecated.cc index fc85ae1b10f7e2..ec10b780a35eeb 100644 --- a/test/deprecated/cpp/inference/api/analyzer_bert_tester_deprecated.cc +++ b/test/deprecated/cpp/inference/api/analyzer_bert_tester_deprecated.cc @@ -22,13 +22,13 @@ namespace inference { using paddle::PaddleTensor; -void profile(bool use_mkldnn = false, bool use_bfloat16 = false); +void profile(bool use_onednn = false, bool use_bfloat16 = false); std::vector> LoadInputData(); -void 
CompareNativeAndAnalysisWrapper(bool use_mkldnn = false); +void CompareNativeAndAnalysisWrapper(bool use_onednn = false); std::vector ParseInputStreamToVector( const std::string &line); -AnalysisConfig SetConfig(bool use_mkldnn = false, bool use_bfloat16 = false); +AnalysisConfig SetConfig(bool use_onednn = false, bool use_bfloat16 = false); template paddle::PaddleTensor ParseTensor(const std::string &field); @@ -50,15 +50,15 @@ TEST(Analyzer_bert, profile) { } #ifdef PADDLE_WITH_DNNL -TEST(Analyzer_bert, profile_mkldnn) { - auto use_mkldnn = true; - profile(use_mkldnn); +TEST(Analyzer_bert, profile_onednn) { + auto use_onednn = true; + profile(use_onednn); } -TEST(Analyzer_bert, profile_mkldnn_bf16) { - auto use_mkldnn = true; +TEST(Analyzer_bert, profile_onednn_bf16) { + auto use_onednn = true; auto use_bfloat16 = true; - profile(use_mkldnn, use_bfloat16); + profile(use_onednn, use_bfloat16); } #endif @@ -70,8 +70,8 @@ TEST(Analyzer_bert, compare) { } #ifdef PADDLE_WITH_DNNL TEST(Analyzer_bert, compare_mkldnn) { - auto use_mkldnn = true; - CompareNativeAndAnalysisWrapper(use_mkldnn); + auto use_onednn = true; + CompareNativeAndAnalysisWrapper(use_onednn); } #endif @@ -135,8 +135,8 @@ TEST(Analyzer_bert, transfer_scope_cache) { "The size of data cache is not equal to thread number.")); } -void profile(bool use_mkldnn, bool use_bfloat16) { - auto config(SetConfig(use_mkldnn, use_bfloat16)); +void profile(bool use_onednn, bool use_bfloat16) { + auto config(SetConfig(use_onednn, use_bfloat16)); std::vector> outputs; auto inputs = LoadInputData(); TestPrediction(reinterpret_cast(&config), @@ -168,8 +168,8 @@ std::vector> LoadInputData() { return inputs; } -void CompareNativeAndAnalysisWrapper(bool use_mkldnn) { - auto cfg(SetConfig(use_mkldnn)); +void CompareNativeAndAnalysisWrapper(bool use_onednn) { + auto cfg(SetConfig(use_onednn)); auto inputs = LoadInputData(); CompareNativeAndAnalysis( reinterpret_cast(&cfg), inputs); @@ -201,12 +201,12 @@ std::vector ParseInputStreamToVector( return tensors; } -AnalysisConfig SetConfig(bool use_mkldnn, bool use_bfloat16) { +AnalysisConfig SetConfig(bool use_onednn, bool use_bfloat16) { AnalysisConfig config; config.SetModel(FLAGS_infer_model); config.DisableFCPadding(); - if (use_mkldnn) { + if (use_onednn) { config.EnableONEDNN(); } diff --git a/test/deprecated/ir/inference/auto_scan_test.py b/test/deprecated/ir/inference/auto_scan_test.py index 041f84d50b804b..752b5f32d011ba 100755 --- a/test/deprecated/ir/inference/auto_scan_test.py +++ b/test/deprecated/ir/inference/auto_scan_test.py @@ -239,7 +239,7 @@ def create_inference_config( if use_gpu: config.enable_use_gpu(100, 0) if not use_mkldnn: - config.disable_mkldnn() + config.disable_onednn() if use_xpu: config.enable_xpu() if passes is not None: @@ -248,7 +248,7 @@ def create_inference_config( return config -class MkldnnAutoScanTest(AutoScanTest): +class OnednnAutoScanTest(AutoScanTest): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -336,14 +336,14 @@ def run_test(self, quant=False, *args, **kwargs): def inference_config_str(self, config) -> str: dic = {} - enable_mkldnn = config.mkldnn_enabled() - dic["use_mkldnn"] = enable_mkldnn + enable_onednn = config.onednn_enabled() + dic["use_mkldnn"] = enable_onednn enable_gpu = config.use_gpu() dic["use_gpu"] = enable_gpu return str(dic) -class PirMkldnnAutoScanTest(MkldnnAutoScanTest): +class PirOnednnAutoScanTest(OnednnAutoScanTest): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -572,8 +572,8 @@ 
def run_test(self, quant=False, prog_configs=None): def inference_config_str(self, config) -> str: dic = {} - enable_mkldnn = config.mkldnn_enabled() - dic["use_mkldnn"] = enable_mkldnn + enable_onednn = config.onednn_enabled() + dic["use_mkldnn"] = enable_onednn enable_gpu = config.use_gpu() dic['use_gpu'] = enable_gpu enable_xpu = config.use_xpu() diff --git a/test/deprecated/ir/inference/inference_pass_test.py b/test/deprecated/ir/inference/inference_pass_test.py index 958fd0d4571d29..739716382f50bd 100644 --- a/test/deprecated/ir/inference/inference_pass_test.py +++ b/test/deprecated/ir/inference/inference_pass_test.py @@ -38,7 +38,7 @@ def __init__(self, methodName='runTest'): self.fetch_list = None self.enable_mkldnn = False - self.enable_mkldnn_bfloat16 = False + self.enable_onednn_bfloat16 = False self.enable_trt = False self.enable_tensorrt_varseqlen = False self.trt_parameters = None @@ -143,7 +143,7 @@ def _get_analysis_config( self.path + ".pdmodel", self.path + ".pdiparams" ) config.disable_gpu() - config.disable_mkldnn() + config.disable_onednn() config.switch_specify_input_names(True) config.switch_ir_optim(True) config.switch_use_feed_fetch_ops(False) @@ -178,9 +178,9 @@ def _get_analysis_config( config.enable_tensorrt_varseqlen() elif use_mkldnn: - config.enable_mkldnn() - if self.enable_mkldnn_bfloat16: - config.enable_mkldnn_bfloat16() + config.enable_onednn() + if self.enable_onednn_bfloat16: + config.enable_onednn_bfloat16() return config def check_output(self, atol=1e-3): @@ -285,23 +285,23 @@ def check_output_with_option( # Check whether the onednn results and the CPU results are the same. if (not use_gpu) and self.enable_mkldnn: - mkldnn_outputs = self._get_inference_outs( + onednn_outputs = self._get_inference_outs( self._get_analysis_config( use_gpu=use_gpu, use_mkldnn=self.enable_mkldnn ) ) self.assertTrue( - len(paddle_outs) == len(mkldnn_outputs), + len(paddle_outs) == len(onednn_outputs), "The number of outputs is different between CPU and MKLDNN. ", ) - if self.enable_mkldnn_bfloat16: + if self.enable_onednn_bfloat16: atol = 0.01 - for paddle_out, mkldnn_output in zip(paddle_outs, mkldnn_outputs): + for paddle_out, onednn_output in zip(paddle_outs, onednn_outputs): np.testing.assert_allclose( np.array(paddle_out), - mkldnn_output, + onednn_output, rtol=1e-05, atol=atol, err_msg='Output has diff between CPU and MKLDNN. 
', diff --git a/test/deprecated/ir/inference/quant_dequant_test.py b/test/deprecated/ir/inference/quant_dequant_test.py index 725f0948266dd3..69f2ddfaaa4fda 100644 --- a/test/deprecated/ir/inference/quant_dequant_test.py +++ b/test/deprecated/ir/inference/quant_dequant_test.py @@ -47,7 +47,7 @@ def __init__(self, methodName='runTest'): self.feeds = None self.fetch_list = None self.enable_mkldnn = False - self.enable_mkldnn_bfloat16 = False + self.enable_onednn_bfloat16 = False self.enable_trt = False self.enable_tensorrt_varseqlen = True self.trt_parameters = None @@ -204,7 +204,7 @@ def _get_analysis_config( self.path + ".pdmodel", self.path + ".pdiparams" ) config.disable_gpu() - config.disable_mkldnn() + config.disable_onednn() config.switch_specify_input_names(True) config.switch_ir_optim(True) config.switch_use_feed_fetch_ops(False) @@ -231,9 +231,9 @@ def _get_analysis_config( config.enable_tensorrt_varseqlen() elif use_mkldnn: - config.enable_mkldnn() - if self.enable_mkldnn_bfloat16: - config.enable_mkldnn_bfloat16() + config.enable_onednn() + if self.enable_onednn_bfloat16: + config.enable_onednn_bfloat16() return config def check_output_with_option( @@ -388,23 +388,23 @@ def check_output_with_option( # Check whether the onednn results and the CPU results are the same. if (not use_gpu) and self.enable_mkldnn: - mkldnn_outputs = self._get_inference_outs( + onednn_outputs = self._get_inference_outs( self._get_analysis_config( use_gpu=use_gpu, use_mkldnn=self.enable_mkldnn ) ) self.assertTrue( - len(paddle_outs) == len(mkldnn_outputs), + len(paddle_outs) == len(onednn_outputs), "The number of outputs is different between CPU and MKLDNN. ", ) - if self.enable_mkldnn_bfloat16: + if self.enable_onednn_bfloat16: atol = 0.01 - for paddle_out, mkldnn_output in zip(paddle_outs, mkldnn_outputs): + for paddle_out, onednn_output in zip(paddle_outs, onednn_outputs): np.testing.assert_allclose( np.array(paddle_out), - mkldnn_output, + onednn_output, rtol=1e-05, atol=atol, err_msg='Output has diff between CPU and MKLDNN. 
', diff --git a/test/deprecated/ir/inference/test_trt_inference_predictor_deprecated.py b/test/deprecated/ir/inference/test_trt_inference_predictor_deprecated.py index 7a8dc3b1a235f9..5f2f954479678a 100644 --- a/test/deprecated/ir/inference/test_trt_inference_predictor_deprecated.py +++ b/test/deprecated/ir/inference/test_trt_inference_predictor_deprecated.py @@ -98,14 +98,14 @@ def load(self, config_arg, inputs=None, outputs=None): if self.args.enable_mkldnn and not self.args.enable_gpu: config.disable_gpu() - config.enable_mkldnn() + config.enable_onednn() if self.args.precision == 'int8': - config.enable_mkldnn_int8( + config.enable_onednn_int8( {"conv2d", "depthwise_conv2d", "transpose2", "pool2d"} ) if not self.args.enable_mkldnn and not self.args.enable_gpu: config.disable_gpu() - # config.enable_mkldnn() + # config.enable_onednn() if self.args.enable_profile: config.enable_profile() shape_range_file = os.path.join( diff --git a/test/deprecated/legacy_test/test_attribute_var_deprecated.py b/test/deprecated/legacy_test/test_attribute_var_deprecated.py index 8f6e2b7091e3b7..0d041549188a20 100644 --- a/test/deprecated/legacy_test/test_attribute_var_deprecated.py +++ b/test/deprecated/legacy_test/test_attribute_var_deprecated.py @@ -51,7 +51,7 @@ def infer_prog(self): config = paddle_infer.Config( self.save_path + '.pdmodel', self.save_path + '.pdiparams' ) - config.disable_mkldnn() + config.disable_onednn() predictor = paddle_infer.create_predictor(config) input_names = predictor.get_input_names() for i, shape in enumerate(self.shapes): diff --git a/test/deprecated/mkldnn/test_mkldnn_cpu_bfloat16_pass_deprecated.py b/test/deprecated/mkldnn/test_mkldnn_cpu_bfloat16_pass_deprecated.py index b9e6379945aa76..f555bd7ff11ad7 100644 --- a/test/deprecated/mkldnn/test_mkldnn_cpu_bfloat16_pass_deprecated.py +++ b/test/deprecated/mkldnn/test_mkldnn_cpu_bfloat16_pass_deprecated.py @@ -54,7 +54,7 @@ def init_data(self): self.shape_x = [12, 10, 1] self.shape_y = [12, 1, 64] self.enable_mkldnn = True - self.enable_mkldnn_bfloat16 = True + self.enable_onednn_bfloat16 = True def test_check_output(self): use_gpu = False diff --git a/test/deprecated/quantization/CMakeLists.txt b/test/deprecated/quantization/CMakeLists.txt index 009c4df6cdd863..c5b4d9d3a67137 100644 --- a/test/deprecated/quantization/CMakeLists.txt +++ b/test/deprecated/quantization/CMakeLists.txt @@ -157,7 +157,7 @@ function(inference_quant2_int8_lstm_model_test target fp32_model quant_model ${dataset_path} --num_threads 1 - --mkldnn_cache_capacity + --onednn_cache_capacity 100 --warmup_iter 100 diff --git a/test/deprecated/quantization/test_quant2_int8_mkldnn_pass_deprecated.py b/test/deprecated/quantization/test_quant2_int8_mkldnn_pass_deprecated.py index 393196a971766d..d7221b53ecbd50 100644 --- a/test/deprecated/quantization/test_quant2_int8_mkldnn_pass_deprecated.py +++ b/test/deprecated/quantization/test_quant2_int8_mkldnn_pass_deprecated.py @@ -19,12 +19,12 @@ import paddle from paddle.base.framework import IrGraph from paddle.framework import core -from paddle.static.quantization import Quant2Int8MkldnnPass +from paddle.static.quantization import Quant2Int8OnednnPass paddle.enable_static() -class TestQuant2Int8MkldnnPassMul(unittest.TestCase): +class TestQuant2Int8OnednnPassMul(unittest.TestCase): def op_name(self): return "mul" @@ -80,7 +80,7 @@ def test_dequantize_op_weights(self): break assert op_node != "", f"op of type {self.op_name()} not found" - qpass = Quant2Int8MkldnnPass( + qpass = Quant2Int8OnednnPass( 
self.quantized_ops, _scope=self.scope, _place=self.place, @@ -125,12 +125,12 @@ def test_dequantize_op_weights(self): qpass._dequantize_op_weights(graph, op_node, "Y", "Out") -class TestQuant2Int8MkldnnPassMatmulV2(TestQuant2Int8MkldnnPassMul): +class TestQuant2Int8OnednnPassMatmulV2(TestQuant2Int8OnednnPassMul): def op_name(self): return "matmul_v2" -class TestQuant2Int8MkldnnPassConv2D(unittest.TestCase): +class TestQuant2Int8OnednnPassConv2D(unittest.TestCase): def setUp(self): self.scope = paddle.static.global_scope() self.place = paddle.CPUPlace() @@ -225,17 +225,17 @@ def test_quant_update_activation(self): graph = IrGraph(core.Graph(program.desc), for_test=True) graph = self.remove_fuse_activation_attribute(graph) self.check_graph_before_pass(graph) - quant2_int8_mkldnn_pass = Quant2Int8MkldnnPass( + quant2_int8_onednn_pass = Quant2Int8OnednnPass( self.quantized_ops, _scope=self.scope, _place=self.place, _core=core, _debug=False, ) - graph = quant2_int8_mkldnn_pass._update_activations(graph) + graph = quant2_int8_onednn_pass._update_activations(graph) self.check_graph_after_pass(graph) - class TestQuant2Int8MkldnnPassNearestInterp(unittest.TestCase): + class TestQuant2Int8OnednnPassNearestInterp(unittest.TestCase): def op_name(self): return "nearest_interp" @@ -357,7 +357,7 @@ def test_quant_update_activation(self): with paddle.static.program_guard(program): self.prepare_program(program) graph = IrGraph(core.Graph(program.desc), for_test=True) - quant2_int8_mkldnn_pass = Quant2Int8MkldnnPass( + quant2_int8_onednn_pass = Quant2Int8OnednnPass( self.quantized_ops, _scope=self.scope, _place=self.place, @@ -366,12 +366,12 @@ def test_quant_update_activation(self): ) input_scale_tensor = ( - quant2_int8_mkldnn_pass._convert_scale2tensor( + quant2_int8_onednn_pass._convert_scale2tensor( np.array(self.scale).astype(np.float64) ) ) output_scale_tensor = ( - quant2_int8_mkldnn_pass._convert_scale2tensor( + quant2_int8_onednn_pass._convert_scale2tensor( np.array(1.0 / self.scale * self.scale).astype( np.float64 ) @@ -383,12 +383,12 @@ def test_quant_update_activation(self): "conv_output": (False, output_scale_tensor), } if core.avx_supported(): - quant2_int8_mkldnn_pass._var_quant_scales = var_scale - graph = quant2_int8_mkldnn_pass._propagate_scales(graph) - graph = quant2_int8_mkldnn_pass._quantize_fp32_graph(graph) + quant2_int8_onednn_pass._var_quant_scales = var_scale + graph = quant2_int8_onednn_pass._propagate_scales(graph) + graph = quant2_int8_onednn_pass._quantize_fp32_graph(graph) self.check_graph_after_pass(graph) - class TestQuant2Int8MkldnnPassNearestInterpV2(unittest.TestCase): + class TestQuant2Int8OnednnPassNearestInterpV2(unittest.TestCase): def op_name(self): return "nearest_interp_v2" diff --git a/test/deprecated/quantization/test_quantization_mkldnn_pass_deprecated.py b/test/deprecated/quantization/test_quantization_mkldnn_pass_deprecated.py index 2bdbed71f72e07..addd9aad1179b9 100644 --- a/test/deprecated/quantization/test_quantization_mkldnn_pass_deprecated.py +++ b/test/deprecated/quantization/test_quantization_mkldnn_pass_deprecated.py @@ -22,7 +22,7 @@ from paddle.base.framework import IrGraph from paddle.framework import core from paddle.static.quantization import ( - QuantInt8MkldnnPass, + QuantInt8OnednnPass, QuantizationFreezePass, QuantizationTransformPass, ) @@ -98,7 +98,7 @@ def build_program(self, main, startup, is_test, seed): opt.minimize(loss) return [img, label], loss - def mkldnn_based_freeze_graph( + def onednn_based_freeze_graph( self, use_cuda, seed, 
@@ -174,8 +174,8 @@ def mkldnn_based_freeze_graph( freeze_pass.apply(test_graph) # Transform quantized graph for MKL-DNN INT8 inference - mkldnn_int8_pass = QuantInt8MkldnnPass(_scope=scope, _place=place) - mkldnn_int8_pass.apply(test_graph) + onednn_int8_pass = QuantInt8OnednnPass(_scope=scope, _place=place) + onednn_int8_pass.apply(test_graph) dev_name = '_cpu_' if not for_ci: marked_nodes = set() @@ -191,7 +191,7 @@ def mkldnn_based_freeze_graph( + weight_quant_type, marked_nodes, ) - mkldnn_program = test_graph.to_program() + onednn_program = test_graph.to_program() # Check the transformation weights of conv2d and mul conv_w_mkldnn = np.array(scope.find_var('conv2d_1.w_0').get_tensor()) @@ -202,7 +202,7 @@ def mkldnn_based_freeze_graph( # Check if the conv2d output and mul output are correctly linked to fake_dequantize's # output - self.check_program(mkldnn_program) + self.check_program(onednn_program) if not for_ci: print( '{}: {}'.format( @@ -215,16 +215,16 @@ def mkldnn_based_freeze_graph( ) ) - def test_mkldnn_graph_cpu_static(self): + def test_onednn_graph_cpu_static(self): with paddle.utils.unique_name.guard(): - self.mkldnn_based_freeze_graph( + self.onednn_based_freeze_graph( False, seed=2, activation_quant_type='range_abs_max', weight_quant_type='abs_max', for_ci=True, ) - self.mkldnn_based_freeze_graph( + self.onednn_based_freeze_graph( False, seed=2, activation_quant_type='moving_average_abs_max', diff --git a/test/deprecated/tokenizer/test_faster_tokenizer_op_deprecated.py b/test/deprecated/tokenizer/test_faster_tokenizer_op_deprecated.py index 5b147c409067fc..89702aa04b162c 100755 --- a/test/deprecated/tokenizer/test_faster_tokenizer_op_deprecated.py +++ b/test/deprecated/tokenizer/test_faster_tokenizer_op_deprecated.py @@ -139,7 +139,7 @@ def __init__(self, model_dir): # fast_tokenizer op only support cpu. 
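
The hunks above and below all follow one mechanical pattern: the legacy "mkldnn"-spelled methods on paddle_infer.Config are renamed to their "onednn" equivalents. A minimal sketch of the updated predictor setup, using only method names that appear in the added lines of this patch; the model file paths are placeholders:

    import paddle.inference as paddle_infer

    # placeholders for real model files
    config = paddle_infer.Config('model.pdmodel', 'model.pdiparams')
    config.disable_gpu()
    config.enable_onednn()            # was: config.enable_mkldnn()
    # optional low-precision paths (pick one), renamed the same way:
    config.enable_onednn_bfloat16()   # was: config.enable_mkldnn_bfloat16()
    # config.enable_onednn_int8({"conv2d", "depthwise_conv2d", "transpose2", "pool2d"})
    predictor = paddle_infer.create_predictor(config)
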
config.disable_gpu() - config.disable_mkldnn() + config.disable_onednn() config.set_cpu_math_library_num_threads(10) config.switch_use_feed_fetch_ops(False) diff --git a/test/ir/inference/auto_scan_test.py b/test/ir/inference/auto_scan_test.py index 0ceb053c50d5d9..5ae8ed1fb44ab1 100755 --- a/test/ir/inference/auto_scan_test.py +++ b/test/ir/inference/auto_scan_test.py @@ -252,6 +252,7 @@ def create_inference_config( passes: list[str] | None = None, use_gpu: bool = False, use_mkldnn: bool = False, + use_onednn: bool = False, use_xpu: bool = False, ir_optim: bool | None = None, ): @@ -263,8 +264,10 @@ def create_inference_config( config.switch_ir_optim(ir_optim) if use_gpu: config.enable_use_gpu(100, 0) - if not use_mkldnn: - config.disable_mkldnn() + if use_mkldnn: + use_onednn = True + if not use_onednn: + config.disable_onednn() if use_xpu: config.enable_xpu() if passes is not None: @@ -273,7 +276,7 @@ def create_inference_config( return config -class MkldnnAutoScanTest(AutoScanTest): +class OnednnAutoScanTest(AutoScanTest): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -370,14 +373,14 @@ def run_test(self, quant=False, *args, **kwargs): def inference_config_str(self, config) -> str: dic = {} - enable_mkldnn = config.mkldnn_enabled() - dic["use_mkldnn"] = enable_mkldnn + enable_onednn = config.onednn_enabled() + dic["use_onednn"] = enable_onednn enable_gpu = config.use_gpu() dic["use_gpu"] = enable_gpu return str(dic) -class PirMkldnnAutoScanTest(MkldnnAutoScanTest): +class PirOnednnAutoScanTest(OnednnAutoScanTest): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -616,8 +619,8 @@ def run_test(self, quant=False, prog_configs=None): def inference_config_str(self, config) -> str: dic = {} - enable_mkldnn = config.mkldnn_enabled() - dic["use_mkldnn"] = enable_mkldnn + enable_onednn = config.onednn_enabled() + dic["use_onednn"] = enable_onednn enable_gpu = config.use_gpu() dic['use_gpu'] = enable_gpu enable_xpu = config.use_xpu() diff --git a/test/ir/inference/inference_pass_test.py b/test/ir/inference/inference_pass_test.py index 88c6debf574140..34bdfb4d2c16c5 100644 --- a/test/ir/inference/inference_pass_test.py +++ b/test/ir/inference/inference_pass_test.py @@ -38,7 +38,7 @@ def __init__(self, methodName='runTest'): self.fetch_list = None self.enable_mkldnn = False - self.enable_mkldnn_bfloat16 = False + self.enable_onednn_bfloat16 = False self.enable_trt = False self.enable_tensorrt_varseqlen = False self.trt_parameters = None @@ -144,7 +144,7 @@ def _get_analysis_config( self.path + ".pdmodel", self.path + ".pdiparams" ) config.disable_gpu() - config.disable_mkldnn() + config.disable_onednn() config.switch_specify_input_names(True) config.switch_ir_optim(True) config.switch_use_feed_fetch_ops(False) @@ -179,9 +179,9 @@ def _get_analysis_config( config.enable_tensorrt_varseqlen() elif use_mkldnn: - config.enable_mkldnn() - if self.enable_mkldnn_bfloat16: - config.enable_mkldnn_bfloat16() + config.enable_onednn() + if self.enable_onednn_bfloat16: + config.enable_onednn_bfloat16() print('config summary:', config.summary()) return config @@ -287,23 +287,23 @@ def check_output_with_option( # Check whether the onednn results and the CPU results are the same. 
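
The create_inference_config helper in auto_scan_test.py keeps the old use_mkldnn keyword working by folding it into the new use_onednn flag, so oneDNN is only disabled when neither flag is set. A condensed sketch of that shim, mirroring the logic of the added lines:

    import paddle.inference as paddle_infer

    def create_inference_config(use_gpu=False, use_mkldnn=False, use_onednn=False):
        config = paddle_infer.Config()
        if use_gpu:
            config.enable_use_gpu(100, 0)
        if use_mkldnn:        # legacy callers may still pass use_mkldnn=True
            use_onednn = True
        if not use_onednn:    # oneDNN stays enabled unless explicitly disabled
            config.disable_onednn()
        return config
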
if (not use_gpu) and self.enable_mkldnn: - mkldnn_outputs = self._get_inference_outs( + onednn_outputs = self._get_inference_outs( self._get_analysis_config( use_gpu=use_gpu, use_mkldnn=self.enable_mkldnn ) ) self.assertTrue( - len(paddle_outs) == len(mkldnn_outputs), + len(paddle_outs) == len(onednn_outputs), "The number of outputs is different between CPU and MKLDNN. ", ) - if self.enable_mkldnn_bfloat16: + if self.enable_onednn_bfloat16: atol = 0.01 - for paddle_out, mkldnn_output in zip(paddle_outs, mkldnn_outputs): + for paddle_out, onednn_output in zip(paddle_outs, onednn_outputs): np.testing.assert_allclose( np.array(paddle_out), - mkldnn_output, + onednn_output, rtol=1e-05, atol=atol, err_msg='Output has diff between CPU and MKLDNN. ', diff --git a/test/ir/inference/quant_dequant_test.py b/test/ir/inference/quant_dequant_test.py index fff8f988178d26..f955273a88667f 100644 --- a/test/ir/inference/quant_dequant_test.py +++ b/test/ir/inference/quant_dequant_test.py @@ -47,7 +47,7 @@ def __init__(self, methodName='runTest'): self.feeds = None self.fetch_list = None self.enable_mkldnn = False - self.enable_mkldnn_bfloat16 = False + self.enable_onednn_bfloat16 = False self.enable_trt = False self.enable_tensorrt_varseqlen = True self.trt_parameters = None @@ -204,7 +204,7 @@ def _get_analysis_config( self.path + ".pdmodel", self.path + ".pdiparams" ) config.disable_gpu() - config.disable_mkldnn() + config.disable_onednn() config.switch_specify_input_names(True) config.switch_ir_optim(True) config.switch_use_feed_fetch_ops(False) @@ -231,9 +231,9 @@ def _get_analysis_config( config.enable_tensorrt_varseqlen() elif use_mkldnn: - config.enable_mkldnn() - if self.enable_mkldnn_bfloat16: - config.enable_mkldnn_bfloat16() + config.enable_onednn() + if self.enable_onednn_bfloat16: + config.enable_onednn_bfloat16() print('config summary:', config.summary()) return config @@ -389,23 +389,23 @@ def check_output_with_option( # Check whether the onednn results and the CPU results are the same. if (not use_gpu) and self.enable_mkldnn: - mkldnn_outputs = self._get_inference_outs( + onednn_outputs = self._get_inference_outs( self._get_analysis_config( use_gpu=use_gpu, use_mkldnn=self.enable_mkldnn ) ) self.assertTrue( - len(paddle_outs) == len(mkldnn_outputs), + len(paddle_outs) == len(onednn_outputs), "The number of outputs is different between CPU and MKLDNN. ", ) - if self.enable_mkldnn_bfloat16: + if self.enable_onednn_bfloat16: atol = 0.01 - for paddle_out, mkldnn_output in zip(paddle_outs, mkldnn_outputs): + for paddle_out, onednn_output in zip(paddle_outs, onednn_outputs): np.testing.assert_allclose( np.array(paddle_out), - mkldnn_output, + onednn_output, rtol=1e-05, atol=atol, err_msg='Output has diff between CPU and MKLDNN. 
', diff --git a/test/ir/inference/test_conv_act_onednn_fuse_pass.py b/test/ir/inference/test_conv_act_onednn_fuse_pass.py index 1106e672df270b..8392b19875abfa 100755 --- a/test/ir/inference/test_conv_act_onednn_fuse_pass.py +++ b/test/ir/inference/test_conv_act_onednn_fuse_pass.py @@ -21,7 +21,7 @@ class TestConvActOneDNNFusePass(PassAutoScanTest): def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_gpu=False, use_mkldnn=True) + config = self.create_inference_config(use_gpu=False, use_onednn=True) yield config, ['fused_conv2d'], (1e-4, 1e-5) def is_program_valid(self, prog_config): diff --git a/test/ir/inference/test_conv_bn_fuse_pass.py b/test/ir/inference/test_conv_bn_fuse_pass.py index 2483012d47197a..9cfd09d53ca9e7 100644 --- a/test/ir/inference/test_conv_bn_fuse_pass.py +++ b/test/ir/inference/test_conv_bn_fuse_pass.py @@ -60,7 +60,7 @@ def sample_program_config(self, draw): st.integers(min_value=1, max_value=2), min_size=2, max_size=2 ) ) - use_mkldnn = draw(st.booleans()) + use_onednn = draw(st.booleans()) epsilon = draw(st.floats(min_value=0.0, max_value=0.001)) x_shape = ( @@ -108,7 +108,7 @@ def generate_bn_Var(): groups=groups, paddings=paddings, strides=strides, - use_mkldnn=use_mkldnn, + use_mkldnn=use_onednn, has_bias=False, is_test=True, ) @@ -159,7 +159,7 @@ def generate_bn_Var(): def sample_predictor_configs(self, program_config): # for onednn if program_config.ops[0].attrs['use_mkldnn']: - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, ['fused_conv2d'], (1e-5, 1e-5) else: config = self.create_inference_config() @@ -183,7 +183,7 @@ def add_ignore_pass_case(self): def teller1(program_config, predictor_config): if ( program_config.ops[0].attrs['data_format'] == "NHWC" - and not predictor_config.mkldnn_enabled() + and not predictor_config.onednn_enabled() ): return True return False diff --git a/test/ir/inference/test_conv_eltwiseadd_bn_fuse_pass.py b/test/ir/inference/test_conv_eltwiseadd_bn_fuse_pass.py index e7c6d6395606c8..ec013b5b89719a 100755 --- a/test/ir/inference/test_conv_eltwiseadd_bn_fuse_pass.py +++ b/test/ir/inference/test_conv_eltwiseadd_bn_fuse_pass.py @@ -47,7 +47,7 @@ def sample_predictor_configs(self, program_config): # MKLDNN config = self.create_inference_config(use_gpu=False) - config.enable_mkldnn() + config.enable_onednn() yield config, ["conv2d", "elementwise_add"], (1e-4, 1e-5) # for gpu diff --git a/test/ir/inference/test_conv_transpose_bn_fuse_pass.py b/test/ir/inference/test_conv_transpose_bn_fuse_pass.py index d623feffcf4aa0..31e9bc98973814 100644 --- a/test/ir/inference/test_conv_transpose_bn_fuse_pass.py +++ b/test/ir/inference/test_conv_transpose_bn_fuse_pass.py @@ -195,7 +195,7 @@ def generate_batch_norm_Variance(): def sample_predictor_configs(self, program_config): # for onednn if program_config.ops[0].attrs['use_mkldnn']: - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, ['conv2d_transpose_bias'], (1e-5, 1e-5) # for cpu else: diff --git a/test/ir/inference/test_conv_transpose_eltwiseadd_bn_fuse_pass.py b/test/ir/inference/test_conv_transpose_eltwiseadd_bn_fuse_pass.py index ca6506c8938936..50b19a7ffba3a4 100644 --- a/test/ir/inference/test_conv_transpose_eltwiseadd_bn_fuse_pass.py +++ b/test/ir/inference/test_conv_transpose_eltwiseadd_bn_fuse_pass.py @@ -221,7 +221,7 @@ def generate_batch_norm_Variance(): def sample_predictor_configs(self, 
program_config): # for onednn if program_config.ops[2].attrs['use_mkldnn']: - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, ['conv2d_transpose', 'elementwise_add'], (1e-5, 1e-5) # cpu else: diff --git a/test/ir/inference/test_matmul_scale_fuse_pass.py b/test/ir/inference/test_matmul_scale_fuse_pass.py index 67728e12a30250..92820db32fc182 100644 --- a/test/ir/inference/test_matmul_scale_fuse_pass.py +++ b/test/ir/inference/test_matmul_scale_fuse_pass.py @@ -36,7 +36,7 @@ def sample_predictor_configs(self, program_config): ], (1e-5, 1e-5) # onednn - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, [ "matmul", ], (1e-5, 1e-5) diff --git a/test/ir/inference/test_matmul_v2_scale_fuse_pass.py b/test/ir/inference/test_matmul_v2_scale_fuse_pass.py index 65a456f3a0a841..4eafcbb3d8b16e 100644 --- a/test/ir/inference/test_matmul_v2_scale_fuse_pass.py +++ b/test/ir/inference/test_matmul_v2_scale_fuse_pass.py @@ -36,7 +36,7 @@ def sample_predictor_configs(self, program_config): # yield config, ["matmul_v2", ], (1e-5, 1e-5) # onednn - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, [ "matmul_v2", ], (1e-5, 1e-5) diff --git a/test/ir/inference/test_mkldnn_conv3d_bias_fuse_pass.py b/test/ir/inference/test_mkldnn_conv3d_bias_fuse_pass.py index 98be6e451d08c7..91885e03032987 100644 --- a/test/ir/inference/test_mkldnn_conv3d_bias_fuse_pass.py +++ b/test/ir/inference/test_mkldnn_conv3d_bias_fuse_pass.py @@ -116,7 +116,7 @@ def generate_weight2(): return program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, ["conv3d"], (1e-5, 1e-5) # TODO(baoachun) diff --git a/test/ir/inference/test_mkldnn_conv3d_op.py b/test/ir/inference/test_mkldnn_conv3d_op.py index 7a258626db7e8b..e6593042d8f55f 100644 --- a/test/ir/inference/test_mkldnn_conv3d_op.py +++ b/test/ir/inference/test_mkldnn_conv3d_op.py @@ -17,12 +17,12 @@ import hypothesis.strategies as st import numpy as np -from auto_scan_test import MkldnnAutoScanTest, PirMkldnnAutoScanTest +from auto_scan_test import OnednnAutoScanTest, PirOnednnAutoScanTest from hypothesis import given from program_config import OpConfig, ProgramConfig, TensorConfig -class TestMkldnnConv3dOp(MkldnnAutoScanTest): +class TestMkldnnConv3dOp(OnednnAutoScanTest): def is_program_valid(self, program_config: ProgramConfig) -> bool: return True @@ -75,7 +75,7 @@ def generate_weight(*args, **kwargs): yield program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, (1e-5, 1e-5) @given( @@ -91,7 +91,7 @@ def test(self, *args, **kwargs): self.run_test(*args, **kwargs) -class TestPirOneDNNPad3DOp(PirMkldnnAutoScanTest): +class TestPirOneDNNPad3DOp(PirOnednnAutoScanTest): def is_program_valid(self, program_config: ProgramConfig) -> bool: return True @@ -145,7 +145,7 @@ def generate_weight(*args, **kwargs): yield program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, (1e-5, 1e-5) @given( diff --git a/test/ir/inference/test_mkldnn_conv_affine_channel_fuse_pass.py 
b/test/ir/inference/test_mkldnn_conv_affine_channel_fuse_pass.py index fd60d7b65193a9..c277e19b3d4f20 100644 --- a/test/ir/inference/test_mkldnn_conv_affine_channel_fuse_pass.py +++ b/test/ir/inference/test_mkldnn_conv_affine_channel_fuse_pass.py @@ -130,7 +130,7 @@ def generate_scale_bias(): return program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, ['conv2d', 'elementwise_add'], (1e-4, 1e-4) def add_ignore_pass_case(self): @@ -144,7 +144,7 @@ def teller1(program_config, predictor_config): # onednn Output has diff with bias! def teller2(program_config, predictor_config): return ( - predictor_config.mkldnn_enabled() + predictor_config.onednn_enabled() and program_config.ops[0].attrs['has_bias'] ) diff --git a/test/ir/inference/test_mkldnn_conv_gelu_fuse_pass.py b/test/ir/inference/test_mkldnn_conv_gelu_fuse_pass.py index 196e1f5909fe9c..15ad02a8fb3783 100644 --- a/test/ir/inference/test_mkldnn_conv_gelu_fuse_pass.py +++ b/test/ir/inference/test_mkldnn_conv_gelu_fuse_pass.py @@ -95,7 +95,7 @@ def generate_weight(): return program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, ["fused_conv2d"], (1e-5, 1e-5) def test(self): diff --git a/test/ir/inference/test_mkldnn_conv_hard_sigmoid_fuse_pass.py b/test/ir/inference/test_mkldnn_conv_hard_sigmoid_fuse_pass.py index c13888adf1a95a..1381df923ed843 100644 --- a/test/ir/inference/test_mkldnn_conv_hard_sigmoid_fuse_pass.py +++ b/test/ir/inference/test_mkldnn_conv_hard_sigmoid_fuse_pass.py @@ -92,7 +92,7 @@ def generate_weight(): return program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, ["fused_conv2d"], (1e-5, 1e-5) def test(self): diff --git a/test/ir/inference/test_mkldnn_conv_hard_swish_fuse_pass.py b/test/ir/inference/test_mkldnn_conv_hard_swish_fuse_pass.py index f0e1d8c74179b2..cf9355a9ac8d05 100644 --- a/test/ir/inference/test_mkldnn_conv_hard_swish_fuse_pass.py +++ b/test/ir/inference/test_mkldnn_conv_hard_swish_fuse_pass.py @@ -97,7 +97,7 @@ def generate_weight(): return program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, ["fused_conv2d"], (1e-5, 1e-5) def test(self): diff --git a/test/ir/inference/test_mkldnn_conv_mish_fuse_pass.py b/test/ir/inference/test_mkldnn_conv_mish_fuse_pass.py index cc0ccb809a1f14..1ef842da9d0cf8 100644 --- a/test/ir/inference/test_mkldnn_conv_mish_fuse_pass.py +++ b/test/ir/inference/test_mkldnn_conv_mish_fuse_pass.py @@ -96,7 +96,7 @@ def generate_weight(): return program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, ["fused_conv2d"], (1e-5, 1e-5) def test(self): diff --git a/test/ir/inference/test_mkldnn_conv_transpose_bias_fuse_pass.py b/test/ir/inference/test_mkldnn_conv_transpose_bias_fuse_pass.py index 9e3c9efd9b2a2c..d6b4f70ff27a96 100644 --- a/test/ir/inference/test_mkldnn_conv_transpose_bias_fuse_pass.py +++ b/test/ir/inference/test_mkldnn_conv_transpose_bias_fuse_pass.py @@ -105,7 +105,7 @@ def 
generate_weight2(): return program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, ['conv2d_transpose_bias'], (1e-5, 1e-5) def test(self): diff --git a/test/ir/inference/test_mkldnn_depthwise_conv_pass.py b/test/ir/inference/test_mkldnn_depthwise_conv_pass.py index b2b02a52014ae7..108ea3385d823b 100644 --- a/test/ir/inference/test_mkldnn_depthwise_conv_pass.py +++ b/test/ir/inference/test_mkldnn_depthwise_conv_pass.py @@ -122,7 +122,7 @@ def generate_conv2d_Filter(): def sample_predictor_configs(self, program_config): # for onednn - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, ['conv2d'], (1e-5, 1e-5) def is_program_valid(self, program_config: ProgramConfig) -> bool: diff --git a/test/ir/inference/test_mkldnn_log_softmax_op.py b/test/ir/inference/test_mkldnn_log_softmax_op.py index e9c028515b0001..be911541394042 100644 --- a/test/ir/inference/test_mkldnn_log_softmax_op.py +++ b/test/ir/inference/test_mkldnn_log_softmax_op.py @@ -17,12 +17,12 @@ import hypothesis.strategies as st import numpy as np -from auto_scan_test import MkldnnAutoScanTest +from auto_scan_test import OnednnAutoScanTest from hypothesis import given from program_config import OpConfig, ProgramConfig, TensorConfig -class TestMKLDNNLogSoftmaxOp(MkldnnAutoScanTest): +class TestMKLDNNLogSoftmaxOp(OnednnAutoScanTest): def is_program_valid(self, program_config: ProgramConfig) -> bool: return True @@ -51,7 +51,7 @@ def generate_input(*args, **kwargs): yield program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, (1e-5, 1e-5) @given( diff --git a/test/ir/inference/test_mkldnn_matmul_v2_elementwise_add_fuse_pass.py b/test/ir/inference/test_mkldnn_matmul_v2_elementwise_add_fuse_pass.py index 16d7ae7baf5164..d6be1efaa34353 100644 --- a/test/ir/inference/test_mkldnn_matmul_v2_elementwise_add_fuse_pass.py +++ b/test/ir/inference/test_mkldnn_matmul_v2_elementwise_add_fuse_pass.py @@ -86,7 +86,7 @@ def generate_input(type): return program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, ['fused_matmul'], (1e-5, 1e-5) def test(self): diff --git a/test/ir/inference/test_mkldnn_matmul_v2_transpose_reshape_fuse_pass.py b/test/ir/inference/test_mkldnn_matmul_v2_transpose_reshape_fuse_pass.py index ef4865b4d782a8..45c697117e0c90 100644 --- a/test/ir/inference/test_mkldnn_matmul_v2_transpose_reshape_fuse_pass.py +++ b/test/ir/inference/test_mkldnn_matmul_v2_transpose_reshape_fuse_pass.py @@ -129,7 +129,7 @@ def generate_input(type): return program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, ['fused_matmul'], (1e-5, 1e-5) def test(self): diff --git a/test/ir/inference/test_mkldnn_matmulv2_op.py b/test/ir/inference/test_mkldnn_matmulv2_op.py index 9a72e806b32268..2c5698d6567584 100644 --- a/test/ir/inference/test_mkldnn_matmulv2_op.py +++ b/test/ir/inference/test_mkldnn_matmulv2_op.py @@ -17,12 +17,12 @@ import hypothesis.strategies as st import numpy as np -from auto_scan_test import MkldnnAutoScanTest +from 
auto_scan_test import OnednnAutoScanTest from hypothesis import given from program_config import OpConfig, ProgramConfig, TensorConfig -class TestMkldnnMatmulv2Op(MkldnnAutoScanTest): +class TestMkldnnMatmulv2Op(OnednnAutoScanTest): def is_program_valid(self, program_config: ProgramConfig) -> bool: if len(program_config.inputs["input_data2"].shape) == 4: if ( @@ -113,7 +113,7 @@ def generate_input(type, *args, **kwargs): yield program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, (1e-5, 1e-5) @given( diff --git a/test/ir/inference/test_mkldnn_mish_op.py b/test/ir/inference/test_mkldnn_mish_op.py index c3e4bccf6ec68c..abf580836237a5 100644 --- a/test/ir/inference/test_mkldnn_mish_op.py +++ b/test/ir/inference/test_mkldnn_mish_op.py @@ -17,12 +17,12 @@ import hypothesis.strategies as st import numpy as np -from auto_scan_test import MkldnnAutoScanTest +from auto_scan_test import OnednnAutoScanTest from hypothesis import given from program_config import OpConfig, ProgramConfig, TensorConfig -class TestMkldnnMishOp(MkldnnAutoScanTest): +class TestMkldnnMishOp(OnednnAutoScanTest): def is_program_valid(self, program_config: ProgramConfig) -> bool: # if mode is channel, and in_shape is 1 rank if ( @@ -60,7 +60,7 @@ def generate_input(*args, **kwargs): yield program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, (1e-5, 1e-5) @given( diff --git a/test/ir/inference/test_mkldnn_pad3d_op.py b/test/ir/inference/test_mkldnn_pad3d_op.py index f8bd247dfa64d9..eb411b82118ec0 100644 --- a/test/ir/inference/test_mkldnn_pad3d_op.py +++ b/test/ir/inference/test_mkldnn_pad3d_op.py @@ -17,7 +17,7 @@ import hypothesis.strategies as st import numpy as np -from auto_scan_test import MkldnnAutoScanTest, PirMkldnnAutoScanTest +from auto_scan_test import OnednnAutoScanTest, PirOnednnAutoScanTest from hypothesis import given from program_config import ( OpConfig, @@ -26,7 +26,7 @@ ) -class TestOneDNNPad3DOp(MkldnnAutoScanTest): +class TestOneDNNPad3DOp(OnednnAutoScanTest): def sample_program_configs(self, *args, **kwargs): def generate_input(*args, **kwargs): return np.random.random(kwargs['in_shape']).astype(np.float32) @@ -60,7 +60,7 @@ def generate_paddings(): yield program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, (1e-5, 1e-5) @given( @@ -82,7 +82,7 @@ def test(self, *args, **kwargs): self.run_test(quant=False, *args, **kwargs) -class TestPirOneDNNPad3DOp(PirMkldnnAutoScanTest): +class TestPirOneDNNPad3DOp(PirOnednnAutoScanTest): def sample_program_configs(self, *args, **kwargs): def generate_input(*args, **kwargs): return np.random.random(kwargs['in_shape']).astype(np.float32) @@ -117,7 +117,7 @@ def generate_paddings(): yield program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, (1e-5, 1e-5) @given( diff --git a/test/ir/inference/test_mkldnn_prelu_op.py b/test/ir/inference/test_mkldnn_prelu_op.py index cab24fb22178da..c6f8b5b6ac2653 100644 --- a/test/ir/inference/test_mkldnn_prelu_op.py +++ b/test/ir/inference/test_mkldnn_prelu_op.py @@ -17,12 +17,12 @@ 
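
The same base-class rename repeats across these files: MkldnnAutoScanTest becomes OnednnAutoScanTest and PirMkldnnAutoScanTest becomes PirOnednnAutoScanTest. Out-of-tree tests that still import the old names would fail at import time; if a deprecation window were wanted, simple aliases could be left behind in auto_scan_test.py. This is a hypothetical sketch, not part of this patch:

    # hypothetical compatibility aliases (not in this patch):
    MkldnnAutoScanTest = OnednnAutoScanTest
    PirMkldnnAutoScanTest = PirOnednnAutoScanTest
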
import hypothesis.strategies as st import numpy as np -from auto_scan_test import MkldnnAutoScanTest +from auto_scan_test import OnednnAutoScanTest from hypothesis import given from program_config import OpConfig, ProgramConfig, TensorConfig -class TestMkldnnPreluOp(MkldnnAutoScanTest): +class TestMkldnnPreluOp(OnednnAutoScanTest): def is_program_valid(self, program_config: ProgramConfig) -> bool: # if mode is channel, and in_shape is 1 rank if ( @@ -85,7 +85,7 @@ def generate_alpha(*args, **kwargs): yield program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, (1e-5, 1e-5) def add_skip_pass_case(self): diff --git a/test/ir/inference/test_mkldnn_shape_op.py b/test/ir/inference/test_mkldnn_shape_op.py index 69e18e08d32a5d..31603b81d4d49a 100644 --- a/test/ir/inference/test_mkldnn_shape_op.py +++ b/test/ir/inference/test_mkldnn_shape_op.py @@ -17,12 +17,12 @@ import hypothesis.strategies as st import numpy as np -from auto_scan_test import MkldnnAutoScanTest +from auto_scan_test import OnednnAutoScanTest from hypothesis import given from program_config import OpConfig, ProgramConfig, TensorConfig -class TestMkldnnShapeOp(MkldnnAutoScanTest): +class TestMkldnnShapeOp(OnednnAutoScanTest): def is_program_valid(self, program_config: ProgramConfig) -> bool: return True @@ -52,7 +52,7 @@ def generate_input(*args, **kwargs): yield program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, (1e-5, 1e-5) @given( diff --git a/test/ir/inference/test_mkldnn_shuffle_channel_detect_pass.py b/test/ir/inference/test_mkldnn_shuffle_channel_detect_pass.py index 8ebcfaf1041b6b..1a9ae3d8f64177 100644 --- a/test/ir/inference/test_mkldnn_shuffle_channel_detect_pass.py +++ b/test/ir/inference/test_mkldnn_shuffle_channel_detect_pass.py @@ -130,7 +130,7 @@ def generate_input(): return program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, ["shuffle_channel"], (1e-5, 1e-5) def test(self): diff --git a/test/ir/inference/test_mkldnn_shuffle_channel_op.py b/test/ir/inference/test_mkldnn_shuffle_channel_op.py index 64843f08156c65..d5b61dcc962ce3 100644 --- a/test/ir/inference/test_mkldnn_shuffle_channel_op.py +++ b/test/ir/inference/test_mkldnn_shuffle_channel_op.py @@ -17,12 +17,12 @@ import hypothesis.strategies as st import numpy as np -from auto_scan_test import MkldnnAutoScanTest +from auto_scan_test import OnednnAutoScanTest from hypothesis import given from program_config import OpConfig, ProgramConfig, TensorConfig -class TestMKLDNNShuffleChannelOp(MkldnnAutoScanTest): +class TestMKLDNNShuffleChannelOp(OnednnAutoScanTest): def is_program_valid(self, program_config: ProgramConfig) -> bool: return True @@ -51,7 +51,7 @@ def generate_input(*args, **kwargs): yield program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, (1e-5, 1e-5) @given( diff --git a/test/ir/inference/test_onednn_batch_norm_act_fuse_pass.py b/test/ir/inference/test_onednn_batch_norm_act_fuse_pass.py index 84fefa24230fcd..5c7c091ca4f445 100644 --- a/test/ir/inference/test_onednn_batch_norm_act_fuse_pass.py 
+++ b/test/ir/inference/test_onednn_batch_norm_act_fuse_pass.py @@ -108,7 +108,7 @@ def generate_weight(): return program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, ['batch_norm'], (1e-5, 1e-5) def test(self): diff --git a/test/ir/inference/test_onednn_conv_bias_fuse_pass.py b/test/ir/inference/test_onednn_conv_bias_fuse_pass.py index 565b4f92446cac..ac82c4997da3af 100644 --- a/test/ir/inference/test_onednn_conv_bias_fuse_pass.py +++ b/test/ir/inference/test_onednn_conv_bias_fuse_pass.py @@ -21,7 +21,7 @@ class TestConvBiasOneDNNFusePass(PassAutoScanTest): def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_gpu=False, use_mkldnn=True) + config = self.create_inference_config(use_gpu=False, use_onednn=True) yield config, ['fused_conv2d'], (1e-4, 1e-5) def is_program_valid(self, prog_config): diff --git a/test/ir/inference/test_onednn_conv_bn_fuse_pass.py b/test/ir/inference/test_onednn_conv_bn_fuse_pass.py index 21c154615cdee3..da95b32fcda80b 100644 --- a/test/ir/inference/test_onednn_conv_bn_fuse_pass.py +++ b/test/ir/inference/test_onednn_conv_bn_fuse_pass.py @@ -136,7 +136,7 @@ def generate_data(shape): return program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, ['fused_conv2d'], (1e-5, 1e-5) def test(self): diff --git a/test/ir/inference/test_onednn_conv_concat_activation_fuse_pass.py b/test/ir/inference/test_onednn_conv_concat_activation_fuse_pass.py index f45190a5084f24..06b383f8aa2716 100644 --- a/test/ir/inference/test_onednn_conv_concat_activation_fuse_pass.py +++ b/test/ir/inference/test_onednn_conv_concat_activation_fuse_pass.py @@ -156,7 +156,7 @@ def generate_data(input_type): return program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, ['fused_conv2d', 'fused_conv2d', 'concat'], (1e-5, 1e-5) def test(self): diff --git a/test/ir/inference/test_onednn_conv_elementwise_add_fuse_pass.py b/test/ir/inference/test_onednn_conv_elementwise_add_fuse_pass.py index fe51b2d0e38924..acce128f2fd3e9 100644 --- a/test/ir/inference/test_onednn_conv_elementwise_add_fuse_pass.py +++ b/test/ir/inference/test_onednn_conv_elementwise_add_fuse_pass.py @@ -116,7 +116,7 @@ def generate_weight(): return program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, ['relu', 'conv2d', 'fused_conv2d'], (1e-5, 1e-5) def test(self): diff --git a/test/ir/inference/test_onednn_matmul_transpose_reshape_fuse_pass.py b/test/ir/inference/test_onednn_matmul_transpose_reshape_fuse_pass.py index 99c731a1d9dfb7..a7861b1ef7a7e1 100644 --- a/test/ir/inference/test_onednn_matmul_transpose_reshape_fuse_pass.py +++ b/test/ir/inference/test_onednn_matmul_transpose_reshape_fuse_pass.py @@ -109,7 +109,7 @@ def generate_input(type): return program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, ['fused_matmul'], (1e-5, 1e-5) def test(self): diff --git 
a/test/ir/inference/test_onednn_reshape_transpose_matmul_fuse_pass.py b/test/ir/inference/test_onednn_reshape_transpose_matmul_fuse_pass.py index c60a3071010126..70337fc48b9963 100644 --- a/test/ir/inference/test_onednn_reshape_transpose_matmul_fuse_pass.py +++ b/test/ir/inference/test_onednn_reshape_transpose_matmul_fuse_pass.py @@ -145,7 +145,7 @@ def generate_input2(attrs): return program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config(use_onednn=True) yield config, ['fused_matmul'], (1e-5, 1e-5) def test(self): diff --git a/test/legacy_test/op_test.py b/test/legacy_test/op_test.py index 3902f8c6b98a75..e6eca9654f330e 100644 --- a/test/legacy_test/op_test.py +++ b/test/legacy_test/op_test.py @@ -389,29 +389,38 @@ def convert_uint16_to_float(in_list): return np.reshape(out, in_list.shape) -def get_places(string_format=False): +def get_places(): places = [] - if not string_format: - if ( - os.environ.get('FLAGS_CI_both_cpu_and_gpu', 'False').lower() - in ['1', 'true', 'on'] - or not core.is_compiled_with_cuda() - ): - places.append(base.CPUPlace()) - if core.is_compiled_with_cuda(): - places.append(base.CUDAPlace(0)) - else: - if ( - os.environ.get('FLAGS_CI_both_cpu_and_gpu', 'False').lower() - in ['1', 'true', 'on'] - or not paddle.is_compiled_with_cuda() - ): - places.append('cpu') - if paddle.is_compiled_with_cuda(): - places.append('gpu') + if ( + os.environ.get('FLAGS_CI_both_cpu_and_gpu', 'False').lower() + in ['1', 'true', 'on'] + or not core.is_compiled_with_cuda() + ): + places.append(base.CPUPlace()) + if core.is_compiled_with_cuda(): + places.append(base.CUDAPlace(0)) + if is_custom_device(): + dev_type = paddle.device.get_all_custom_device_type()[0] + places.append(base.CustomPlace(dev_type, 0)) return places +def get_devices(): + devices = [] + if ( + os.environ.get('FLAGS_CI_both_cpu_and_gpu', 'False').lower() + in ['1', 'true', 'on'] + or not paddle.is_compiled_with_cuda() + ): + devices.append('cpu') + if paddle.is_compiled_with_cuda(): + devices.append('gpu') + if is_custom_device(): + dev_type = paddle.device.get_all_custom_device_type()[0] + devices.append(f'{dev_type}:0') + return devices + + def get_device_place(): if core.is_compiled_with_cuda(): return base.CUDAPlace(0) @@ -423,6 +432,15 @@ def get_device_place(): return base.CPUPlace() +def is_custom_device(): + custom_dev_types = paddle.device.get_all_custom_device_type() + if custom_dev_types and paddle.device.is_compiled_with_custom_device( + custom_dev_types[0] + ): + return True + return False + + @contextmanager def auto_parallel_test_guard(test_info_path, generated_test_file_path): test_info_file, generated_test_file = None, None @@ -2902,6 +2920,13 @@ def _get_places(self): return [place] else: return [] + elif is_custom_device(): + dev_type = paddle.device.get_all_custom_device_type()[0] + place = core.CustomPlace(dev_type, 0) + if core.is_float16_supported(place): + return [place] + else: + return [] else: return [] places = [] @@ -2931,6 +2956,9 @@ def _get_places(self): and not cpu_only ): places.append(core.CUDAPlace(0)) + if is_custom_device(): + dev_type = paddle.device.get_all_custom_device_type()[0] + places.append(core.CustomPlace(dev_type, 0)) return places def check_output( diff --git a/test/legacy_test/test_activation_op.py b/test/legacy_test/test_activation_op.py index a03b55c29008ea..a40ce6f718094d 100644 --- a/test/legacy_test/test_activation_op.py +++ 
b/test/legacy_test/test_activation_op.py @@ -23,6 +23,7 @@ convert_float_to_uint16, get_device_place, get_places, + is_custom_device, ) from scipy.special import erf, expit from utils import static_guard @@ -497,7 +498,8 @@ def init_shape(self): @unittest.skipIf( - not core.is_compiled_with_cuda() or core.is_compiled_with_rocm(), + not (core.is_compiled_with_cuda() or is_custom_device()) + or core.is_compiled_with_rocm(), "core is not compiled with CUDA", ) class TestSigmoidBF16(OpTest): @@ -1765,7 +1767,8 @@ def init_dtype(self): @unittest.skipIf( - not core.is_compiled_with_cuda() or core.is_compiled_with_rocm(), + not (core.is_compiled_with_cuda() or is_custom_device()) + or core.is_compiled_with_rocm(), "core is not compiled with CUDA", ) class TestSqrtBF16(OpTest): @@ -2037,7 +2040,7 @@ def setUp(self): self.inputs = {'X': OpTest.np_dtype_to_base_dtype(x)} self.outputs = {'Out': out} self.convert_input_output() - if not core.is_compiled_with_cuda(): + if not (core.is_compiled_with_cuda() or is_custom_device()): self.__class__.no_need_check_grad = True def init_shape(self): @@ -2091,7 +2094,7 @@ def setUp(self): self.inputs = {'X': OpTest.np_dtype_to_base_dtype(x)} self.outputs = {'Out': out} self.convert_input_output() - if not core.is_compiled_with_cuda(): + if not (core.is_compiled_with_cuda() or is_custom_device()): self.__class__.no_need_check_grad = True def init_shape(self): @@ -4563,7 +4566,8 @@ def init_shape(self): @unittest.skipIf( - not core.is_compiled_with_cuda() or core.is_compiled_with_rocm(), + not (core.is_compiled_with_cuda() or is_custom_device()) + or core.is_compiled_with_rocm(), "core is not compiled with CUDA", ) class TestSquareBF16(OpTest): @@ -4917,7 +4921,8 @@ def init_shape(self): @unittest.skipIf( - not core.is_compiled_with_cuda() or core.is_compiled_with_rocm(), + not (core.is_compiled_with_cuda() or is_custom_device()) + or core.is_compiled_with_rocm(), "core is not compiled with CUDA", ) class TestSoftplusBF16(OpTest): @@ -5595,7 +5600,8 @@ def test_errors(self): # ------------------ Test Cudnn Activation---------------------- def create_test_act_cudnn_class(parent, atol=1e-3, grad_atol=1e-3): @unittest.skipIf( - not core.is_compiled_with_cuda(), "core is not compiled with CUDA" + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", ) class TestActCudnn(parent): def init_kernel_type(self): diff --git a/test/legacy_test/test_adadelta_op.py b/test/legacy_test/test_adadelta_op.py index 1650f246c25755..9dfa5d3e6380e1 100644 --- a/test/legacy_test/test_adadelta_op.py +++ b/test/legacy_test/test_adadelta_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import OpTest, get_device_place, get_places +from op_test import OpTest, get_device_place, get_devices import paddle from paddle import base @@ -294,7 +294,7 @@ def _test_adadelta_op_dygraph_place_amp(self, place, use_amp=False): paddle.enable_static() def test_main(self): - for place in get_places(string_format=True): + for place in get_devices(): use_amp_list = [True, False] for use_amp in use_amp_list: self._test_adadelta_op_dygraph_place_amp(place, use_amp) diff --git a/test/legacy_test/test_adagrad_op.py b/test/legacy_test/test_adagrad_op.py index 0b5d1fef458200..c5497d51f25bd7 100644 --- a/test/legacy_test/test_adagrad_op.py +++ b/test/legacy_test/test_adagrad_op.py @@ -17,7 +17,7 @@ import numpy as np from op import Operator -from op_test import OpTest, get_device_place, get_places +from op_test import OpTest, get_device_place, 
get_devices, get_places import paddle from paddle.base import core @@ -242,7 +242,7 @@ def _test_adagrad_op_dygraph_place_amp(self, place, use_amp=False): paddle.enable_static() def test_main(self): - for place in get_places(string_format=True): + for place in get_devices(): use_amp_list = [True, False] for use_amp in use_amp_list: self._test_adagrad_op_dygraph_place_amp(place, use_amp) diff --git a/test/legacy_test/test_adam_op.py b/test/legacy_test/test_adam_op.py index c2ebbea1653ad3..4875c0dda23c83 100644 --- a/test/legacy_test/test_adam_op.py +++ b/test/legacy_test/test_adam_op.py @@ -16,7 +16,7 @@ import numpy as np from op import Operator -from op_test import OpTest, get_places +from op_test import OpTest, get_devices, get_places import paddle from paddle import base @@ -1296,7 +1296,7 @@ def _adam_optimize_static( return out def _get_places(self): - return get_places(string_format=True) + return get_devices() def _check_with_place_amp(self, place, use_amp): # test dygraph mode diff --git a/test/legacy_test/test_adamax_op.py b/test/legacy_test/test_adamax_op.py index 8b3532794d0f28..5670e4b2751b71 100644 --- a/test/legacy_test/test_adamax_op.py +++ b/test/legacy_test/test_adamax_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import OpTest, get_device_place, get_places +from op_test import OpTest, get_device_place, get_devices import paddle @@ -275,7 +275,7 @@ def _test_adamax_op_dygraph_place_amp(self, place, use_amp=False): paddle.enable_static() def _get_places(self): - return get_places(string_format=True) + return get_devices() def test_main(self): for place in self._get_places(): diff --git a/test/legacy_test/test_adamw_op.py b/test/legacy_test/test_adamw_op.py index 904d87815427ec..1523468a75460d 100644 --- a/test/legacy_test/test_adamw_op.py +++ b/test/legacy_test/test_adamw_op.py @@ -18,7 +18,7 @@ from functools import partial import numpy as np -from op_test import OpTest, get_places +from op_test import OpTest, get_devices import paddle from paddle import base, nn @@ -758,7 +758,7 @@ def _test_adamw_op_dygraph_place_amp(self, place, use_amp=False): optimizer.clear_grad() def _get_places(self): - places = get_places(string_format=True) + places = get_devices() if paddle.is_compiled_with_xpu(): places.append('xpu') return places diff --git a/test/legacy_test/test_adaptive_log_softmax_with_loss.py b/test/legacy_test/test_adaptive_log_softmax_with_loss.py index 6210e1d469bda5..29728b8b25476d 100644 --- a/test/legacy_test/test_adaptive_log_softmax_with_loss.py +++ b/test/legacy_test/test_adaptive_log_softmax_with_loss.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import get_places +from op_test import get_devices, get_places import paddle import paddle.optimizer as optim @@ -58,7 +58,7 @@ def predict(self, input): class TestNNAdaptiveLogSoftmaxWithLossAPI(unittest.TestCase): def setUp(self): paddle.seed(2024) - self.place = get_places(string_format=True) + self.place = get_devices() self.log_np = np.random.randn(4, 8).astype('float32') self.predict_np = np.abs(np.random.randn(64, 8).astype('float32')) diff --git a/test/legacy_test/test_attribute_var.py b/test/legacy_test/test_attribute_var.py index 9ed3bffc0d9dad..9da566783ef4db 100644 --- a/test/legacy_test/test_attribute_var.py +++ b/test/legacy_test/test_attribute_var.py @@ -51,7 +51,7 @@ def infer_prog(self): config = paddle_infer.Config( self.save_path + '.pdmodel', self.save_path + '.pdiparams' ) - config.disable_mkldnn() + config.disable_onednn() predictor = 
paddle_infer.create_predictor(config) input_names = predictor.get_input_names() for i, shape in enumerate(self.shapes): diff --git a/test/legacy_test/test_blha_get_max_len_op.py b/test/legacy_test/test_blha_get_max_len_op.py index ab8b410a8c15ab..790e654dd4f1f6 100644 --- a/test/legacy_test/test_blha_get_max_len_op.py +++ b/test/legacy_test/test_blha_get_max_len_op.py @@ -15,6 +15,7 @@ import unittest import numpy as np +from op_test import is_custom_device import paddle from paddle.base import core @@ -109,7 +110,8 @@ def test_static_api(self): @unittest.skipIf( - not core.is_compiled_with_cuda() and not core.is_compiled_with_xpu(), + not (core.is_compiled_with_cuda() or is_custom_device()) + and not core.is_compiled_with_xpu(), "Only support XPU or GPU in CUDA mode.", ) class TestBlhaGetMaxLenOp_ZeroSize(unittest.TestCase): diff --git a/test/legacy_test/test_cartesian_prod.py b/test/legacy_test/test_cartesian_prod.py index 7246df017f8f7d..f7d0548a76527b 100644 --- a/test/legacy_test/test_cartesian_prod.py +++ b/test/legacy_test/test_cartesian_prod.py @@ -16,7 +16,7 @@ from itertools import product import numpy as np -from op_test import get_places +from op_test import get_devices import paddle from paddle.base import core @@ -36,7 +36,7 @@ def setUp(self): self.c_np = np.random.random(self.c_shape).astype(self.dtype_np) self.d_np = np.empty(0, self.dtype_np) - self.place = get_places(string_format=True) + self.place = get_devices() def init_setting(self): self.dtype_np = 'float32' @@ -119,7 +119,7 @@ def setUp(self): self.a_np = np.random.random(self.a_shape).astype(self.dtype_np) self.b_np = np.empty(0, self.dtype_np) - self.place = get_places(string_format=True) + self.place = get_devices() def init_setting(self): self.dtype_np = 'float32' diff --git a/test/legacy_test/test_cauchy_inplace.py b/test/legacy_test/test_cauchy_inplace.py index 4aa41ce0ca130e..ebe03c9acf8f57 100644 --- a/test/legacy_test/test_cauchy_inplace.py +++ b/test/legacy_test/test_cauchy_inplace.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import get_places +from op_test import get_devices import paddle @@ -35,7 +35,7 @@ def test_fp64(): tensor_fp64.cauchy_() self.assertEqual(tensor_fp64.dtype, paddle.float64) - for place in get_places(string_format=True): + for place in get_devices(): paddle.set_device(place) test_fp32() test_fp64() @@ -92,7 +92,7 @@ def test_cauchy_inplace_distribution(self): class TestCauchyInplaceEmptyTensor(unittest.TestCase): def test_cauchy_inplace_op_empty_tensor(self): test_shapes = [(200, 1), (1, 200)] - for place in get_places(string_format=True): + for place in get_devices(): paddle.set_device(place) for test_shape in test_shapes: tensor = paddle.empty(shape=test_shape) @@ -118,7 +118,7 @@ def test_grad(): cauchy_grad = tensor_b.grad.numpy() self.assertTrue((cauchy_grad == 0).all()) - for place in get_places(string_format=True): + for place in get_devices(): paddle.set_device(place) test_grad() diff --git a/test/legacy_test/test_class_center_sample_op.py b/test/legacy_test/test_class_center_sample_op.py index ad8a19acc15770..8302df224bb2de 100644 --- a/test/legacy_test/test_class_center_sample_op.py +++ b/test/legacy_test/test_class_center_sample_op.py @@ -15,10 +15,9 @@ import unittest import numpy as np -from op_test import OpTest, paddle_static_guard +from op_test import OpTest, get_places, paddle_static_guard import paddle -from paddle.base import core def class_center_sample_numpy(label, classes_list, num_samples): @@ -135,9 +134,7 @@ def setUp(self): 
self.initParams() np.random.seed(self.seed) paddle.framework.random._manual_program_seed(2021) - self.places = [paddle.base.CPUPlace()] - if core.is_compiled_with_cuda(): - self.places.append(paddle.base.CUDAPlace(0)) + self.places = get_places() def initParams(self): self.batch_size = 10 @@ -235,9 +232,7 @@ class TestClassCenterSampleAPIError(unittest.TestCase): def setUp(self): self.initParams() np.random.seed(self.seed) - self.places = [paddle.base.CPUPlace()] - if core.is_compiled_with_cuda(): - self.places.append(paddle.base.CUDAPlace(0)) + self.places = get_places() def initParams(self): self.batch_size = 20 @@ -275,9 +270,7 @@ class TestClassCenterSampleAPIError1(unittest.TestCase): def setUp(self): self.initParams() np.random.seed(self.seed) - self.places = [paddle.base.CPUPlace()] - if core.is_compiled_with_cuda(): - self.places.append(paddle.base.CUDAPlace(0)) + self.places = get_places() def initParams(self): self.batch_size = 5 diff --git a/test/legacy_test/test_combinations.py b/test/legacy_test/test_combinations.py index 1390fa90265895..f2f0e49fdd2748 100644 --- a/test/legacy_test/test_combinations.py +++ b/test/legacy_test/test_combinations.py @@ -16,7 +16,7 @@ from itertools import combinations, combinations_with_replacement import numpy as np -from op_test import get_places +from op_test import get_devices import paddle from paddle.base import Program @@ -47,7 +47,7 @@ def setUp(self): self.modify_setting() self.x_np = np.random.random(self.x_shape).astype(self.dtype_np) - self.place = get_places(string_format=True) + self.place = get_devices() def init_setting(self): self.dtype_np = 'float64' @@ -120,7 +120,7 @@ def modify_setting(self): class TestCombinationsEmpty(unittest.TestCase): def setUp(self): - self.place = get_places(string_format=True) + self.place = get_devices() def test_dygraph(self): paddle.disable_static() diff --git a/test/legacy_test/test_compat_split.py b/test/legacy_test/test_compat_split.py new file mode 100644 index 00000000000000..8410e10e1e1caf --- /dev/null +++ b/test/legacy_test/test_compat_split.py @@ -0,0 +1,177 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
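
This new file tests paddle.compat.split, whose integer form follows the torch convention encoded in _compare_with_origin: the int is the size of each chunk along dim, with a smaller trailing chunk when the axis length is not evenly divisible, while paddle.split reads an int as the number of equal sections. A small illustration of that difference:

    import paddle
    from paddle.compat import split

    x = paddle.arange(10)
    parts = split(x, 4)           # chunk size 4 -> shapes [4], [4], [2]
    halves = paddle.split(x, num_or_sections=2, axis=0)  # 2 equal sections -> [5], [5]
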
+ +import unittest + +import numpy as np + +import paddle +from paddle.compat import split + + +class TestCompatSplit(unittest.TestCase): + def _compare_with_origin(self, input_tensor, size, axis=0): + pd_results = split(input_tensor, size, dim=axis) + + if isinstance(size, int): + shape_on_axis = input_tensor.shape[axis] + remaining_num = shape_on_axis % size + num_sections = shape_on_axis // size + if remaining_num == 0: + size = num_sections + else: + size = [size for _ in range(num_sections)] + size.append(remaining_num) + + origin_results = paddle.split( + input_tensor, num_or_sections=size, axis=axis + ) + + self.assertEqual(len(origin_results), len(pd_results)) + + # check shape and output section size of the output + for origin_ts, pd_ts in zip(origin_results, pd_results): + np.testing.assert_allclose(origin_ts.numpy(), pd_ts.numpy()) + + def test_basic_split(self): + """Test basic splitting with integer size""" + data = paddle.arange(12).reshape([3, 4]).astype('float32') + self._compare_with_origin(data, 1, 0) + self._compare_with_origin(data, 2, 1) + + def test_split_with_list_sections(self): + """Test splitting with list of section sizes""" + data = paddle.rand([10, 5]) + self._compare_with_origin(data, [3, 2, 5], 0) + self._compare_with_origin(data, [1, 4], -1) + + def test_chained_operations(self): + """Test split with complex operation chain""" + x = paddle.rand([8, 12]) + y = paddle.sin(x) * 2.0 + paddle.exp(x) / 3.0 + z = paddle.nn.functional.relu(y) + + z1, z2 = split(z, 7, dim=1) + + self.assertEqual(z1.shape, [8, 7]) + self.assertEqual(z2.shape, [8, 5]) + + z_np = z.numpy() + np.testing.assert_allclose(z_np[:, :7], z1.numpy()) + np.testing.assert_allclose(z_np[:, 7:], z2.numpy()) + + def test_split_grad(self): + """Test backprop for split, in1 and in2 are computed by + compat.split and original split""" + + def get_tensors(): + np.random.seed(114514) + np_arr = np.random.normal(0, 1, [2, 3, 4, 5]) + return paddle.to_tensor(np_arr), paddle.to_tensor(np_arr) + + in1, in2 = get_tensors() + in1.stop_gradient = False + in2.stop_gradient = False + + def computation_graph(in_tensor): + y = in_tensor * 2.3 + 3.0 + y = paddle.maximum(y, paddle.to_tensor([0], dtype=paddle.float32)) + return y.mean(axis=0) + + out1 = computation_graph(in1) + out2 = computation_graph(in2) + + packs1 = paddle.compat.split(out1, 2, dim=2) + packs2 = paddle.split(out2, [2, 2, 1], axis=2) + + res1 = packs1[0] + packs1[1] + packs1[2] + res2 = packs2[0] + packs2[1] + packs2[2] + res1.backward() + res2.backward() + np.testing.assert_allclose(in1.grad.numpy(), in2.grad.numpy()) + + def test_empty_dim(self): + """Split with empty dim""" + in_tensor = paddle.arange(72, dtype=paddle.int64).reshape([3, 12, 2]) + self._compare_with_origin(in_tensor, [5, 0, 7], axis=1) + + def test_split_with_one_block(self): + """Resulting tuple should be of length 1""" + in_tensor = paddle.arange(60, dtype=paddle.float32).reshape([3, 4, 5]) + self._compare_with_origin(in_tensor, 5, paddle.to_tensor([-1])) + self._compare_with_origin(in_tensor, [5], paddle.to_tensor(2)) + + def test_edge_cases(self): + """Test edge cases and error handling""" + x = paddle.arange(5) + s1, s2 = split(x, [3, 2]) + np.testing.assert_allclose(s1.numpy(), [0, 1, 2]) + np.testing.assert_allclose(s2.numpy(), [3, 4]) + + x = paddle.rand([2, 2, 2]) + a, b = split(x, 1, 2) + self.assertEqual(a.shape, [2, 2, 1]) + + # invalid split sections + with self.assertRaises(ValueError): + split(x, [3, 1], 1) + + # invalid split axis + with 
self.assertRaises(ValueError): + split(x, 2, 3) + + def test_error_hint(self): + """Test whether there will be correct exception when users pass paddle.split kwargs in paddle.compat.split, vice versa.""" + x = paddle.randn([3, 9, 5]) + + msg_gt_1 = ( + "paddle.split() received unexpected keyword arguments 'tensor', 'split_size_or_sections', 'dim'. " + "\nDid you mean to use paddle.compat.split() instead?" + ) + msg_gt_2 = ( + "paddle.compat.split() received unexpected keyword argument 'num_or_sections'. " + "\nDid you mean to use paddle.split() instead?" + ) + msg_gt_3 = "(InvalidArgument) The dim is expected to be in range of [-3, 3), but got 3" + msg_gt_4 = "paddle.compat.split expects split_sizes have only non-negative entries, but got size = -5 on dim 2" + + split_size = paddle.to_tensor([3]) + msg_gt_5 = ( + "The type of 'split_size_or_sections' in split must be int, list or tuple in imperative mode, but " + f"received {type(split_size)}." + ) + + with self.assertRaises(TypeError) as cm: + tensors = paddle.split(tensor=x, split_size_or_sections=3, dim=0) + self.assertEqual(str(cm.exception), msg_gt_1) + + with self.assertRaises(TypeError) as cm: + tensors = split(x, num_or_sections=3, dim=0) + self.assertEqual(str(cm.exception), msg_gt_2) + + with self.assertRaises(ValueError) as cm: + tensors = split(x, 3, dim=3) + self.assertEqual(str(cm.exception), msg_gt_3) + + with self.assertRaises(ValueError) as cm: + tensors = split(x, [3, 3, -5], -2) + self.assertEqual(str(cm.exception), msg_gt_4) + + with self.assertRaises(TypeError) as cm: + tensors = split(x, split_size, 1) + self.assertEqual(str(cm.exception), msg_gt_5) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/legacy_test/test_compat_split_static.py b/test/legacy_test/test_compat_split_static.py new file mode 100644 index 00000000000000..f685121aabd750 --- /dev/null +++ b/test/legacy_test/test_compat_split_static.py @@ -0,0 +1,221 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
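
The static-graph companion file below repeats one scaffold throughout: build the program under program_guard, call split inside it, then execute with a static Executor. Condensed to its essentials, with an illustrative feed shape:

    import numpy as np
    import paddle
    from paddle.compat import split

    paddle.enable_static()
    with paddle.static.program_guard(paddle.static.Program()):
        x = paddle.static.data(name='x', shape=[None, 6], dtype='float32')
        a, b = split(x, split_size_or_sections=[3, 3], dim=1)
        exe = paddle.static.Executor(paddle.CPUPlace())
        out = exe.run(feed={'x': np.random.rand(2, 6).astype('float32')},
                      fetch_list=[a, b])
    paddle.disable_static()
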
+ +import unittest + +import numpy as np + +import paddle +from paddle.compat import split + + +class TestCompatSplitStatic(unittest.TestCase): + def _compare_with_origin_static( + self, input_shape, size, axis=0, dim_rank=-1 + ): + """size_dim: -1 means we input size by int, 0 means 0-size tensor, 1 means tensor with shape [1]""" + numel = 1 + for v in input_shape: + numel *= v + input_axis = axis + if dim_rank == 0: + input_axis = paddle.to_tensor(axis) + elif dim_rank == 1: + input_axis = paddle.to_tensor([axis]) + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + input_tensor = paddle.arange(numel, dtype=paddle.float32).reshape( + input_shape + ) + pd_results = split(input_tensor, size, dim=input_axis) + + if isinstance(size, int): + shape_on_axis = input_tensor.shape[axis] + remaining_num = shape_on_axis % size + num_sections = shape_on_axis // size + if remaining_num == 0: + size = num_sections + else: + size = [size for _ in range(num_sections)] + size.append(remaining_num) + + origin_results = paddle.split( + input_tensor, num_or_sections=size, axis=axis + ) + assert len(pd_results) == len(origin_results), "length mismatched" + place = ( + paddle.CUDAPlace(0) + if paddle.is_compiled_with_cuda() + else paddle.CPUPlace() + ) + exe = paddle.static.Executor(place) + results = exe.run(fetch_list=[*origin_results, *pd_results]) + length_needed = len(results) // 2 + for i in range(length_needed): + np.testing.assert_allclose( + results[i], results[i + length_needed] + ) + paddle.disable_static() + + def test_split_composite_static(self): + paddle.seed(114514) + + def get_tensors(): + np.random.seed(114514) + np_arr = np.random.normal(0, 1, [2, 3, 4, 5]) + return paddle.to_tensor(np_arr), paddle.to_tensor(np_arr) + + in1, in2 = get_tensors() + in1.stop_gradient = False + in2.stop_gradient = False + + @paddle.jit.to_static + def computation_graph(in1: paddle.Tensor, in2: paddle.Tensor): + y1 = in1 * 1.5 + 1.0 + y1 = paddle.minimum(y1, paddle.to_tensor([0], dtype=paddle.float32)) + out1 = y1.mean(axis=0) + + y2 = in2 * 1.5 + 1.0 + y2 = paddle.minimum(y2, paddle.to_tensor([0], dtype=paddle.float32)) + out2 = y2.mean(axis=0) + + packs1 = paddle.compat.split(out1, 2, dim=2) + packs2 = paddle.split(out2, [2, 2, 1], axis=2) + + res1 = packs1[0] + packs1[1] + packs1[2] + res2 = packs2[0] + packs2[1] + packs2[2] + + return res1, res2 + + res1, res2 = computation_graph(in1, in2) + np.testing.assert_allclose(res1.numpy(), res2.numpy()) + + def test_static_graph(self): + """Test static graph execution""" + # fixed random seed for reproducibility + np.random.seed(114514) + # old static graph mode + paddle.enable_static() + + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.static.data(name='x', shape=[None, 6], dtype='float32') + result0, result1 = split(x, split_size_or_sections=[3, 3], dim=1) + output = result0 * 2.0 + paddle.sin(result1) + + place = ( + paddle.CUDAPlace(0) + if paddle.is_compiled_with_cuda() + else paddle.CPUPlace() + ) + exe = paddle.static.Executor(place) + + input_data = np.random.rand(3, 6).astype('float32') + feed = {'x': input_data} + + results = exe.run(feed=feed, fetch_list=[result0, result1, output]) + + pd_result0, pd_result1 = results[0], results[1] + np.testing.assert_allclose(input_data[:, :3], pd_result0) + np.testing.assert_allclose(input_data[:, 3:], pd_result1) + + expected_output = input_data[:, :3] * 2.0 + np.sin( + input_data[:, 3:] + ) + np.testing.assert_allclose( + expected_output, results[2], 
rtol=1e-4, atol=1e-4 + ) + + paddle.disable_static() + + def test_static_graph_2(self): + """Test static graph execution with a Tensor-typed dim""" + np.random.seed(114514) + axis = paddle.to_tensor(-1) + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.static.data(name='x', shape=[None, 9], dtype='float32') + result0, result1, result2 = split(x, 4, dim=axis) + output = result0 + result1 * result2 + + place = ( + paddle.CUDAPlace(0) + if paddle.is_compiled_with_cuda() + else paddle.CPUPlace() + ) + exe = paddle.static.Executor(place) + + input_data = np.random.rand(3, 9).astype('float32') + feed = {'x': input_data} + + results = exe.run( + feed=feed, fetch_list=[result0, result1, result2, output] + ) + + np.testing.assert_allclose(input_data[:, 0:4], results[0]) + np.testing.assert_allclose(input_data[:, 4:8], results[1]) + np.testing.assert_allclose(input_data[:, 8:9], results[2]) + + expected_output = ( + input_data[:, 0:4] + input_data[:, 4:8] * input_data[:, -1:] + ) + np.testing.assert_allclose( + expected_output, results[3], rtol=1e-4, atol=1e-4 + ) + + paddle.disable_static() + + def test_error_hint(self): + """Test that the correct exceptions are raised for invalid split arguments in static graph mode.""" + + msg_gt_1 = "split_size_or_sections must be greater than 0." + msg_gt_2 = "len(split_size_or_sections) must not be more than input.shape[dim]." + msg_gt_3 = "The type of 'split_size_or_sections' in split must be int, list or tuple in imperative mode." + msg_gt_4 = ( + "'dim' is not allowed to be a pir.Value in a static graph: " + "\npir.Value can not be used for indexing python lists/tuples." + ) + + paddle.enable_static() + with self.assertRaises(AssertionError) as cm: + x = paddle.randn([3, 4, 5]) + tensors = split(x, -2, dim=0) + self.assertEqual(str(cm.exception), msg_gt_1) + + with self.assertRaises(AssertionError) as cm: + x = paddle.randn([3, 4, 5]) + tensors = split(x, (1, 1, 1, 1, 2, 2), dim=-1) + self.assertEqual(str(cm.exception), msg_gt_2) + + with self.assertRaises(TypeError) as cm: + x = paddle.randn([3, 4, 5]) + tensors = split(x, paddle.to_tensor(2), dim=2) + self.assertEqual(str(cm.exception), msg_gt_3) + + with self.assertRaises(TypeError) as cm: + x = paddle.randn([3, 4, 5]) + tensors = split(x, 2, dim=paddle.to_tensor(2)) + paddle.disable_static() + self.assertEqual(str(cm.exception), msg_gt_4) + + def test_basic_split(self): + """Test basic splitting with integer size""" + input_shape = [3, 6] + self._compare_with_origin_static(input_shape, 1, 0) + self._compare_with_origin_static(input_shape, 3, -1) + self._compare_with_origin_static(input_shape, 4, dim_rank=0) + self._compare_with_origin_static(input_shape, 3, dim_rank=1) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/legacy_test/test_cross_op.py b/test/legacy_test/test_cross_op.py index 573021b0d07f88..601bb87927cef5 100644 --- a/test/legacy_test/test_cross_op.py +++ b/test/legacy_test/test_cross_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import OpTest, convert_float_to_uint16 +from op_test import OpTest, convert_float_to_uint16, is_custom_device import paddle from paddle import base @@ -77,7 +77,8 @@ def init_output(self): @unittest.skipIf( - not core.is_compiled_with_cuda(), "core is not compiled with CUDA" + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", ) class TestCrossFP16Op(TestCrossOp): def initTestCase(self): diff --git
a/test/legacy_test/test_determinant_op.py b/test/legacy_test/test_determinant_op.py index 7301fbeafd0610..1362f4a6dd30a9 100644 --- a/test/legacy_test/test_determinant_op.py +++ b/test/legacy_test/test_determinant_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import OpTest +from op_test import OpTest, get_places import paddle @@ -430,9 +430,7 @@ def setUp(self): self.x = np.vectorize(complex)( np.random.random(self.shape), np.random.random(self.shape) ).astype(self.dtype) - self.places = [paddle.CPUPlace()] - if paddle.base.core.is_compiled_with_cuda(): - self.places.append(paddle.CUDAPlace(0)) + self.places = get_places() self.out_grad = ( np.array([1 + 0j, 1 + 0j] * 3 * 3) .reshape(2, 3, 3) @@ -502,9 +500,7 @@ def setUp(self): self.x = np.vectorize(complex)( np.random.random(self.shape), np.random.random(self.shape) ).astype(self.dtype) - self.places = [paddle.CPUPlace()] - if paddle.base.core.is_compiled_with_cuda(): - self.places.append(paddle.CUDAPlace(0)) + self.places = get_places() self.out_grad = np.array([3 + 0j, 3 + 0j] * 6).reshape(2, 6) self.x_grad_ref_dy = self.get_numeric_grad( self.x, self.shape, self.out_grad diff --git a/test/legacy_test/test_dropout_op.py b/test/legacy_test/test_dropout_op.py index 170ee389b552f6..81cccded682c89 100644 --- a/test/legacy_test/test_dropout_op.py +++ b/test/legacy_test/test_dropout_op.py @@ -919,6 +919,121 @@ def test_dygraph(self): ) +class TestDropout1DFAPI(unittest.TestCase): + def setUp(self): + np.random.seed(123) + self.places = get_places() + + def check_static_result( + self, place, input_name, input_shape, training=False, p=0.0 + ): + paddle.enable_static() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + with paddle.static.program_guard(main_prog, startup_prog): + input_var = paddle.static.data( + name=input_name, shape=input_shape, dtype="float32" + ) + res = paddle.nn.functional.dropout1d( + input=input_var, p=p, training=training + ) + in_np = np.random.random(input_shape).astype("float32") + exe = base.Executor(place) + fetches = exe.run( + main_prog, + feed={input_name: in_np}, + fetch_list=[res], + ) + + np.testing.assert_allclose(fetches[0], in_np, rtol=1e-05) + + def test_static(self): + for place in self.places: + self.check_static_result( + place=place, + input_name="input_2d", + input_shape=[3, 4], + training=False, + p=0.0, + ) + + self.check_static_result( + place=place, + input_name="input_3d", + input_shape=[2, 3, 4], + training=False, + p=0.0, + ) + + self.check_static_result( + place=place, + input_name="input_2d_1", + input_shape=[3, 4], + training=False, + p=1.0, + ) + + self.check_static_result( + place=place, + input_name="input_3d_1", + input_shape=[2, 3, 4], + training=False, + p=1.0, + ) + + def test_dygraph(self): + for place in self.places: + with base.dygraph.guard(place): + # Test 2D input + in_np_2d = np.random.random([3, 4]).astype("float32") + input_2d = paddle.to_tensor(in_np_2d) + res1 = paddle.nn.functional.dropout1d( + input=input_2d, p=0.0, training=False + ) + np.testing.assert_allclose(res1.numpy(), in_np_2d, rtol=1e-05) + + # Test 3D input + in_np_3d = np.random.random([2, 3, 4]).astype("float32") + input_3d = paddle.to_tensor(in_np_3d) + res2 = paddle.nn.functional.dropout1d( + input=input_3d, p=0.0, training=False + ) + np.testing.assert_allclose(res2.numpy(), in_np_3d, rtol=1e-05) + + +class TestDropout1DFAPIError(unittest.TestCase): + def test_errors(self): + paddle.enable_static() + main_prog = paddle.static.Program() + 
startup_prog = paddle.static.Program() + with paddle.static.program_guard(main_prog, startup_prog): + + def test_xdim_1d(): + # dimensions of x should be 2 or 3 + x = paddle.static.data(name='x1', shape=[4], dtype="float32") + paddle.nn.functional.dropout1d(x) + + self.assertRaises(RuntimeError, test_xdim_1d) + + def test_xdim_4d(): + # dimensions of x should be 2 or 3 + x = paddle.static.data( + name='x2', shape=[2, 3, 4, 5], dtype="float32" + ) + paddle.nn.functional.dropout1d(x) + + self.assertRaises(RuntimeError, test_xdim_4d) + + def test_prob_range(): + # p should be in [0, 1] + x = paddle.static.data( + name='x3', shape=[2, 3, 4], dtype="float32" + ) + paddle.nn.functional.dropout1d(x, p=1.5) + + self.assertRaises(ValueError, test_prob_range) + + class TestDropout2DFAPI(unittest.TestCase): def setUp(self): np.random.seed(123) @@ -1404,6 +1519,12 @@ def test_p_tensor(self): np.testing.assert_array_equal(static_res, dygraph_res) +class TestDropOut1DWithProbTensor(TestDropOutWithProbTensor): + def init_info(self): + self.shape = [2, 3, 4] + self.api = paddle.nn.functional.dropout1d + + class TestDropOut2DWithProbTensor(TestDropOutWithProbTensor): def init_info(self): self.shape = [2, 3, 10, 10] diff --git a/test/legacy_test/test_elementwise_mod_op.py b/test/legacy_test/test_elementwise_mod_op.py index 982b1a310093b9..3620215c186114 100644 --- a/test/legacy_test/test_elementwise_mod_op.py +++ b/test/legacy_test/test_elementwise_mod_op.py @@ -16,7 +16,12 @@ import unittest import numpy as np -from op_test import OpTest, convert_float_to_uint16, convert_uint16_to_float +from op_test import ( + OpTest, + convert_float_to_uint16, + convert_uint16_to_float, + is_custom_device, +) from utils import dygraph_guard, static_guard import paddle @@ -124,7 +129,8 @@ def test_check_output(self): @unittest.skipIf( - not core.is_compiled_with_cuda(), "core is not compiled with CUDA" + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", ) class TestElementwiseModFP16Op(TestElementwiseModOp): def init_dtype(self): diff --git a/test/legacy_test/test_elementwise_mul_op.py b/test/legacy_test/test_elementwise_mul_op.py index 9f4fcb43bec869..a4f365ea92b1a8 100644 --- a/test/legacy_test/test_elementwise_mul_op.py +++ b/test/legacy_test/test_elementwise_mul_op.py @@ -15,7 +15,12 @@ import unittest import numpy as np -from op_test import OpTest, convert_float_to_uint16, skip_check_grad_ci +from op_test import ( + OpTest, + convert_float_to_uint16, + is_custom_device, + skip_check_grad_ci, +) import paddle from paddle import base @@ -472,7 +477,8 @@ def init_input_attr_output(self): @unittest.skipIf( - not core.is_compiled_with_cuda(), "core is not compiled with CUDA" + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", ) class TestElementwiseMulOpFp16(ElementwiseMulOp): def init_dtype(self): diff --git a/test/legacy_test/test_embedding_scale_grad_by_freq.py b/test/legacy_test/test_embedding_scale_grad_by_freq.py index 63e408a88422be..e996fc66c41033 100644 --- a/test/legacy_test/test_embedding_scale_grad_by_freq.py +++ b/test/legacy_test/test_embedding_scale_grad_by_freq.py @@ -15,6 +15,7 @@ import unittest import numpy as np +from op_test import get_places import paddle from paddle.nn.functional import embedding @@ -32,9 +33,7 @@ def ref_embedding_scale_grad_(x, weight_unscaled_grad): class TestEmbeddingAPIScaleGradByFreq(unittest.TestCase): def setUp(self): self.init_data() - self.places = [paddle.CPUPlace()] - if 
paddle.core.is_compiled_with_cuda(): - self.places.append(paddle.CUDAPlace(0)) + self.places = get_places() def init_data(self): self.dtype = "float32" diff --git a/test/legacy_test/test_fused_gate_attention_op.py b/test/legacy_test/test_fused_gate_attention_op.py index 43ee9ab844ee08..49f44c7f9b9d40 100644 --- a/test/legacy_test/test_fused_gate_attention_op.py +++ b/test/legacy_test/test_fused_gate_attention_op.py @@ -20,7 +20,12 @@ import unittest import numpy as np -from op_test import OpTest, convert_float_to_uint16, convert_uint16_to_float +from op_test import ( + OpTest, + convert_float_to_uint16, + convert_uint16_to_float, + is_custom_device, +) from test_sparse_attention_op import get_cuda_version import paddle @@ -30,7 +35,8 @@ @unittest.skipIf( - not core.is_compiled_with_cuda(), "Paddle is not compiled with CUDA" + not (core.is_compiled_with_cuda() or is_custom_device()), + "Paddle is not compiled with CUDA", ) class TestFusedGateAttentionOp(OpTest): def setUp(self): @@ -474,7 +480,7 @@ def setUp(self): ] def test_api(self): - if not core.is_compiled_with_cuda(): + if not (core.is_compiled_with_cuda() or is_custom_device()): pass query = paddle.rand(shape=self.query_shape, dtype="float32") diff --git a/test/legacy_test/test_fused_rotary_position_embedding.py b/test/legacy_test/test_fused_rotary_position_embedding.py index ce26cdff7ec858..b3a9ed4a09ffee 100644 --- a/test/legacy_test/test_fused_rotary_position_embedding.py +++ b/test/legacy_test/test_fused_rotary_position_embedding.py @@ -16,6 +16,7 @@ import numpy as np import parameterized as param +from op_test import is_custom_device import paddle from paddle.base import core @@ -158,7 +159,8 @@ def paddle_fused_rotary_position_embedding( @unittest.skipIf( - not core.is_compiled_with_cuda() and not paddle.is_compiled_with_rocm(), + not (core.is_compiled_with_cuda() or is_custom_device()) + and not paddle.is_compiled_with_rocm(), "core is not compiled with CUDA or ROCM ", ) @param.parameterized_class( @@ -693,7 +695,8 @@ def test_error2(): @unittest.skipIf( - not core.is_compiled_with_cuda() and not paddle.is_compiled_with_rocm(), + not (core.is_compiled_with_cuda() or is_custom_device()) + and not paddle.is_compiled_with_rocm(), "core is not compiled with CUDA or ROCM ", ) class TestFusedRotaryPositionEmbeddingZeroSize(unittest.TestCase): diff --git a/test/legacy_test/test_gaussian_random_op.py b/test/legacy_test/test_gaussian_random_op.py index 59b80920234233..c4f860bcc7e973 100644 --- a/test/legacy_test/test_gaussian_random_op.py +++ b/test/legacy_test/test_gaussian_random_op.py @@ -15,7 +15,12 @@ import unittest import numpy as np -from op_test import OpTest, convert_uint16_to_float, paddle_static_guard +from op_test import ( + OpTest, + convert_uint16_to_float, + is_custom_device, + paddle_static_guard, +) import paddle from paddle import base @@ -61,7 +66,8 @@ def verify_output(self, outs): @unittest.skipIf( - not core.is_compiled_with_cuda(), "core is not compiled with CUDA" + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", ) class TestGaussianRandomFP16Op(OpTest): def setUp(self): @@ -111,7 +117,8 @@ def gauss_wrapper(shape, mean, std, seed, dtype=np.uint16, name=None): @unittest.skipIf( - not core.is_compiled_with_cuda(), "core is not compiled with CUDA" + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", ) class TestGaussianRandomBF16Op(OpTest): def setUp(self): diff --git a/test/legacy_test/test_geometric_inplace.py 
b/test/legacy_test/test_geometric_inplace.py index 9b5177eac04b8b..baed59705189aa 100644 --- a/test/legacy_test/test_geometric_inplace.py +++ b/test/legacy_test/test_geometric_inplace.py @@ -16,7 +16,7 @@ import numpy as np import scipy.stats -from op_test import get_places +from op_test import get_devices import paddle @@ -36,7 +36,7 @@ def test_fp64(): tensor_fp64.geometric_(probs=0.3) self.assertEqual(tensor_fp64.dtype, paddle.float64) - for place in get_places(string_format=True): + for place in get_devices(): paddle.set_device(place) test_fp32() test_fp64() @@ -96,7 +96,7 @@ def test_geometric_inplace_distribution(self): class TestGeometricInplaceEmptyTensor(unittest.TestCase): def test_geometric_inplace_op_empty_tensor(self): test_shapes = [(200, 1), (1, 200)] - for place in get_places(string_format=True): + for place in get_devices(): paddle.set_device(place) for test_shape in test_shapes: tensor = paddle.empty(shape=test_shape) @@ -122,7 +122,7 @@ def test_grad(): geometric_grad = tensor_b.grad.numpy() self.assertTrue((geometric_grad == 0).all()) - for place in get_places(string_format=True): + for place in get_devices(): paddle.set_device(place) test_grad() diff --git a/test/legacy_test/test_group_norm_op_v2.py b/test/legacy_test/test_group_norm_op_v2.py index 2ae1a72c2c2b29..1a6c5aeafd8781 100644 --- a/test/legacy_test/test_group_norm_op_v2.py +++ b/test/legacy_test/test_group_norm_op_v2.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import get_places +from op_test import get_places, is_custom_device from utils import dygraph_guard import paddle @@ -243,7 +243,7 @@ def test_numerical_accuracy(self): class TestGroupNormAPIV2_With_General_Dimensions_fp16(unittest.TestCase): def test_numerical_accuracy(self): # fp16 only supported in cuda - if not core.is_compiled_with_cuda(): + if not (core.is_compiled_with_cuda() or is_custom_device()): return paddle.disable_static() shapes = [ @@ -286,7 +286,7 @@ def test_numerical_accuracy(self): class TestGroupNormAPIV2_With_NCL_fp16(unittest.TestCase): def test_numerical_accuracy(self): - if not core.is_compiled_with_cuda(): + if not (core.is_compiled_with_cuda() or is_custom_device()): return paddle.disable_static() shape = (2, 6, 4) @@ -327,7 +327,7 @@ def test_numerical_accuracy(self): class TestGroupNormAPIV2_With_NCDHW_fp16(unittest.TestCase): def test_numerical_accuracy(self): - if not core.is_compiled_with_cuda(): + if not (core.is_compiled_with_cuda() or is_custom_device()): return paddle.disable_static() shape = (2, 6, 4, 2, 2) @@ -368,7 +368,7 @@ def test_numerical_accuracy(self): class TestGroupNormAPIV2_With_NLC_fp16(unittest.TestCase): def test_numerical_accuracy(self): - if not core.is_compiled_with_cuda(): + if not (core.is_compiled_with_cuda() or is_custom_device()): return paddle.disable_static() shape = (2, 4, 6) @@ -409,7 +409,7 @@ def test_numerical_accuracy(self): class TestGroupNormAPIV2_With_NHWC_fp16(unittest.TestCase): def test_numerical_accuracy(self): - if not core.is_compiled_with_cuda(): + if not (core.is_compiled_with_cuda() or is_custom_device()): return paddle.disable_static() shape = (2, 4, 2, 6) @@ -450,7 +450,7 @@ def test_numerical_accuracy(self): class TestGroupNormAPIV2_With_NDHWC_fp16(unittest.TestCase): def test_numerical_accuracy(self): - if not core.is_compiled_with_cuda(): + if not (core.is_compiled_with_cuda() or is_custom_device()): return paddle.disable_static() shape = (2, 4, 2, 2, 6) diff --git a/test/legacy_test/test_imperative_triple_grad.py 
b/test/legacy_test/test_imperative_triple_grad.py index 2cec3112913fd2..a873b58768279e 100644 --- a/test/legacy_test/test_imperative_triple_grad.py +++ b/test/legacy_test/test_imperative_triple_grad.py @@ -16,6 +16,7 @@ from unittest import TestCase import numpy as np +from op_test import get_devices import paddle from paddle import base @@ -327,9 +328,7 @@ def setUp(self): self.input_numpy_dout = None self.input_numpy_ddx = None self.input_numpy_ddy = None - self.places = ["cpu"] - if paddle.is_compiled_with_cuda(): - self.places.append("gpu") + self.places = get_devices() def actual(self): x = paddle.to_tensor( @@ -657,9 +656,7 @@ def setUp(self): self.input_numpy_dout = None self.input_numpy_ddx = None self.input_numpy_ddy = None - self.places = ["cpu"] - if paddle.is_compiled_with_cuda(): - self.places.append("gpu") + self.places = get_devices() def actual(self): x = paddle.to_tensor( @@ -961,9 +958,7 @@ def setUp(self): self.input_numpy_dout = None self.input_numpy_ddx = None self.input_numpy_ddy = None - self.places = ["cpu"] - if paddle.is_compiled_with_cuda(): - self.places.append("gpu") + self.places = get_devices() def actual(self): x = paddle.to_tensor( diff --git a/test/legacy_test/test_index_add_op.py b/test/legacy_test/test_index_add_op.py index c98652902aa845..b3383e1ce14cef 100644 --- a/test/legacy_test/test_index_add_op.py +++ b/test/legacy_test/test_index_add_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import OpTest, convert_float_to_uint16 +from op_test import OpTest, convert_float_to_uint16, get_devices import paddle from paddle.base import core @@ -199,10 +199,7 @@ def setType(self): self.index_type = np.int32 def setPlace(self): - self.place = [] - self.place.append('cpu') - if paddle.is_compiled_with_cuda(): - self.place.append('gpu') + self.place = get_devices() def config(self): self.axis = 0 diff --git a/test/legacy_test/test_index_fill.py b/test/legacy_test/test_index_fill.py index 32035caa8c3975..147439e7aa929d 100644 --- a/test/legacy_test/test_index_fill.py +++ b/test/legacy_test/test_index_fill.py @@ -16,7 +16,7 @@ from itertools import combinations import numpy as np -from op_test import get_places +from op_test import get_devices import paddle from paddle.base import Program @@ -44,7 +44,7 @@ def setUp(self): self.index_type ) - self.place = get_places(string_format=True) + self.place = get_devices() if self.dtype_np == 'float16' and 'cpu' in self.place: self.place.remove('cpu') @@ -150,7 +150,7 @@ def setUp(self): self.index_type ) - self.place = get_places(string_format=True) + self.place = get_devices() if self.dtype_np == 'float16' and 'cpu' in self.place: self.place.remove('cpu') diff --git a/test/legacy_test/test_index_put_op.py b/test/legacy_test/test_index_put_op.py index 8ef3499026e2b3..722742f2e84f97 100644 --- a/test/legacy_test/test_index_put_op.py +++ b/test/legacy_test/test_index_put_op.py @@ -16,7 +16,7 @@ import unittest import numpy as np -from op_test import get_places +from op_test import get_devices import paddle @@ -120,7 +120,7 @@ def init_dtype_type(self): self.accumulate = False def setPlace(self): - self.place = get_places(string_format=True) + self.place = get_devices() if self.dtype_np is np.float16 and "cpu" in self.place: self.place.remove("cpu") @@ -620,7 +620,7 @@ def init_dtype_type(self): self.accumulate = False def setPlace(self): - self.place = get_places(string_format=True) + self.place = get_devices() def test_dygraph_forward(self): paddle.disable_static() @@ -661,7 +661,7 @@ def setUp(self): 
self.setPlace() def setPlace(self): - self.place = get_places(string_format=True) + self.place = get_devices() def test_backward(self): paddle.disable_static() @@ -1019,7 +1019,7 @@ def init_dtype_type(self): self.index_type_pd = paddle.int64 def setPlace(self): - self.place = get_places(string_format=True) + self.place = get_devices() if self.dtype_np is np.float16 and "cpu" in self.place: self.place.remove("cpu") diff --git a/test/legacy_test/test_inplace.py b/test/legacy_test/test_inplace.py index 41ea4ebbf7625d..fa176448470075 100755 --- a/test/legacy_test/test_inplace.py +++ b/test/legacy_test/test_inplace.py @@ -2090,9 +2090,7 @@ def test_broadcast_error(self): class TestDygraphInplaceSet(unittest.TestCase): def setUp(self): self.init_data() - self.places = [paddle.CPUPlace()] - if paddle.base.core.is_compiled_with_cuda(): - self.places.append(paddle.CUDAPlace(0)) + self.places = get_places() self.support_dtypes = [ 'float32', 'float64', @@ -2274,7 +2272,7 @@ def leaf_inplace_error(): class TestDygraphInplaceSetFP16(TestDygraphInplaceSet): def setUp(self): self.init_data() - self.places = [paddle.CUDAPlace(0)] + self.places = get_places() def init_data(self): self.x_np = np.random.uniform(-5, 5, [7, 20, 2]) @@ -2304,7 +2302,7 @@ def test_inplace_api(self): class TestDygraphInplaceSetBF16(TestDygraphInplaceSet): def setUp(self): self.init_data() - self.places = [paddle.CUDAPlace(0)] + self.places = get_places() def init_data(self): self.x_np = np.random.uniform(-5, 5, [7, 20, 2]) @@ -2329,9 +2327,7 @@ def test_inplace_api(self): class TestDygraphInplaceResize(unittest.TestCase): def setUp(self): self.init_data() - self.places = [paddle.CPUPlace()] - if paddle.base.core.is_compiled_with_cuda(): - self.places.append(paddle.CUDAPlace(0)) + self.places = get_places() self.support_dtypes = [ 'float32', 'float64', @@ -2444,7 +2440,7 @@ def argument_error(): class TestDygraphInplaceResizeFP16(TestDygraphInplaceResize): def setUp(self): self.init_data() - self.places = [paddle.CUDAPlace(0)] + self.places = get_places() def init_data(self): self.x_np = np.random.uniform(-5, 5, [3, 10, 2]) @@ -2472,7 +2468,7 @@ def test_inplace_api(self): class TestDygraphInplaceResizeBF16(TestDygraphInplaceResize): def setUp(self): self.init_data() - self.places = [paddle.CUDAPlace(0)] + self.places = get_places() def init_data(self): self.x_np = np.random.uniform(-5, 5, [3, 10, 2]) diff --git a/test/legacy_test/test_ldexp.py b/test/legacy_test/test_ldexp.py index d4edd57e0cb39f..47d3025cd047bc 100644 --- a/test/legacy_test/test_ldexp.py +++ b/test/legacy_test/test_ldexp.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import get_places +from op_test import get_devices, get_places import paddle @@ -86,7 +86,7 @@ def check_dtype(input, desired_dtype): class TestLdexpAPIWithDynamic(unittest.TestCase): def setUp(self): - self.places = get_places(string_format=True) + self.places = get_devices() def test_ldexp_dynamic(self): np.random.seed(7) @@ -136,7 +136,7 @@ def test_ldexp_dynamic(self): class TestLdexpAPIWithStatic(unittest.TestCase): def setUp(self): - self.places = get_places(string_format=True) + self.places = get_devices() def test_ldexp_static(self): np.random.seed(7) diff --git a/test/legacy_test/test_linalg_vecdot.py b/test/legacy_test/test_linalg_vecdot.py index 2dafe849ad2bcd..7a251943e6a990 100644 --- a/test/legacy_test/test_linalg_vecdot.py +++ b/test/legacy_test/test_linalg_vecdot.py @@ -17,6 +17,7 @@ import unittest import numpy as np +from op_test import get_places import 
paddle from paddle.base import core @@ -34,9 +35,7 @@ def setUp(self): self.init_config() self.generate_input() self.generate_expected_output() - self.places = [paddle.CPUPlace()] - if paddle.is_compiled_with_cuda(): - self.places.append(paddle.CUDAPlace(0)) + self.places = get_places() def generate_input(self): np.random.seed(123) diff --git a/test/legacy_test/test_log_normal_inplace.py b/test/legacy_test/test_log_normal_inplace.py index 5cb29367ee7929..e2b25289a34128 100644 --- a/test/legacy_test/test_log_normal_inplace.py +++ b/test/legacy_test/test_log_normal_inplace.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import get_places +from op_test import get_devices import paddle @@ -44,7 +44,7 @@ def test_fp64(): tensor_fp64.log_normal_() self.assertEqual(tensor_fp64.dtype, paddle.float64) - places = get_places(string_format=True) + places = get_devices() for place in places: paddle.set_device(place) test_fp32() @@ -105,7 +105,7 @@ def test_log_normal_inplace_op_distribution(self): class TestLogNormalRandomInplaceOpEmptyTensor(unittest.TestCase): def test_log_normal_inplace_op_empty_tensor(self): - places = get_places(string_format=True) + places = get_devices() test_shapes = [(200, 0), (0, 200)] for place in places: paddle.set_device(place) @@ -133,7 +133,7 @@ def test_grad(): log_normal_grad = tensor_b.grad.numpy() self.assertTrue((log_normal_grad == 0).all()) - places = get_places(string_format=True) + places = get_devices() for place in places: paddle.set_device(place) test_grad() diff --git a/test/legacy_test/test_margin_cross_entropy_op.py b/test/legacy_test/test_margin_cross_entropy_op.py index e8f3de35941639..e7bbb93e7a072f 100644 --- a/test/legacy_test/test_margin_cross_entropy_op.py +++ b/test/legacy_test/test_margin_cross_entropy_op.py @@ -15,7 +15,13 @@ import unittest import numpy as np -from op_test import OpTest, convert_float_to_uint16, paddle_static_guard +from op_test import ( + OpTest, + convert_float_to_uint16, + get_places, + is_custom_device, + paddle_static_guard, +) import paddle from paddle.base import core @@ -329,16 +335,15 @@ def test_check_grad(self): @unittest.skipIf( - not core.is_compiled_with_cuda(), "core is not compiled with CUDA" + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", ) class TestMarginCrossEntropyOpV2(unittest.TestCase): def setUp(self): self.initParams() np.random.seed(self.seed) paddle.framework.random._manual_program_seed(self.seed) - self.places = [] - if core.is_compiled_with_cuda(): - self.places.append(paddle.base.CUDAPlace(0)) + self.places = get_places() def initParams(self): self.python_out_sig = ["Loss"] @@ -501,16 +506,15 @@ def init_reduction(self): @unittest.skipIf( - not core.is_compiled_with_cuda(), "core is not compiled with CUDA" + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", ) class TestMarginCrossEntropyOpAPIError(unittest.TestCase): def setUp(self): self.initParams() np.random.seed(self.seed) paddle.framework.random._manual_program_seed(self.seed) - self.places = [] - if core.is_compiled_with_cuda(): - self.places.append(paddle.base.CUDAPlace(0)) + self.places = get_places() def initParams(self): self.python_api = python_api diff --git a/test/legacy_test/test_matmul_0_size_op.py b/test/legacy_test/test_matmul_0_size_op.py index fc3f3c3230044b..795ffb1d9ce89a 100644 --- a/test/legacy_test/test_matmul_0_size_op.py +++ b/test/legacy_test/test_matmul_0_size_op.py @@ -14,13 +14,16 @@ import unittest +from 
op_test import is_custom_device + import paddle from paddle import _C_ops from paddle.base import core @unittest.skipIf( - not core.is_compiled_with_cuda(), "mamtul 0 size only with in cuda" + not (core.is_compiled_with_cuda() or is_custom_device()), + "matmul 0 size only supported with CUDA", ) class TestMatmulDygraph(unittest.TestCase): def test_matmul(self): diff --git a/test/legacy_test/test_max_op.py b/test/legacy_test/test_max_op.py index 64e3cd15362003..741024f8059de4 100644 --- a/test/legacy_test/test_max_op.py +++ b/test/legacy_test/test_max_op.py @@ -156,9 +156,7 @@ def setUp(self): self.expect_res = np.max( self.data, axis=tuple(self.axis), keepdims=self.keepdims ) - self.places = [core.CPUPlace()] - if core.is_compiled_with_cuda(): - self.places.append(core.CUDAPlace(0)) + self.places = get_places() def test_static(self): with static_guard(): diff --git a/test/legacy_test/test_mean_op.py b/test/legacy_test/test_mean_op.py index 464f8852ab3861..01ecd450383ec7 100644 --- a/test/legacy_test/test_mean_op.py +++ b/test/legacy_test/test_mean_op.py @@ -828,9 +828,7 @@ def setUp(self): self.x_np = np.random.randint(-1, 10000, self.x_shape).astype( self.dtype ) - self.places = [paddle.CPUPlace()] - if core.is_compiled_with_cuda(): - self.places.append(paddle.CUDAPlace(0)) + self.places = get_places() def test_dygraph(self): for place in self.places: @@ -864,9 +862,7 @@ def setUp(self): self.x_np = np.random.randint(-1, 10000, self.x_shape).astype( self.dtype ) - self.places = [paddle.CPUPlace()] - if core.is_compiled_with_cuda(): - self.places.append(paddle.CUDAPlace(0)) + self.places = get_places() class TestMeanAPIBool(TestMeanAPIInt32): @@ -874,9 +870,7 @@ def setUp(self): self.x_shape = [2, 3, 4, 5] self.dtype = "bool" self.x_np = np.random.uniform(-1, 1, self.x_shape).astype(self.dtype) - self.places = [paddle.CPUPlace()] - if core.is_compiled_with_cuda(): - self.places.append(paddle.CUDAPlace(0)) + self.places = get_places() class TestMeanWithTensorAxis1(TestReduceOPTensorAxisBase): diff --git a/test/legacy_test/test_merged_adam_op.py b/test/legacy_test/test_merged_adam_op.py index e590f7cfa9c900..e474a8978b4fea 100644 --- a/test/legacy_test/test_merged_adam_op.py +++ b/test/legacy_test/test_merged_adam_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import get_places +from op_test import get_devices import paddle from paddle import _C_ops @@ -205,7 +205,7 @@ def run_op(use_merged): def test_main(self): for multi_precision in [False, True]: - for place in get_places(string_format=True): + for place in get_devices(): self.check_with_place(place, multi_precision) diff --git a/test/legacy_test/test_min_op.py b/test/legacy_test/test_min_op.py index ef0cc06b117ab7..f162bfcc347938 100644 --- a/test/legacy_test/test_min_op.py +++ b/test/legacy_test/test_min_op.py @@ -143,9 +143,7 @@ def setUp(self): self.expect_res = np.min( self.data, axis=tuple(self.axis), keepdims=self.keepdims ) - self.places = [core.CPUPlace()] - if core.is_compiled_with_cuda(): - self.places.append(core.CUDAPlace(0)) + self.places = get_places() def test_static(self): with static_guard(): diff --git a/test/legacy_test/test_mode_op.py b/test/legacy_test/test_mode_op.py index 227e966b47c05a..8064c53ac5bd9e 100644 --- a/test/legacy_test/test_mode_op.py +++ b/test/legacy_test/test_mode_op.py @@ -15,7 +15,12 @@ import unittest import numpy as np -from op_test import OpTest, convert_float_to_uint16, convert_uint16_to_float +from op_test import ( + OpTest, + convert_float_to_uint16, +
convert_uint16_to_float, + is_custom_device, +) import paddle from paddle import base @@ -121,7 +126,8 @@ def test_check_grad(self): @unittest.skipIf( - not core.is_compiled_with_cuda(), "core is not compiled with CUDA" + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", ) class TestModeFP16Op(TestModeOp): def init_dtype(self): @@ -168,7 +174,8 @@ def init_args(self): @unittest.skipIf( - not core.is_compiled_with_cuda(), "core is not compiled with CUDA" + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", ) class TestModeFP16OpLastdim(TestModeFP16Op): def init_args(self): @@ -177,7 +184,8 @@ def init_args(self): @unittest.skipIf( - not core.is_compiled_with_cuda(), "core is not compiled with CUDA" + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", ) class TestModeBF16OpLastdim(TestModeBF16Op): def init_args(self): diff --git a/test/legacy_test/test_momentum_op.py b/test/legacy_test/test_momentum_op.py index fb68dc9d91a23c..ec7411770ff3a9 100644 --- a/test/legacy_test/test_momentum_op.py +++ b/test/legacy_test/test_momentum_op.py @@ -16,7 +16,7 @@ import numpy as np from op import Operator -from op_test import OpTest, get_places +from op_test import OpTest, get_devices, get_places import paddle from paddle import base @@ -1036,7 +1036,7 @@ def _check_with_param_group(self, place, use_amp): np.testing.assert_allclose(params1[idx], params2[idx], rtol=1e-05) def test_main(self): - for place in get_places(string_format=True): + for place in get_devices(): use_amp_list = [True, False] for use_amp in use_amp_list: self._check_with_place_amp(place, use_amp) diff --git a/test/legacy_test/test_msort_op.py b/test/legacy_test/test_msort_op.py new file mode 100644 index 00000000000000..aac9e4764e2702 --- /dev/null +++ b/test/legacy_test/test_msort_op.py @@ -0,0 +1,74 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
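
The contract the new msort tests check is a one-liner: paddle.msort(input=x) sorts along the first axis, matching np.sort(x, axis=0). A minimal dygraph sketch of that equivalence, mirroring the assertions in the file below (shapes illustrative):

```python
import numpy as np

import paddle

data = np.random.rand(4, 3).astype("float32")
out = paddle.msort(input=paddle.to_tensor(data))

# msort sorts each column independently, i.e. along axis 0
np.testing.assert_array_equal(out.numpy(), np.sort(data, axis=0))
```
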
+ +import unittest + +import numpy as np + +import paddle +from paddle import base +from paddle.base import core + + +class TestMsortOnCPU(unittest.TestCase): + def setUp(self): + self.place = core.CPUPlace() + + def test_api_0(self): + with base.program_guard(base.Program()): + input = paddle.static.data( + name="input", shape=[2, 3, 4], dtype="float32" + ) + output = paddle.msort(input=input) + exe = base.Executor(self.place) + data = np.array( + [ + [[5, 8, 9, 5], [0, 0, 1, 7], [6, 9, 2, 4]], + [[5, 2, 4, 2], [4, 7, 7, 9], [1, 7, 0, 6]], + ], + dtype='float32', + ) + (result,) = exe.run(feed={'input': data}, fetch_list=[output]) + np_result = np.sort(result, axis=0) + self.assertEqual((result == np_result).all(), True) + + +class TestMsortOnGPU(TestMsortOnCPU): + def setUp(self): + if core.is_compiled_with_cuda(): + self.place = core.CUDAPlace(0) + else: + self.place = core.CPUPlace() + + +class TestMsortDygraph(unittest.TestCase): + def setUp(self): + self.input_data = np.random.rand(10, 10) + if core.is_compiled_with_cuda(): + self.place = core.CUDAPlace(0) + else: + self.place = core.CPUPlace() + + def test_api_0(self): + paddle.disable_static(self.place) + var_x = paddle.to_tensor(self.input_data) + out = paddle.msort(input=var_x) + self.assertEqual( + (np.sort(self.input_data, axis=0) == out.numpy()).all(), True + ) + paddle.enable_static() + + +if __name__ == '__main__': + unittest.main() diff --git a/test/legacy_test/test_multi_label_soft_margin_loss.py b/test/legacy_test/test_multi_label_soft_margin_loss.py index 29cf724d7e69f3..5f4e8b6e33fa55 100644 --- a/test/legacy_test/test_multi_label_soft_margin_loss.py +++ b/test/legacy_test/test_multi_label_soft_margin_loss.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import get_places +from op_test import get_devices import paddle @@ -145,7 +145,7 @@ def test_MultiLabelSoftMarginLoss(self): input = np.random.uniform(0.1, 0.8, size=(5, 5)).astype(np.float64) label = np.random.randint(0, 2, size=(5, 5)).astype(np.float64) - places = get_places(string_format=True) + places = get_devices() reductions = ['sum', 'mean', 'none'] for place in places: for reduction in reductions: diff --git a/test/legacy_test/test_nadam_op.py b/test/legacy_test/test_nadam_op.py index 509eba6dc66176..e84723ffed7e4a 100644 --- a/test/legacy_test/test_nadam_op.py +++ b/test/legacy_test/test_nadam_op.py @@ -16,7 +16,7 @@ from copy import deepcopy import numpy as np -from op_test import OpTest, get_device_place, get_places +from op_test import OpTest, get_device_place, get_devices, get_places import paddle from paddle import base @@ -460,7 +460,7 @@ def _test_nadam_dygraph_place_amp(self, place, use_amp=False): optimizer.clear_grad() def test_main(self): - for place in get_places(string_format=True): + for place in get_devices(): use_amp_list = [True, False] for use_amp in use_amp_list: self._test_nadam_dygraph_place_amp(place, use_amp) diff --git a/test/legacy_test/test_normal_inplace.py b/test/legacy_test/test_normal_inplace.py index 762595bdd52ae8..775d38fdbaff4d 100644 --- a/test/legacy_test/test_normal_inplace.py +++ b/test/legacy_test/test_normal_inplace.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import get_places +from op_test import get_devices import paddle @@ -43,7 +43,7 @@ def test_fp64(): tensor_fp64.normal_() self.assertEqual(tensor_fp64.dtype, paddle.float64) - for place in get_places(string_format=True): + for place in get_devices(): paddle.set_device(place) test_fp32() test_fp64() @@ -64,7 +64,7
@@ def test_fp64(): tensor_fp64.normal_() self.assertEqual(tensor_fp64.dtype, paddle.complex128) - for place in get_places(string_format=True): + for place in get_devices(): paddle.set_device(place) test_fp32() test_fp64() @@ -164,7 +164,7 @@ def test_normal_inplace_op_distribution(self): class TestNormalRandomInplaceOpEmptyTensor(unittest.TestCase): def test_normal_inplace_op_empty_tensor(self): test_shapes = [(200, 0), (0, 200)] - for place in get_places(string_format=True): + for place in get_devices(): paddle.set_device(place) for test_shape in test_shapes: tensor = paddle.empty(shape=test_shape) @@ -190,7 +190,7 @@ def test_grad(): normal_grad = tensor_b.grad.numpy() self.assertTrue((normal_grad == 0).all()) - for place in get_places(string_format=True): + for place in get_devices(): paddle.set_device(place) test_grad() @@ -215,7 +215,7 @@ def test_grad(): self.assertTrue((normal_grad.real == 0).all()) self.assertTrue((normal_grad.imag == 0).all()) - for place in get_places(string_format=True): + for place in get_devices(): paddle.set_device(place) test_grad() diff --git a/test/legacy_test/test_pad3d_op.py b/test/legacy_test/test_pad3d_op.py index 46c3ab42ab99f3..e1ed377e851841 100644 --- a/test/legacy_test/test_pad3d_op.py +++ b/test/legacy_test/test_pad3d_op.py @@ -15,7 +15,12 @@ import unittest import numpy as np -from op_test import OpTest, convert_float_to_uint16, get_places +from op_test import ( + OpTest, + convert_float_to_uint16, + get_places, + is_custom_device, +) import paddle import paddle.nn.functional as F @@ -221,7 +226,8 @@ def test_check_output(self): def create_test_fp16(parent): @unittest.skipIf( - not core.is_compiled_with_cuda(), "core is not compiled with CUDA" + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", ) class TestPad3dFp16(parent): def get_dtype(self): @@ -304,7 +310,8 @@ def test_check_grad_normal(self): # ----------------Pad3d complex64---------------- def create_test_complex64(parent): @unittest.skipIf( - not core.is_compiled_with_cuda(), "core is not compiled with CUDA" + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", ) class TestPad3dComplex64(parent): def get_dtype(self): @@ -344,7 +351,8 @@ def test_check_grad_normal(self): def create_test_complex128(parent): @unittest.skipIf( - not core.is_compiled_with_cuda(), "core is not compiled with CUDA" + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", ) class TestPad3dComplex128(parent): def get_dtype(self): diff --git a/test/legacy_test/test_pow.py b/test/legacy_test/test_pow.py index 087c748337bf67..b3f32797cb43d7 100755 --- a/test/legacy_test/test_pow.py +++ b/test/legacy_test/test_pow.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import get_places +from op_test import get_devices import paddle from paddle.static import Program, program_guard @@ -79,7 +79,7 @@ class TestPowerAPI(unittest.TestCase): """TestPowerAPI.""" def setUp(self): - self.places = get_places(string_format=True) + self.places = get_devices() def test_power(self): """test_power.""" @@ -227,7 +227,7 @@ class TestPowerAPI_ZeroSize(unittest.TestCase): """TestPowerAPI.""" def setUp(self): - self.places = get_places(string_format=True) + self.places = get_devices() def _test_power(self, shape): np.random.seed(7) diff --git a/test/legacy_test/test_pow_op.py b/test/legacy_test/test_pow_op.py index 9cab82ca7f9755..cd8d5200b6b258 100644 --- a/test/legacy_test/test_pow_op.py +++ 
b/test/legacy_test/test_pow_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import OpTest +from op_test import OpTest, get_places import paddle from paddle.framework import core @@ -39,9 +39,7 @@ def setUp(self): self.outputs = { 'Out': np.power(self.inputs['X'], self.attrs["factor"]) } - self.places = [core.CPUPlace()] - if core.is_compiled_with_cuda(): - self.places.append(core.CUDAPlace(0)) + self.places = get_places() def custom_setting(self): self.inputs = { diff --git a/test/legacy_test/test_psroi_pool_op.py b/test/legacy_test/test_psroi_pool_op.py index 1f954aa102ee05..aac28c59297ebe 100644 --- a/test/legacy_test/test_psroi_pool_op.py +++ b/test/legacy_test/test_psroi_pool_op.py @@ -16,7 +16,7 @@ import unittest import numpy as np -from op_test import OpTest, get_places +from op_test import OpTest, get_devices, get_places import paddle @@ -228,7 +228,7 @@ def test_dytype_is_float64(): ) np.testing.assert_allclose(out, expect_out, rtol=1e-05) - places = get_places(string_format=True) + places = get_devices() for place in places: paddle.set_device(place) test_output_size_is_int() @@ -282,7 +282,7 @@ def test_dytype_is_float64(): np.testing.assert_allclose(out, expect_out, rtol=1e-05) paddle.disable_static() - places = get_places(string_format=True) + places = get_devices() for place in places: paddle.set_device(place) test_output_size_is_int() diff --git a/test/legacy_test/test_radam_op.py b/test/legacy_test/test_radam_op.py index 27124e841a58d1..23efcbf887ba25 100644 --- a/test/legacy_test/test_radam_op.py +++ b/test/legacy_test/test_radam_op.py @@ -16,7 +16,7 @@ from copy import deepcopy import numpy as np -from op_test import OpTest, get_device_place, get_places +from op_test import OpTest, get_device_place, get_devices, get_places import paddle from paddle import base @@ -471,7 +471,7 @@ def _test_radam_dygraph_place_amp(self, place, use_amp=False): optimizer.clear_grad() def test_main(self): - for place in get_places(string_format=True): + for place in get_devices(): use_amp_list = [True, False] for use_amp in use_amp_list: self._test_radam_dygraph_place_amp(place, use_amp) diff --git a/test/legacy_test/test_random_seed.py b/test/legacy_test/test_random_seed.py index 2af2bfff71551b..2ef5fdc7e4a23d 100644 --- a/test/legacy_test/test_random_seed.py +++ b/test/legacy_test/test_random_seed.py @@ -16,6 +16,7 @@ import unittest import numpy as np +from op_test import is_custom_device import paddle from paddle import base @@ -51,7 +52,10 @@ def test_generator_uniform_random_dygraph(self): x2_np = x2.numpy() x3_np = x3.numpy() - if not core.is_compiled_with_cuda() and not core.is_compiled_with_xpu(): + if ( + not (core.is_compiled_with_cuda() or is_custom_device()) + and not core.is_compiled_with_xpu() + ): np.testing.assert_allclose(x1_np, x2_np, rtol=1e-05) np.testing.assert_allclose(x_np, x3_np, rtol=1e-05) @@ -85,7 +89,7 @@ def test_generator_uniform_random_static(self): out2_res2 = np.array(out2[1]) if ( - not core.is_compiled_with_cuda() + not (core.is_compiled_with_cuda() or is_custom_device()) and not core.is_compiled_with_xpu() ): np.testing.assert_allclose(out1_res1, out2_res1, rtol=1e-05) @@ -107,7 +111,10 @@ def test_gen_dropout_dygraph(self): y_np = y.numpy() y1_np = y1.numpy() - if not core.is_compiled_with_cuda() and not core.is_compiled_with_xpu(): + if ( + not (core.is_compiled_with_cuda() or is_custom_device()) + and not core.is_compiled_with_xpu() + ): print(">>>>>>> dropout dygraph >>>>>>>") np.testing.assert_allclose(y_np, y1_np, rtol=1e-05) 
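
The guard rewrite repeated across these test files follows a single pattern: tests that previously required CUDA now also run when a custom device plugin is present. A hedged sketch of the pattern (TestSomeFP16Op is a hypothetical class name; is_custom_device is the op_test helper these diffs import, so the sketch assumes it runs inside the test directory):

```python
import unittest

from op_test import is_custom_device

from paddle.base import core


@unittest.skipIf(
    not (core.is_compiled_with_cuda() or is_custom_device()),
    "core is not compiled with CUDA",
)
class TestSomeFP16Op(unittest.TestCase):  # hypothetical example class
    def test_fp16_kernel(self):
        pass  # device-only logic exercised on CUDA or a custom device
```
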
@@ -132,7 +139,10 @@ def test_gen_dropout_static(self): out1_np = np.array(out1[0]) out2_np = np.array(out2[0]) - if not core.is_compiled_with_cuda() and not core.is_compiled_with_xpu(): + if ( + not (core.is_compiled_with_cuda() or is_custom_device()) + and not core.is_compiled_with_xpu() + ): print(">>>>>>> dropout static >>>>>>>") np.testing.assert_allclose(out1_np, out2_np, rtol=1e-05) @@ -153,7 +163,10 @@ def test_generator_gaussian_random_dygraph(self): x2_np = x2.numpy() x3_np = x3.numpy() - if not core.is_compiled_with_cuda() and not core.is_compiled_with_xpu(): + if ( + not (core.is_compiled_with_cuda() or is_custom_device()) + and not core.is_compiled_with_xpu() + ): print(">>>>>>> gaussian random dygraph >>>>>>>") np.testing.assert_allclose(x1_np, x2_np, rtol=1e-05) np.testing.assert_allclose(x_np, x3_np, rtol=1e-05) @@ -188,7 +201,7 @@ def test_generator_gaussian_random_static(self): out2_res2 = np.array(out2[1]) if ( - not core.is_compiled_with_cuda() + not (core.is_compiled_with_cuda() or is_custom_device()) and not core.is_compiled_with_xpu() ): print(">>>>>>> gaussian random static >>>>>>>") @@ -213,7 +226,10 @@ def test_generator_randint_dygraph(self): x2_np = x2.numpy() x3_np = x3.numpy() - if not core.is_compiled_with_cuda() and not core.is_compiled_with_xpu(): + if ( + not (core.is_compiled_with_cuda() or is_custom_device()) + and not core.is_compiled_with_xpu() + ): print(">>>>>>> randint dygraph >>>>>>>") np.testing.assert_allclose(x1_np, x2_np, rtol=1e-05) np.testing.assert_allclose(x_np, x3_np, rtol=1e-05) @@ -248,7 +264,7 @@ def test_generator_uniform_random_static_1(self): out2_res2 = np.array(out2[1]) if ( - not core.is_compiled_with_cuda() + not (core.is_compiled_with_cuda() or is_custom_device()) and not core.is_compiled_with_xpu() ): np.testing.assert_allclose(out1_res1, out2_res1, rtol=1e-05) @@ -271,7 +287,10 @@ def test_generator_randint_dygraph_1(self): x1_np = x1.numpy() x2_np = x2.numpy() x3_np = x3.numpy() - if not core.is_compiled_with_cuda() and not core.is_compiled_with_xpu(): + if ( + not (core.is_compiled_with_cuda() or is_custom_device()) + and not core.is_compiled_with_xpu() + ): np.testing.assert_allclose(x1_np, x2_np, rtol=1e-05) np.testing.assert_allclose(x_np, x3_np, rtol=1e-05) @@ -305,7 +324,7 @@ def test_generator_ranint_static(self): out2_res2 = np.array(out2[1]) if ( - not core.is_compiled_with_cuda() + not (core.is_compiled_with_cuda() or is_custom_device()) and not core.is_compiled_with_xpu() ): print(">>>>>>> randint static >>>>>>>") @@ -331,7 +350,10 @@ def test_generator_randperm_dygraph(self): x2_np = x2.numpy() x3_np = x3.numpy() - if not core.is_compiled_with_cuda() and not core.is_compiled_with_xpu(): + if ( + not (core.is_compiled_with_cuda() or is_custom_device()) + and not core.is_compiled_with_xpu() + ): print(">>>>>>> randperm dygraph >>>>>>>") np.testing.assert_allclose(x1_np, x2_np, rtol=1e-05) np.testing.assert_allclose(x_np, x3_np, rtol=1e-05) @@ -366,7 +388,7 @@ def test_generator_randperm_static(self): out2_res2 = np.array(out2[1]) if ( - not core.is_compiled_with_cuda() + not (core.is_compiled_with_cuda() or is_custom_device()) and not core.is_compiled_with_xpu() ): print(">>>>>>> randperm static >>>>>>>") diff --git a/test/legacy_test/test_ravel.py b/test/legacy_test/test_ravel.py new file mode 100644 index 00000000000000..05a21e156219d5 --- /dev/null +++ b/test/legacy_test/test_ravel.py @@ -0,0 +1,132 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np + +import paddle +from paddle import base + + +class TestPaddleRavel(unittest.TestCase): + def setUp(self): + self.input_np = np.array([[1, 2, 3], [4, 5, 6]], dtype="float32") + self.input_shape = self.input_np.shape + self.input_dtype = "float32" + self.op_static = lambda x: paddle.ravel(x) + self.op_dygraph = lambda x: paddle.ravel(x) + self.expected = lambda x: x.flatten() + self.places = [None, paddle.CPUPlace()] + + def check_static_result(self, place): + paddle.enable_static() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + with paddle.static.program_guard(main_prog, startup_prog): + input_name = 'input' + input_var = paddle.static.data( + name=input_name, shape=self.input_shape, dtype=self.input_dtype + ) + res = self.op_static(input_var) + exe = base.Executor(place) + fetches = exe.run( + main_prog, + feed={input_name: self.input_np}, + fetch_list=[res], + ) + expect = ( + self.expected(self.input_np) + if callable(self.expected) + else self.expected + ) + np.testing.assert_allclose(fetches[0], expect, rtol=1e-05) + + def test_static(self): + for place in self.places: + self.check_static_result(place=place) + + def check_dygraph_result(self, place): + with base.dygraph.guard(place): + input = paddle.to_tensor(self.input_np, stop_gradient=False) + result = self.op_dygraph(input) + expect = ( + self.expected(self.input_np) + if callable(self.expected) + else self.expected + ) + # check forward + np.testing.assert_allclose(result.numpy(), expect, rtol=1e-05) + + # check backward + paddle.autograd.backward([result]) + np.testing.assert_allclose( + input.grad.numpy(), np.ones_like(self.input_np), rtol=1e-05 + ) + + def test_dygraph(self): + for place in self.places: + self.check_dygraph_result(place=place) + + +class TestPaddleRavel_case1(TestPaddleRavel): + def setUp(self): + # check Ravel 1d + self.input_np = np.array([7, 8, 9], dtype="float32") + self.input_shape = self.input_np.shape + self.input_dtype = "float32" + self.op_static = lambda x: paddle.ravel(x) + self.op_dygraph = lambda x: paddle.ravel(x) + self.expected = lambda x: x.flatten() + self.places = [None, paddle.CPUPlace()] + + +class TestPaddleRavel_case2(TestPaddleRavel): + def setUp(self): + # check Ravel 3d + self.input_np = np.arange(24, dtype="float32").reshape(2, 3, 4) + self.input_shape = self.input_np.shape + self.input_dtype = "float32" + self.op_static = lambda x: paddle.ravel(x) + self.op_dygraph = lambda x: paddle.ravel(x) + self.expected = lambda x: x.flatten() + self.places = [None, paddle.CPUPlace()] + + +class TestPaddleRavel_case3(TestPaddleRavel): + def setUp(self): + # check Ravel 0d (scalar) + self.input_np = np.array(5.0, dtype="float32") + self.input_shape = self.input_np.shape + self.input_dtype = "float32" + self.op_static = lambda x: paddle.ravel(x) + self.op_dygraph = lambda x: paddle.ravel(x) + self.expected = lambda x: x.flatten() + self.places = [None, paddle.CPUPlace()] + + 
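
Each parameterized case above, and the empty-array case that follows, reduces to the same contract: paddle.ravel flattens its input to 1-D in the same element order as numpy's flatten, and its backward fills the input gradient with ones. A minimal dygraph sketch asserting only what these tests assert:

```python
import numpy as np

import paddle

x_np = np.arange(24, dtype="float32").reshape(2, 3, 4)
x = paddle.to_tensor(x_np, stop_gradient=False)

y = paddle.ravel(x)  # shape [24], same element order as x_np.flatten()
np.testing.assert_allclose(y.numpy(), x_np.flatten())

# ravel behaves like a reshape, so the gradient w.r.t. x is all ones
paddle.autograd.backward([y])
np.testing.assert_allclose(x.grad.numpy(), np.ones_like(x_np))
```
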
+class TestPaddleRavel_case4(TestPaddleRavel): + def setUp(self): + # check Ravel empty array + self.input_np = np.array([], dtype="float32").reshape(0, 3) + self.input_shape = self.input_np.shape + self.input_dtype = "float32" + self.op_static = lambda x: paddle.ravel(x) + self.op_dygraph = lambda x: paddle.ravel(x) + self.expected = lambda x: x.flatten() + self.places = [None, paddle.CPUPlace()] + + +if __name__ == "__main__": + unittest.main() diff --git a/test/legacy_test/test_reduce_op.py b/test/legacy_test/test_reduce_op.py index 76b7a4a5b761a7..85e8b036d2b2fd 100644 --- a/test/legacy_test/test_reduce_op.py +++ b/test/legacy_test/test_reduce_op.py @@ -19,6 +19,7 @@ OpTest, convert_float_to_uint16, get_places, + is_custom_device, skip_check_grad_ci, ) from utils import dygraph_guard, static_guard @@ -192,7 +193,8 @@ def test_check_grad(self): def create_test_fp16_class(parent): @unittest.skipIf( - not core.is_compiled_with_cuda(), "core is not compiled with CUDA" + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", ) class TestSumOpFp16(parent): def init_dtype(self): @@ -341,9 +343,7 @@ class TestSumAPIZeroDimKeepDim(unittest.TestCase): def setUp(self): np.random.seed(123) paddle.enable_static() - self.places = [paddle.CPUPlace()] - if paddle.is_compiled_with_cuda(): - self.places.append(paddle.CUDAPlace(0)) + self.places = get_places() def test_static(self): for place in self.places: @@ -2365,9 +2365,7 @@ def setUp(self): "complex64", "complex128", ] - self.places = [base.CPUPlace()] - if core.is_compiled_with_cuda(): - self.places.append(base.CUDAPlace(0)) + self.places = get_places() def calculate_expected_result(self, x_np, axis, keepdim): expected_result = np.all(x_np, axis=axis, keepdims=keepdim) @@ -2454,9 +2452,7 @@ def setUp(self): "complex64", "complex128", ] - self.places = [base.CPUPlace()] - if core.is_compiled_with_cuda(): - self.places.append(base.CUDAPlace(0)) + self.places = get_places() def calculate_expected_result(self, x_np, axis, keepdim): expected_result = np.any(x_np, axis=axis, keepdims=keepdim) diff --git a/test/legacy_test/test_restrict_nonzero.py b/test/legacy_test/test_restrict_nonzero.py index a8d072710f0a7c..62a7607f193491 100644 --- a/test/legacy_test/test_restrict_nonzero.py +++ b/test/legacy_test/test_restrict_nonzero.py @@ -15,13 +15,15 @@ import unittest import numpy as np +from op_test import is_custom_device import paddle from paddle.base import core @unittest.skipIf( - not core.is_compiled_with_cuda(), "core is not compiled with CUDA" + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", ) class TestRestrictNonzero(unittest.TestCase): def test_restrict_nonzero(self): diff --git a/test/legacy_test/test_rmsprop_op.py b/test/legacy_test/test_rmsprop_op.py index 56f682bffabc50..e814eb112ded27 100644 --- a/test/legacy_test/test_rmsprop_op.py +++ b/test/legacy_test/test_rmsprop_op.py @@ -16,7 +16,7 @@ import numpy as np from op import Operator -from op_test import get_device_place, get_places +from op_test import get_device_place, get_devices, get_places import paddle from paddle import base @@ -416,7 +416,7 @@ def _test_rms_op_dygraph_place_amp(self, place, use_amp=False): paddle.enable_static() def test_main(self): - for place in get_places(string_format=True): + for place in get_devices(): use_amp_list = [True, False] for use_amp in use_amp_list: self._test_rms_op_dygraph_place_amp(place, use_amp) diff --git a/test/legacy_test/test_rrelu_op.py 
b/test/legacy_test/test_rrelu_op.py index 97be548fcdf48f..e00ed4daba380a 100644 --- a/test/legacy_test/test_rrelu_op.py +++ b/test/legacy_test/test_rrelu_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import OpTest, convert_float_to_uint16 +from op_test import OpTest, convert_float_to_uint16, get_device_place import paddle import paddle.nn.functional as F @@ -50,13 +50,7 @@ def setUp(self): self.upper_0 = 0.25 self.upper_1 = 0.33 - self.places = [ - ( - base.CUDAPlace(0) - if core.is_compiled_with_cuda() - else base.CPUPlace() - ) - ] + self.places = [get_device_place()] def check_static_result(self, place): with paddle.static.program_guard( diff --git a/test/legacy_test/test_set_value_op.py b/test/legacy_test/test_set_value_op.py index 2ff97d7ea7defc..c4ad490c8defb3 100644 --- a/test/legacy_test/test_set_value_op.py +++ b/test/legacy_test/test_set_value_op.py @@ -17,7 +17,7 @@ import unittest import numpy as np -from op_test import OpTest, convert_float_to_uint16, get_places +from op_test import OpTest, convert_float_to_uint16, get_devices import paddle from paddle.base import core @@ -1277,7 +1277,7 @@ def _call_setitem_static_api(self, x): return x def test_api(self): - for place in get_places(string_format=True): + for place in get_devices(): paddle.set_device(place) static_out = self._run_static() diff --git a/test/legacy_test/test_sign_op.py b/test/legacy_test/test_sign_op.py index f664f70a3b9917..be6ef62b1c0da0 100644 --- a/test/legacy_test/test_sign_op.py +++ b/test/legacy_test/test_sign_op.py @@ -194,10 +194,7 @@ def run(place): class TestSignComplexAPI(TestSignAPI): def setUp(self): - self.place = [] - self.place.append(base.CPUPlace()) - if core.is_compiled_with_cuda(): - self.place.append(base.CUDAPlace(0)) + self.place = get_places() def test_dygraph(self): with base.dygraph.guard(): diff --git a/test/legacy_test/test_soft_margin_loss.py b/test/legacy_test/test_soft_margin_loss.py index 2bb726b4bcf71c..2dc2d9f76ed600 100644 --- a/test/legacy_test/test_soft_margin_loss.py +++ b/test/legacy_test/test_soft_margin_loss.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import get_places +from op_test import get_devices, get_places import paddle @@ -127,7 +127,7 @@ class TestSoftMarginLoss(unittest.TestCase): def test_SoftMarginLoss(self): input_np = np.random.uniform(0.1, 0.8, size=(5, 5)).astype(np.float64) types = [np.int32, np.int64, np.float32, np.float64] - places = get_places(string_format=True) + places = get_devices() reductions = ['sum', 'mean', 'none'] for place in places: for reduction in reductions: diff --git a/test/legacy_test/test_sum_op.py b/test/legacy_test/test_sum_op.py index 6d8b4becc9b48e..0746cc46d022a9 100644 --- a/test/legacy_test/test_sum_op.py +++ b/test/legacy_test/test_sum_op.py @@ -889,9 +889,7 @@ class TestSum_BoolToInt64_ZeroSize(unittest.TestCase): def setUp(self): np.random.seed(123) self.shape = [3, 0, 2] - self.places = [base.CPUPlace()] - if core.is_compiled_with_cuda(): - self.places.append(base.CUDAPlace(0)) + self.places = get_places() def check_result( self, dygraph_result, expected_result, axis, keepdim, dtype, place diff --git a/test/legacy_test/test_tensor_type_autocast.py b/test/legacy_test/test_tensor_type_autocast.py index 865fc590bc159a..ee85c391cd415a 100644 --- a/test/legacy_test/test_tensor_type_autocast.py +++ b/test/legacy_test/test_tensor_type_autocast.py @@ -15,6 +15,7 @@ import unittest import numpy as np +from op_test import get_places import paddle @@ -22,9 +23,7 @@ class 
TestAutocastBase(unittest.TestCase):
     def setUp(self):
         self.set_api_and_dtypes()
-        self.places = [paddle.CPUPlace()]
-        if paddle.core.is_compiled_with_cuda():
-            self.places.append(paddle.CUDAPlace(0))
+        self.places = get_places()

     def set_api_and_dtypes(self):
         pass
diff --git a/test/legacy_test/test_tensor_type_convert_api.py b/test/legacy_test/test_tensor_type_convert_api.py
new file mode 100644
index 00000000000000..0021c1d448d93b
--- /dev/null
+++ b/test/legacy_test/test_tensor_type_convert_api.py
@@ -0,0 +1,245 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+import paddle
+
+
+class TensorDtypeConversionsTest(unittest.TestCase):
+    """
+    Unit tests for all supported tensor dtype conversion methods.
+    """
+
+    _supported_dtype_conversions = {
+        # float
+        'float16': 'float16',
+        'half': 'float16',
+        'bfloat16': 'bfloat16',
+        'float32': 'float32',
+        'float': 'float32',
+        'float64': 'float64',
+        'double': 'float64',
+        # int
+        'int8': 'int8',
+        'char': 'int8',
+        'uint8': 'uint8',
+        'byte': 'uint8',
+        'int16': 'int16',
+        'short': 'int16',
+        'int32': 'int32',
+        'int': 'int32',
+        'int64': 'int64',
+        'long': 'int64',
+        # other
+        'bool': 'bool',
+        'complex64': 'complex64',
+        'complex128': 'complex128',
+        'cfloat': 'complex64',
+        'cdouble': 'complex128',
+    }
+    _device = paddle.device.get_device()
+    _total_init_dtype = [
+        'float16',
+        'float32',
+        'float64',
+        'int8',
+        'uint8',
+        'int16',
+        'int32',
+        'int64',
+        'bool',
+        'complex64',
+        'complex128',
+    ]
+
+    def setUp(self):
+        self.shape = [10, 1000]
+
+    def _get_paddle_dtype(self, dtype_str):
+        """Get the Paddle dtype constant by string name."""
+        return getattr(paddle, dtype_str)
+
+    def test_bfloat16_conversion(self):
+        for init_dtype in self._total_init_dtype:
+            if self._device.startswith('xpu') and init_dtype == 'complex128':
+                continue
+            tensor = paddle.randn(self.shape).astype(init_dtype)
+            converted_tensor = tensor.bfloat16()
+            self.assertEqual(converted_tensor.dtype, paddle.bfloat16)
+            self.assertEqual(converted_tensor.shape, tensor.shape)
+
+        for (
+            method_name,
+            target_dtype,
+        ) in self._supported_dtype_conversions.items():
+            if self._device.startswith('xpu') and target_dtype == 'complex128':
+                continue
+            tensor = paddle.randn(self.shape).astype('bfloat16')
+            converted_tensor = getattr(tensor, method_name)()
+            self.assertEqual(
+                converted_tensor.dtype, self._get_paddle_dtype(target_dtype)
+            )
+            self.assertEqual(converted_tensor.shape, tensor.shape)
+
+    def test_all_dtype_conversions(self):
+        """Test all dtype conversion methods."""
+        for (
+            method_name,
+            target_dtype,
+        ) in self._supported_dtype_conversions.items():
+            if target_dtype == 'bfloat16':
+                continue
+            for init_dtype in self._total_init_dtype:
+                if self._device.startswith('xpu') and (
+                    target_dtype == 'complex128' or init_dtype == 'complex128'
+                ):
+                    continue  # skipTest would abort the whole test; skip only this pair
+
+                with self.subTest(
+                    method=method_name,
init_dtype=init_dtype,
+                    target_dtype=target_dtype,
+                ):
+                    self._test_single_dtype_conversion(
+                        method_name, init_dtype, target_dtype
+                    )
+
+    def _test_single_dtype_conversion(
+        self, method_name, init_dtype, target_dtype
+    ):
+        """Test a single dtype conversion method."""
+        if init_dtype.startswith('float'):
+            data_np = np.random.randn(*self.shape).astype(init_dtype)
+        elif init_dtype.startswith('complex'):
+            data_np_real = np.random.randn(*self.shape)
+            data_np_imag = np.random.randn(*self.shape)
+            data_np = data_np_real + data_np_imag * 1j
+            data_np = data_np.astype(init_dtype)
+        else:
+            data_np = np.random.randint(-100, 100, size=self.shape).astype(
+                init_dtype
+            )
+
+        tensor = paddle.to_tensor(data_np, dtype=init_dtype)
+
+        # Check if conversion method exists
+        self.assertTrue(
+            hasattr(tensor, method_name),
+            f"Tensor should have method '{method_name}'",
+        )
+        # Perform dtype conversion
+        converted_tensor = getattr(tensor, method_name)()
+
+        # Check the dtype after conversion
+        expected_dtype = self._get_paddle_dtype(target_dtype)
+        self.assertEqual(
+            converted_tensor.dtype,
+            expected_dtype,
+            f"Expected dtype {expected_dtype}, but got {converted_tensor.dtype} for method '{method_name}'",
+        )
+
+        # Check that the shape remains unchanged
+        self.assertEqual(
+            tensor.shape,
+            converted_tensor.shape,
+            f"Shape should remain unchanged after {method_name} conversion",
+        )
+
+        if target_dtype.endswith('float16'):
+            rtol = 1e-3
+            atol = 1e-3
+        else:
+            rtol = 1e-7
+            atol = 0
+
+        # Check the value after conversion
+        np.testing.assert_allclose(
+            converted_tensor.numpy(),
+            data_np.astype(target_dtype),
+            rtol=rtol,
+            atol=atol,
+            err_msg=f"Value mismatch after {method_name} conversion",
+        )
+
+    def test_method_chaining(self):
+        """Test method chaining for dtype conversions."""
+        tensor = paddle.to_tensor([1.5, 2.5, 3.5], dtype='float32')
+
+        # float32 -> int32 -> float64 -> int64
+        result = tensor.int32().float64().int64()
+        self.assertEqual(result.dtype, paddle.int64)
+
+    def test_pir_all_dtype_conversions(self):
+        """Test all dtype conversion methods for pir.Value in static graph."""
+        paddle.enable_static()
+        startup_prog = paddle.static.Program()
+        main_prog = paddle.static.Program()
+        with paddle.static.program_guard(main_prog, startup_prog):
+            for (
+                method_name,
+                target_dtype,
+            ) in self._supported_dtype_conversions.items():
+
+                if target_dtype == 'bfloat16':
+                    continue
+                for init_dtype in self._total_init_dtype:
+                    if (
+                        self._device.startswith('xpu')
+                        and target_dtype == 'complex128'
+                    ):
+                        # Same reasoning as above: `continue` keeps the
+                        # remaining combinations running on XPU.
+                        continue
+                    with self.subTest(
+                        pir_method=method_name,
+                        pir_init_dtype=init_dtype,
+                        pir_target_dtype=target_dtype,
+                    ):
+                        self._pir_single_dtype_conversion(
+                            method_name, init_dtype, target_dtype
+                        )
+
+    def _pir_single_dtype_conversion(
+        self, method_name, init_dtype, target_dtype
+    ):
+
+        # Create static graph input
+        x = paddle.static.data(name="x", shape=self.shape, dtype=init_dtype)
+        # Check if the method exists
+        self.assertTrue(
+            hasattr(x, method_name),
+            f"pir.Value should have method '{method_name}'",
+        )
+        # Perform dtype conversion
+        converted = getattr(x, method_name)()
+        # Check the dtype
+        expected_dtype = self._get_paddle_dtype(target_dtype)
+        self.assertEqual(
+            converted.dtype,
+            expected_dtype,
+            f"Expected pir.Value dtype {expected_dtype}, but got {converted.dtype} for method '{method_name}'",
+        )
+        # Check the shape
+        self.assertEqual(
+            tuple(x.shape),
+            tuple(converted.shape),
+            f"pir.Value shape should
remain unchanged after {method_name} conversion", + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/legacy_test/test_trace_op.py b/test/legacy_test/test_trace_op.py index a902b346432792..e5a9228219c7d1 100644 --- a/test/legacy_test/test_trace_op.py +++ b/test/legacy_test/test_trace_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import OpTest, convert_float_to_uint16 +from op_test import OpTest, convert_float_to_uint16, get_places import paddle from paddle import base, tensor @@ -202,9 +202,7 @@ def test_case1(self): class TestTraceAPIZerodimCase(unittest.TestCase): def setUp(self): - self.places = [paddle.CPUPlace()] - if paddle.is_compiled_with_cuda(): - self.places.append(paddle.CUDAPlace(0)) + self.places = get_places() self.x = np.random.random([5, 0, 0, 0]).astype('float32') def test_dygraph(self): diff --git a/test/legacy_test/test_transforms.py b/test/legacy_test/test_transforms.py index a797c4eb7fe6a3..310df4f116104a 100644 --- a/test/legacy_test/test_transforms.py +++ b/test/legacy_test/test_transforms.py @@ -19,7 +19,7 @@ import cv2 import numpy as np -from op_test import get_places +from op_test import get_devices from PIL import Image import paddle @@ -819,7 +819,7 @@ def test_color_jitter_sub_function(self): np_img_gray = (np.random.rand(28, 28, 1) * 255).astype('uint8') tensor_img_gray = F.to_tensor(np_img_gray) - places = get_places(string_format=True) + places = get_devices() def test_adjust_brightness(np_img, tensor_img): result_cv2 = np.array(F.adjust_brightness(np_img, 1.2)) @@ -956,7 +956,7 @@ def test_erase(self): np.testing.assert_equal(np.array(pil_result), expected) np_data = np.random.rand(3, 28, 28).astype('float32') - for place in get_places(string_format=True): + for place in get_devices(): paddle.set_device(place) tensor_img = paddle.to_tensor(np_data) expected_tensor = tensor_img.clone() diff --git a/test/legacy_test/test_uniform_random_inplace_op.py b/test/legacy_test/test_uniform_random_inplace_op.py index 7424b5d982d452..5e560acdc7e9e5 100644 --- a/test/legacy_test/test_uniform_random_inplace_op.py +++ b/test/legacy_test/test_uniform_random_inplace_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import OpTest, convert_uint16_to_float, get_places +from op_test import OpTest, convert_uint16_to_float, get_devices import paddle from paddle.base import core @@ -44,7 +44,7 @@ def test_fp64(): tensor_fp64.uniform_() self.assertEqual(tensor_fp64.dtype, paddle.float64) - for place in get_places(string_format=True): + for place in get_devices(): paddle.set_device(place) test_fp32() test_fp64() @@ -215,7 +215,7 @@ def test_attr_error(): class TestUniformRandomInplaceOpEmptyTensor(unittest.TestCase): def test_uniform_random_inplace_op_empty_tensor(self): test_shapes = [(200, 0), (0, 200)] - for place in get_places(string_format=True): + for place in get_devices(): paddle.set_device(place) for test_shape in test_shapes: tensor = paddle.empty(shape=test_shape) @@ -241,7 +241,7 @@ def test_grad(): uniform_grad = tensor_b.grad.numpy() self.assertTrue((uniform_grad == 0).all()) - for place in get_places(string_format=True): + for place in get_devices(): paddle.set_device(place) test_grad() diff --git a/test/legacy_test/test_uniform_random_op.py b/test/legacy_test/test_uniform_random_op.py index ce47ce69d32e56..43fe75fed5810d 100644 --- a/test/legacy_test/test_uniform_random_op.py +++ b/test/legacy_test/test_uniform_random_op.py @@ -16,7 +16,12 @@ import numpy as np from op import Operator -from op_test 
import OpTest, convert_uint16_to_float, get_places
+from op_test import (
+    OpTest,
+    convert_uint16_to_float,
+    get_places,
+    is_custom_device,
+)

 import paddle
 from paddle import base
@@ -187,7 +192,8 @@ def test_check_api(self):


 @unittest.skipIf(
-    not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
+    not (core.is_compiled_with_cuda() or is_custom_device()),
+    "core is not compiled with CUDA or custom device",
 )
 class TestUniformRandomFP16Op(TestUniformRandomOp):
     def init_dtype(self):
diff --git a/test/legacy_test/test_zero_dim_no_backward_api.py b/test/legacy_test/test_zero_dim_no_backward_api.py
index f73d72ad4bcace..55d37af35e823e 100644
--- a/test/legacy_test/test_zero_dim_no_backward_api.py
+++ b/test/legacy_test/test_zero_dim_no_backward_api.py
@@ -21,7 +21,7 @@

 import numpy as np
 from decorator_helper import prog_scope
-from op_test import get_places
+from op_test import get_devices

 import paddle
@@ -182,7 +182,7 @@ def test_one_hot_label(self):
         self.assertEqual(one_hot_label.numpy()[2], 1)

     def test_unique_consecutive(self):
-        for place in get_places(string_format=True):
+        for place in get_devices():
             paddle.set_device(place)
             x = paddle.rand([])
             y, inverse, counts = paddle.unique_consecutive(
@@ -199,7 +199,7 @@ def test_unique_consecutive(self):
             self.assertEqual(counts.shape, [1])

     def test_unique(self):
-        for place in get_places(string_format=True):
+        for place in get_devices():
             paddle.set_device(place)
             x = paddle.rand([])
             y, index, inverse, counts = paddle.unique(
diff --git a/test/legacy_test/test_zero_dim_sundry_dygraph_api.py b/test/legacy_test/test_zero_dim_sundry_dygraph_api.py
index c0e238bf3fb5f8..bc958ca42bf242 100644
--- a/test/legacy_test/test_zero_dim_sundry_dygraph_api.py
+++ b/test/legacy_test/test_zero_dim_sundry_dygraph_api.py
@@ -21,7 +21,7 @@
 import unittest

 import numpy as np
-from op_test import get_device_place, get_places
+from op_test import get_device_place, get_devices

 import paddle
 import paddle.nn.functional as F
@@ -1691,7 +1691,7 @@ def test_lerp(self):
         self.assertEqual(y2.grad.shape, [])

     def test_repeat_interleave(self):
-        for place in get_places(string_format=True):
+        for place in get_devices():
             paddle.set_device(place)

             x = paddle.randn(())
diff --git a/test/mkldnn/test_onnx_format_quantization_mobilenetv1.py b/test/mkldnn/test_onnx_format_quantization_mobilenetv1.py
index ff654a52c878a5..0761e4cf84e26d 100644
--- a/test/mkldnn/test_onnx_format_quantization_mobilenetv1.py
+++ b/test/mkldnn/test_onnx_format_quantization_mobilenetv1.py
@@ -208,12 +208,12 @@ def run_program(
     image_shape = [3, 224, 224]
     config = paddle.inference.Config(model_path)
     config.disable_gpu()
-    config.enable_mkldnn()
+    config.enable_onednn()
     config.switch_ir_optim()
     config.set_cpu_math_library_num_threads(1)
     config.disable_glog_info()
     if is_quantized_model:
-        config.enable_mkldnn_int8()
+        config.enable_onednn_int8()
     predictor = paddle.inference.create_predictor(config)

     input_names = predictor.get_input_names()
diff --git a/test/mkldnn/test_shape_mkldnn_op.py b/test/mkldnn/test_shape_mkldnn_op.py
index 4ae0e02b98f99e..1531980cda91da 100644
--- a/test/mkldnn/test_shape_mkldnn_op.py
+++ b/test/mkldnn/test_shape_mkldnn_op.py
@@ -26,7 +26,7 @@ def setUp(self):
         self.op_type = "shape"
         self.python_api = paddle.tensor.shape
         self.config()
-        self.attrs = {'use_mkldnn': True}
+        self.attrs = {'use_onednn': True}
         self.inputs = {'Input': np.zeros(self.shape).astype(self.dtype)}
         self.outputs = {'Out': np.array(self.shape)}
diff --git a/test/quantization/CMakeLists.txt
b/test/quantization/CMakeLists.txt
index 20e2c49c9ea4a4..20082befcba268 100644
--- a/test/quantization/CMakeLists.txt
+++ b/test/quantization/CMakeLists.txt
@@ -179,7 +179,7 @@ function(inference_quant2_int8_lstm_model_test target fp32_model quant_model
       ${dataset_path}
       --num_threads
       1
-      --mkldnn_cache_capacity
+      --onednn_cache_capacity
       100
       --warmup_iter
       100
diff --git a/test/quantization/README.md b/test/quantization/README.md
index 4ab0a8fa06aff1..eeb4b838fe7648 100644
--- a/test/quantization/README.md
+++ b/test/quantization/README.md
@@ -28,7 +28,7 @@ A procedure on how to transform an FP32 model into a Quant model supported by th

 ## 3. How to turn a Quant model into an INT8 MKL-DNN model?

-A Quant model can be transformed into an INT8 quantized model if it contains enough information about quantization scales for every quantized operator in the graph. The process of quantization is done by the `Quant2Int8MkldnnPass` pass which comprises several steps:
+A Quant model can be transformed into an INT8 quantized model if it contains enough information about quantization scales for every quantized operator in the graph. The process of quantization is done by the `Quant2Int8OnednnPass` pass, which comprises several steps:

 ### Gathering scales

@@ -88,12 +88,12 @@ Having gathered all the data needed for quantization we apply the `cpu_quantize_

 ## 4. Code example

-The code snipped shows how the `Quant2Int8MkldnnPass` can be applied to a model graph:
+The code snippet shows how the `Quant2Int8OnednnPass` can be applied to a model graph:

 ```python
 import paddle
 import paddle.static as static
-from paddle.static.quantization import Quant2Int8MkldnnPass
+from paddle.static.quantization import Quant2Int8OnednnPass
 from paddle.base.framework import IrGraph
 from paddle.framework import core

@@ -101,10 +101,10 @@ The code snipped shows how the `Quant2Int8MkldnnPass` can be applied to a model
 graph = IrGraph(core.Graph(static.Program().desc), for_test=False)
 place = paddle.CPUPlace()
 # Convert the IrGraph to MKL-DNN supported INT8 IrGraph using the
-# Quant2Int8MkldnnPass. It requires a list of operators to be quantized
-mkldnn_pass = Quant2Int8MkldnnPass({'conv2d', 'pool2d'}, static.global_scope(), place, core, False)
-# Apply Quant2Int8MkldnnPass to IrGraph
-mkldnn_pass.apply(graph)
+# Quant2Int8OnednnPass. It requires a list of operators to be quantized
+onednn_pass = Quant2Int8OnednnPass({'conv2d', 'pool2d'}, static.global_scope(), place, core, False)
+# Apply Quant2Int8OnednnPass to IrGraph
+onednn_pass.apply(graph)
 ```

@@ -273,7 +273,7 @@ OMP_NUM_THREADS=28 FLAGS_use_mkldnn=true python python/paddle/static/quantizatio

 To reproduce the performance results, the environment variable `OMP_NUM_THREADS=1` and `--batch_size=1` option should be set.

-1. Transform the Quant model into INT8 model by applying the `Quant2Int8MkldnnPass` pass and save the result. You can use the script `save_quant_model.py` for this purpose. It also accepts the option `--ops_to_quantize` with a list of operators to quantize.
+1. Transform the Quant model into an INT8 model by applying the `Quant2Int8OnednnPass` pass and save the result. You can use the script `save_quant_model.py` for this purpose. It also accepts the option `--ops_to_quantize` with a list of operators to quantize.
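Because this hunk renames the public pass class, a downstream script that still does `from paddle.static.quantization import Quant2Int8MkldnnPass` will fail at import time on builds that ship only the new name. A minimal compatibility shim, assuming no alias is kept (this diff does not add one):

```python
# Hedged migration sketch: prefer the oneDNN name introduced in this PR,
# fall back to the legacy MKL-DNN spelling on older Paddle releases.
try:
    from paddle.static.quantization import Quant2Int8OnednnPass
except ImportError:
    from paddle.static.quantization import (
        Quant2Int8MkldnnPass as Quant2Int8OnednnPass,
    )
```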
```bash cd /PATH/TO/PADDLE/build diff --git a/test/quantization/quant2_int8_image_classification_comparison.py b/test/quantization/quant2_int8_image_classification_comparison.py index 7f6666c7b6a90d..edda63d5d0f532 100644 --- a/test/quantization/quant2_int8_image_classification_comparison.py +++ b/test/quantization/quant2_int8_image_classification_comparison.py @@ -25,7 +25,7 @@ import paddle from paddle.base.framework import IrGraph from paddle.framework import core -from paddle.static.quantization import Quant2Int8MkldnnPass +from paddle.static.quantization import Quant2Int8OnednnPass paddle.enable_static() @@ -211,7 +211,7 @@ def _predict( graph = IrGraph(core.Graph(inference_program.desc), for_test=True) if self._debug: graph.draw('.', 'quant_orig', graph.all_op_nodes()) - quant_transform_pass = Quant2Int8MkldnnPass( + quant_transform_pass = Quant2Int8OnednnPass( self._quantized_ops, _op_ids_to_skip=self._op_ids_to_skip, _scope=inference_scope, diff --git a/test/quantization/quant2_int8_lstm_model.py b/test/quantization/quant2_int8_lstm_model.py index 8cfa3ab04666e9..f7d8553ce38cab 100644 --- a/test/quantization/quant2_int8_lstm_model.py +++ b/test/quantization/quant2_int8_lstm_model.py @@ -49,7 +49,7 @@ def parse_args(): '--num_threads', type=int, default=1, help='Number of threads.' ) parser.add_argument( - '--mkldnn_cache_capacity', + '--onednn_cache_capacity', type=int, default=0, help='Mkldnn cache capacity. The default value in Python API is 15, which can slow down int8 models. Default 0 means unlimited cache.', @@ -101,7 +101,7 @@ def set_config( self, model_path, num_threads, - mkldnn_cache_capacity, + onednn_cache_capacity, warmup_data=None, use_analysis=False, mode="fp32", @@ -112,16 +112,16 @@ def set_config( config.disable_gpu() config.switch_use_feed_fetch_ops(True) config.switch_ir_optim(True) - config.enable_mkldnn() - config.disable_mkldnn_fc_passes() # fc passes caused dnnl error + config.enable_onednn() + config.disable_onednn_fc_passes() # fc passes caused dnnl error config.pass_builder().insert_pass(5, "fc_lstm_fuse_pass") - config.set_mkldnn_cache_capacity(mkldnn_cache_capacity) + config.set_onednn_cache_capacity(onednn_cache_capacity) if mode == "ptq": config.enable_quantizer() config.quantizer_config().set_quant_data(warmup_data) config.quantizer_config().set_quant_batch_size(1) elif mode == "qat": - config.enable_mkldnn_int8() + config.enable_onednn_int8() return config @@ -130,7 +130,7 @@ def run_program( model_path, data_path, num_threads, - mkldnn_cache_capacity, + onednn_cache_capacity, warmup_iter, use_analysis=False, mode="fp32", @@ -141,7 +141,7 @@ def run_program( config = self.set_config( model_path, num_threads, - mkldnn_cache_capacity, + onednn_cache_capacity, warmup_data, use_analysis, mode, @@ -216,7 +216,7 @@ def test_lstm_model(self): infer_data ), 'The dataset path cannot be empty. Please, use the --infer_data option.' 
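# --- Standalone sketch, not part of this diff. The mkldnn_* -> onednn_*
# --- renames in this file all follow one pattern on paddle.inference.Config.
# --- A version-tolerant caller can probe for the new spelling first; the
# --- helper below is hypothetical, and only the method names are taken
# --- from the hunks above.
def _cfg_call(config, new_name, old_name, *args):
    # Resolve a Config method under its oneDNN name, falling back to the
    # legacy MKL-DNN spelling on older releases.
    method = getattr(config, new_name, None) or getattr(config, old_name)
    return method(*args)

# Usage mirroring set_config() above (model_path is a hypothetical input):
# config = paddle.inference.Config(model_path)
# config.disable_gpu()
# _cfg_call(config, "enable_onednn", "enable_mkldnn")
# _cfg_call(config, "set_onednn_cache_capacity", "set_mkldnn_cache_capacity", 100)
# _cfg_call(config, "enable_onednn_int8", "enable_mkldnn_int8")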
num_threads = test_case_args.num_threads - mkldnn_cache_capacity = test_case_args.mkldnn_cache_capacity + onednn_cache_capacity = test_case_args.onednn_cache_capacity warmup_iter = test_case_args.warmup_iter acc_diff_threshold = test_case_args.acc_diff_threshold @@ -224,7 +224,7 @@ def test_lstm_model(self): fp32_model, infer_data, num_threads, - mkldnn_cache_capacity, + onednn_cache_capacity, warmup_iter, False, mode="fp32", @@ -234,7 +234,7 @@ def test_lstm_model(self): fp32_model, infer_data, num_threads, - mkldnn_cache_capacity, + onednn_cache_capacity, warmup_iter, True, mode="ptq", @@ -244,7 +244,7 @@ def test_lstm_model(self): quant_model, infer_data, num_threads, - mkldnn_cache_capacity, + onednn_cache_capacity, warmup_iter, True, mode="qat", diff --git a/test/quantization/quant2_int8_nlp_comparison.py b/test/quantization/quant2_int8_nlp_comparison.py index bc2c0c4acbc66e..215441823f4a1c 100644 --- a/test/quantization/quant2_int8_nlp_comparison.py +++ b/test/quantization/quant2_int8_nlp_comparison.py @@ -158,9 +158,9 @@ def set_config( config.switch_specify_input_names(True) config.switch_ir_optim(True) config.switch_use_feed_fetch_ops(True) - config.enable_mkldnn() + config.enable_onednn() if target == 'int8': - config.enable_mkldnn_int8(self._quantized_ops) + config.enable_onednn_int8(self._quantized_ops) config.delete_pass( "constant_folding_pass" ) # same reason as in analyzer_ernie_int8_tester.cc diff --git a/test/quantization/quant_int8_image_classification_comparison.py b/test/quantization/quant_int8_image_classification_comparison.py index f0944eb34b3afe..4fc176c45c0d43 100644 --- a/test/quantization/quant_int8_image_classification_comparison.py +++ b/test/quantization/quant_int8_image_classification_comparison.py @@ -25,7 +25,7 @@ import paddle from paddle.base.framework import IrGraph from paddle.framework import core -from paddle.static.quantization import QuantInt8MkldnnPass +from paddle.static.quantization import QuantInt8OnednnPass paddle.enable_static() @@ -190,10 +190,10 @@ def _predict( if self._debug: graph.draw('.', 'quant_orig', graph.all_op_nodes()) if transform_to_int8: - mkldnn_int8_pass = QuantInt8MkldnnPass( + onednn_int8_pass = QuantInt8OnednnPass( _scope=inference_scope, _place=place ) - graph = mkldnn_int8_pass.apply(graph) + graph = onednn_int8_pass.apply(graph) else: graph = self._prepare_for_fp32_mkldnn(graph) diff --git a/test/xpu/test_xpu_stream_event.py b/test/xpu/test_xpu_stream_event.py index 9bf1d21c5ee57e..b739bc9f7ad390 100644 --- a/test/xpu/test_xpu_stream_event.py +++ b/test/xpu/test_xpu_stream_event.py @@ -12,12 +12,30 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import ctypes import unittest +import numpy as np + import paddle from paddle.device import xpu +class TestCurrentStream(unittest.TestCase): + def test_current_stream(self): + if paddle.is_compiled_with_xpu(): + s = xpu.current_stream() + self.assertTrue(isinstance(s, xpu.Stream)) + + s1 = xpu.current_stream(0) + self.assertTrue(isinstance(s1, xpu.Stream)) + + s2 = xpu.current_stream(paddle.XPUPlace(0)) + self.assertTrue(isinstance(s2, xpu.Stream)) + self.assertEqual(s1, s2) + self.assertRaises(ValueError, xpu.current_stream, "xpu:0") + + class TestSynchronize(unittest.TestCase): def test_synchronize(self): if paddle.is_compiled_with_xpu(): @@ -28,5 +46,120 @@ def test_synchronize(self): self.assertRaises(ValueError, xpu.synchronize, "xpu:0") +class TestXPUStream(unittest.TestCase): + def test_xpu_stream(self): + if paddle.is_compiled_with_xpu(): + s = paddle.device.xpu.Stream() + self.assertIsNotNone(s) + + def test_xpu_stream_synchronize(self): + if paddle.is_compiled_with_xpu(): + s = paddle.device.xpu.Stream() + e1 = paddle.device.xpu.Event() + e2 = paddle.device.xpu.Event() + + e1.record(s) + e1.query() + tensor1 = paddle.to_tensor(paddle.rand([1000, 1000])) + tensor2 = paddle.matmul(tensor1, tensor1) + s.synchronize() + e2.record(s) + e2.synchronize() + + self.assertTrue(e2.query()) + + def test_xpu_stream_wait_event_and_record_event(self): + if paddle.is_compiled_with_xpu(): + s1 = xpu.Stream(0) + tensor1 = paddle.to_tensor(paddle.rand([1000, 1000])) + tensor2 = paddle.matmul(tensor1, tensor1) + e1 = xpu.Event() + s1.record_event(e1) + + s2 = xpu.Stream(0) + s2.wait_event(e1) + s2.synchronize() + + self.assertTrue(e1.query()) + + +class TestXPUEvent(unittest.TestCase): + def test_xpu_event(self): + if paddle.is_compiled_with_xpu(): + e = paddle.device.xpu.Event() + self.assertIsNotNone(e) + s = paddle.device.xpu.current_stream() + + def test_xpu_event_methods(self): + if paddle.is_compiled_with_xpu(): + e = paddle.device.xpu.Event() + s = paddle.device.xpu.current_stream() + event_query_1 = e.query() + tensor1 = paddle.to_tensor(paddle.rand([1000, 1000])) + tensor2 = paddle.matmul(tensor1, tensor1) + s.record_event(e) + e.synchronize() + event_query_2 = e.query() + + self.assertTrue(event_query_1) + self.assertTrue(event_query_2) + + +class TestStreamGuard(unittest.TestCase): + ''' + Note: + The asynchronous execution property of XPU Stream can only be tested offline. 
+    '''
+
+    def test_stream_guard_normal(self):
+        if paddle.is_compiled_with_xpu():
+            s = paddle.device.Stream()
+            a = paddle.to_tensor(np.array([0, 2, 4], dtype="int32"))
+            b = paddle.to_tensor(np.array([1, 3, 5], dtype="int32"))
+            c = a + b
+            with paddle.device.stream_guard(s):
+                d = a + b
+            s.synchronize()
+
+            np.testing.assert_array_equal(np.array(c), np.array(d))
+
+    def test_stream_guard_default_stream(self):
+        if paddle.is_compiled_with_xpu():
+            s1 = paddle.device.current_stream()
+            with paddle.device.stream_guard(s1):
+                pass
+            s2 = paddle.device.current_stream()
+
+            self.assertTrue(id(s1.stream_base) == id(s2.stream_base))
+
+    def test_set_current_stream_default_stream(self):
+        if paddle.is_compiled_with_xpu():
+            cur_stream = paddle.device.current_stream()
+            new_stream = paddle.device.set_stream(cur_stream)
+
+            self.assertTrue(
+                id(cur_stream.stream_base) == id(new_stream.stream_base)
+            )
+
+    def test_stream_guard_raise_error(self):
+        if paddle.is_compiled_with_xpu():
+
+            def test_not_correct_stream_guard_input():
+                tmp = np.zeros(5)
+                with paddle.device.stream_guard(tmp):
+                    pass
+
+            self.assertRaises(TypeError, test_not_correct_stream_guard_input)
+
+
+class TestRawStream(unittest.TestCase):
+    def test_xpu_stream(self):
+        if paddle.is_compiled_with_xpu():
+            xpu_stream = paddle.device.xpu.current_stream().xpu_stream
+            self.assertIsInstance(xpu_stream, int)
+            # The raw handle should be wrappable as a C void pointer.
+            ptr = ctypes.c_void_p(xpu_stream)
+            self.assertIsNotNone(ptr)
+
+
 if __name__ == "__main__":
     unittest.main()
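A recurring refactor across the test hunks above replaces hand-rolled place lists with helpers imported from `op_test`. Their bodies never appear in this diff, so the following is a hypothetical reconstruction of the contract the call sites imply; only the four names and the place-object-versus-device-string split are taken from the diff, everything else is assumption:

```python
# Hypothetical reconstruction of the op_test helpers this PR standardizes on.
import paddle
from paddle.base import core


def is_custom_device():
    # Assumption: true when a custom-device plugin (e.g. an NPU backend)
    # is registered with this Paddle build.
    return bool(paddle.device.get_all_custom_device_type())


def get_places():
    # Place objects, for APIs such as Executor(place) or dygraph guards.
    places = [paddle.CPUPlace()]
    if core.is_compiled_with_cuda():
        places.append(paddle.CUDAPlace(0))
    return places


def get_devices():
    # Device-name strings, for paddle.set_device(); the hunks replace the
    # older get_places(string_format=True) spelling with this helper.
    devices = ['cpu']
    if core.is_compiled_with_cuda():
        devices.append('gpu')
    return devices


def get_device_place():
    # The single preferred place: the accelerator when available, else CPU.
    return paddle.CUDAPlace(0) if core.is_compiled_with_cuda() else paddle.CPUPlace()
```

The split matters because `paddle.set_device()` consumes device names while executors and guards consume place objects, which is why hunks that call `set_device` moved to `get_devices()` while the rest moved to `get_places()` or `get_device_place()`.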